In [738]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

In [707]:
data = pd.read_csv('cleaneddata.csv')
data['underweight'] = [1 if x < 18 else 0 for x in data['bmi']]
data['overweight'] = [1 if x > 31 and x <= 38 else 0 for x in data['bmi']]
data['severlyoverweight'] = [1 if x > 38 else 0 for x in data['bmi']]
data['nokid'] = [1 if x ==0 else 0 for x in data['children']]
data['onekid'] = [1 if x ==1 else 0 for x in data['children']]
data['twokid'] = [1 if x ==2 else 0 for x in data['children']]
data['threekid'] = [1 if x ==3 else 0 for x in data['children']]
data['fourkid'] = [1 if x ==4 else 0 for x in data['children']]
data['fivekid'] = [1 if x ==5 else 0 for x in data['children']]
data['kids'] = [1 if x in [1,2,3,4] else 0 for x in data['children']]
data['elderfemale'] = [1 if x > 60 else 0 for x in data['age']]

In [708]:
X = data[['smoker','age','is_east','bmi', 'is_north','underweight', 'overweight','severlyoverweight', 'nokid','kids', 'elderfemale']]
y = data['charges']

In [709]:
X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
X['age'] = (X['age'] - X['age'].mean())/X['age'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = (X['age'] - X['age'].mean())/X['age'].std()


In [710]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)


print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4074.32
Coefficient of determination: 0.91


In [744]:
poly = SplineTransformer(n_knots=4,degree=4)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)


print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 6178.33
Coefficient of determination: 0.79


In [736]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Lasso(alpha=0.8, tol=0.08)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4172.48
Coefficient of determination: 0.91


In [732]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Ridge(alpha=0.1)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)


print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4076.84
Coefficient of determination: 0.91


In [713]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.ElasticNet(alpha=0.031, l1_ratio=0.92, tol=0.08)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4161.58
Coefficient of determination: 0.91


In [718]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.BayesianRidge(alpha_1=0.07, alpha_2= 0.07, lambda_1= 0.01, lambda_2=0.01)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4169.25
Coefficient of determination: 0.91


In [721]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.RANSACRegressor()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4288.23
Coefficient of determination: 0.90


In [733]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.SGDRegressor()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4262.80
Coefficient of determination: 0.90
