In [55]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, max_error
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, RobustScaler

In [56]:
data = pd.read_csv('cleaneddata.csv')
data['underweight'] = [1 if x < 18 else 0 for x in data['bmi']]
data['overweight'] = [1 if x > 31 and x <= 38 else 0 for x in data['bmi']]
data['severlyoverweight'] = [1 if x > 38 else 0 for x in data['bmi']]
data['nokid'] = [1 if x ==0 else 0 for x in data['children']]
data['onekid'] = [1 if x ==1 else 0 for x in data['children']]
data['twokid'] = [1 if x ==2 else 0 for x in data['children']]
data['threekid'] = [1 if x ==3 else 0 for x in data['children']]
data['fourkid'] = [1 if x ==4 else 0 for x in data['children']]
data['fivekid'] = [1 if x ==5 else 0 for x in data['children']]
data['kids'] = [1 if x in [1,2,3,4] else 0 for x in data['children']]
data['elderfemale'] = [1 if x > 60 else 0 for x in data['age']]

In [57]:
X = data[['smoker','age','is_east','bmi', 'is_north','underweight', 'overweight','severlyoverweight', 'nokid','kids', 'elderfemale']]
y = data['charges']

In [58]:
# X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
# X['age'] = (X['age'] - X['age'].mean())/X['age'].std()
# Q1 = X['bmi'].quantile(0.25)
# Q3 = X['bmi'].quantile(0.75)
# IQR = Q3 - Q1
# Q12 = X['age'].quantile(0.25)
# Q32 = X['age'].quantile(0.75)
# IQR2 = Q32 - Q12
# X['bmi'] = (X['bmi'] - X['bmi'].median())/IQR
# X['age'] = (X['age'] - X['age'].median())/IQR2
X['bmi'] = (X['bmi'] - X['bmi'].median())/X['bmi'].std()
X['age'] = (X['age'] - X['age'].median())/X['age'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bmi'] = (X['bmi'] - X['bmi'].median())/X['bmi'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = (X['age'] - X['age'].median())/X['age'].std()


In [59]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))
print(max_error(y_test,y_pred))
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

4614.374609619625
Coefficient of determination: 0.84
18204.815658396863
root mean squared error: 4074.32
Coefficient of determination: 0.91


In [60]:
poly = SplineTransformer(n_knots=4,degree=4)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

5938.189170785875
Coefficient of determination: 0.74
root mean squared error: 6178.33
Coefficient of determination: 0.79


In [61]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Lasso(alpha=0.8, tol=0.08)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

4686.113077706826
Coefficient of determination: 0.84
root mean squared error: 4177.84
Coefficient of determination: 0.91


In [62]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Ridge(alpha=0.1)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

4615.306346757694
Coefficient of determination: 0.84
root mean squared error: 4076.73
Coefficient of determination: 0.91


In [63]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.ElasticNet(alpha=0.031, l1_ratio=0.92, tol=0.08)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

4637.657562453623
Coefficient of determination: 0.84
root mean squared error: 4161.16
Coefficient of determination: 0.91


In [64]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.BayesianRidge(alpha_1=0.07, alpha_2= 0.07, lambda_1= 0.01, lambda_2=0.01)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_predtrain = model.predict(X_train)

print(np.sqrt(mean_squared_error(y_train, y_predtrain)))
print("Coefficient of determination: %.2f" % r2_score(y_train, y_predtrain))
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

4639.70178856866
Coefficient of determination: 0.84
root mean squared error: 4169.56
Coefficient of determination: 0.91


In [65]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.RANSACRegressor()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4427.72
Coefficient of determination: 0.89


In [66]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.SGDRegressor()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

root mean squared error: 4250.36
Coefficient of determination: 0.90
