In [81]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [82]:
data = pd.read_csv('cleaneddata.csv')
X = data[['smoker','age', 'is_east', 'sex','bmi', 'is_north','children']]
y = data['charges']

In [83]:
X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
X['age'] = (X['age'] - X['age'].mean())/X['age'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = (X['age'] - X['age'].mean())/X['age'].std()


In [84]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.n_features_in_)
print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

36
Coefficients: 
 [-1.95450493e-13  1.23816843e+04  4.16874884e+03  2.18748315e+02
 -1.37361473e+02 -3.77542047e+02  1.56185978e+02  9.44664872e+02
  1.23816843e+04 -1.22562214e+02 -1.20902591e+03 -2.34761769e+02
  8.93290334e+03  1.74889513e+02 -1.84223111e+02  6.08412100e+02
 -2.14427136e+02  6.47511888e+01  1.27053498e+02 -5.95007631e+02
  6.55790539e+01  2.18748315e+02 -1.31485292e+02  6.97867169e+01
  4.00982904e+02  3.23779214e+01 -1.37361473e+02  1.79392234e+01
  4.09411497e+01 -2.20649438e+02 -2.09974149e+02  8.06390462e+02
 -1.36606377e+01  1.56185978e+02  4.06108230e+02 -1.08975851e+02]
6978.25369877465
root mean squared error: 4515.53
Coefficient of determination: 0.86


In [85]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.Ridge(alpha=0.0001)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.n_features_in_)
print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

36
Coefficients: 
 [    0.         12381.65863129  4168.74465563   218.745233
  -137.36293061  -377.53823766   156.18445788   944.66098317
 12381.65863244  -122.56275145 -1208.99989536  -234.7396172
  8932.89716067   174.90588151  -184.21565444   608.41186735
  -214.42400703    64.75230182   127.05325349  -595.00569694
    65.57945913   218.74523237  -131.48570048    69.78548118
   400.98253645    32.3789541   -137.3629306     17.93861568
    40.93960336  -220.64924383  -209.97432216   806.38817791
   -13.66081745   156.18445711   406.10886966  -108.9753936 ]
6978.264259985096
root mean squared error: 4515.53
Coefficient of determination: 0.86


In [86]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

model = linear_model.Ridge()

# test this range of alpha
alpha_range = np.arange(0.001, 2, 0.01)

param = {'alpha':alpha_range}

# score base on r2
grid_search_r2 = GridSearchCV(model,param_grid=param,cv=5)
grid_search_r2.fit(X_train, y_train)

# score base on MAE
grid_search_MAE = GridSearchCV(model,param_grid=param,scoring ='neg_mean_absolute_error',cv=5)
grid_search_MAE.fit(X_train, y_train)

print('Scoring R2')
print('Best R2 score   : ', grid_search_r2.best_score_)
print('Best parameters : ', grid_search_r2.best_params_)


print('Scoring RMSE')
print('Best RMSE score  : ', np.sqrt(-grid_search_MAE.best_score_))
print('Best parameters : ', grid_search_MAE.best_params_)

Scoring R2
Best R2 score   :  0.8244988210534684
Best parameters :  {'alpha': np.float64(0.6709999999999999)}
Scoring RMSE
Best RMSE score  :  55.94426658729582
Best parameters :  {'alpha': np.float64(0.001)}


In [87]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.Lasso(alpha=0.01, tol=0.1)


# test this range of alpha
alpha_range = np.arange(0.001, 2, 0.01)

param = {'alpha':alpha_range}

# score base on r2
grid_search_r2 = GridSearchCV(model,param_grid=param,cv=5)
grid_search_r2.fit(X_train, y_train)

# score base on MAE
grid_search_MAE = GridSearchCV(model,param_grid=param,scoring ='neg_mean_absolute_error',cv=5)
grid_search_MAE.fit(X_train, y_train)

print('Scoring R2')
print('Best R2 score   : ', grid_search_r2.best_score_)
print('Best parameters : ', grid_search_r2.best_params_)


print('Scoring RMSE')
print('Best RMSE score  : ', np.sqrt(-grid_search_MAE.best_score_))
print('Best parameters : ', grid_search_MAE.best_params_)

Scoring R2
Best R2 score   :  0.8244988210534684
Best parameters :  {'alpha': np.float64(0.6709999999999999)}
Scoring RMSE
Best RMSE score  :  55.94426658729582
Best parameters :  {'alpha': np.float64(0.001)}


In [88]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.Lasso(alpha=1.67, tol=0.1)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [ 0.00000000e+00  2.34220831e+04  4.20422777e+03  4.99243965e+02
  6.39859350e+01  1.70363632e+03  6.60944022e+02  5.33423563e+02
  4.23875954e+02  3.22175544e+02 -2.99759901e+02  7.60118732e+02
  8.40471892e+03  2.30481421e+02 -1.90982572e+02  5.26962481e+02
 -2.22453877e+02 -2.37713857e+02  1.00987088e+02 -4.52012241e+02
  2.88610148e+01 -5.68040389e+02 -4.68642876e+02 -1.24432021e+03
  4.76848143e+02  1.73656350e+02 -6.17083947e+02 -7.43720033e+02
  1.23146115e+01 -6.67695767e+01 -2.06137879e+02 -3.89254129e+01
 -1.98754819e+02 -1.65831884e+02  2.78157491e+02 -3.47133874e+01]
7584.023308916276
root mean squared error: 4585.91
Coefficient of determination: 0.85


In [89]:
poly = PolynomialFeatures(degree=3)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [-1.37858801e-13  8.36820644e+03  4.18853997e+03  1.28488131e+02
 -1.46168411e+02  9.33955401e+01 -1.63867898e+02  3.65064814e+02
  8.36820644e+03 -1.95684298e+02 -4.68564984e+02  9.34232439e+00
  5.96296562e+03  4.98246523e+02 -5.59426786e+01  2.34324264e+02
  3.94953694e+02 -6.17232104e-01 -1.50146010e+02 -3.85947302e+01
 -2.96729380e+02  1.28488131e+02 -7.28013637e+00 -1.44850138e+02
  1.33592259e+02 -2.35803876e+02 -1.46168411e+02  1.33484029e+02
  1.92000687e+02 -2.62013199e+02  2.02062219e+02  4.16075567e+02
 -3.94004054e+01 -1.63867898e+02  1.11808861e+03 -1.68380662e+02
  8.36820644e+03 -1.95684298e+02 -4.68564984e+02  9.34232439e+00
  5.96296562e+03  4.98246523e+02 -5.59426786e+01 -2.38373382e+02
 -9.64974689e+02  1.18389261e+03  5.23909925e+02 -3.18843620e+02
  6.78137636e+01 -4.68564984e+02  2.30017992e+03 -1.33212718e+03
  5.87215277e+01 -7.69567272e+02  9.34232439e+00 -1.76651329e+02
 -3.00906390e+03 -2.62718109e+02 -9.88390320e+02 -1.19903161e+03
 -6.74034

In [90]:
poly = PolynomialFeatures(degree=4)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.3, random_state = 4)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [ 1.86621354e-11  6.00404985e+03  6.53269470e+02  3.03281121e+02
  7.94955973e+02 -5.52881591e+02 -8.17925602e+01  5.26569424e+02
  6.00404985e+03  1.09702073e+03 -5.88830803e+02  6.80478144e+01
  5.64142747e+03  6.15039698e+02 -8.87337447e+02  1.84198479e+03
  7.49529526e+02  9.94194986e+02 -1.10707834e+03  3.91456181e+02
  2.13638901e+03  3.03281121e+02 -5.21249367e+02 -7.35293711e+02
  1.44271761e+02 -8.84630461e+02  7.94955973e+02  1.06836919e+02
 -3.76634922e+02 -2.76259225e+03  2.62173730e+02  6.06877840e+02
  1.38449851e+03 -8.17925602e+01  9.88770466e+02  1.83145470e+03
  6.00404985e+03  1.09702073e+03 -5.88830803e+02  6.80478144e+01
  5.64142747e+03  6.15039698e+02 -8.87337447e+02  4.80783747e+02
 -6.36040494e+02  3.68054587e+02 -7.31376789e+02 -1.18190024e+03
 -1.13406482e+03 -5.88830803e+02  9.42685008e+02 -2.46701787e+02
  3.45460353e+02  1.12912527e+02  6.80478144e+01 -1.08637895e+03
 -1.87684956e+03  6.45742680e+02  1.00516259e+03 -1.11771794e+03
 -1.46183