In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [2]:
data = pd.read_csv('cleaneddata.csv')
X = data[['smoker','age', 'is_east', 'sex','bmi', 'is_north','children']]
y = data['charges']

In [3]:
X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
X['age'] = (X['age'] - X['age'].mean())/X['age'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bmi'] = (X['bmi'] - X['bmi'].mean())/X['bmi'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = (X['age'] - X['age'].mean())/X['age'].std()


In [4]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.n_features_in_)
print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

36
Coefficients: 
 [ 5.74564235e-13  1.22455462e+04  3.92271135e+03  2.75641378e+02
  2.29066869e+01 -4.57571981e+01  1.35495964e+02  1.03786143e+03
  1.22455462e+04 -2.15010003e+02 -3.76240416e+02 -1.97932075e+02
  8.97552180e+03 -5.71591434e+02 -2.92935783e+02  8.91616403e+02
 -8.99337138e+01  1.00590067e+02  1.25124195e+02 -6.16968439e+02
 -8.23963022e+01  2.75641378e+02 -4.20026474e+02 -3.67166927e+02
  2.49935338e+02 -1.26992342e+01  2.29066869e+01  5.79589389e+01
 -3.29648124e+02 -2.33851386e+02 -2.63855380e+02  5.52151163e+02
  1.08880042e+02  1.35495964e+02  4.43734874e+02 -9.32130833e+01]
6590.457751893835
root mean squared error: 4577.96
Coefficient of determination: 0.89


In [5]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Ridge(alpha=0.0001)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.n_features_in_)
print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

36
Coefficients: 
 [    0.         12245.52407015  3922.7080789    275.63945844
    22.90497322   -45.75625091   135.49463155  1037.85857262
 12245.5240751   -215.01142017  -376.21903306  -197.91271314
  8975.51499317  -571.57848561  -292.92949555   891.61665459
   -89.93159156   100.59059552   125.12388716  -616.96687602
   -82.39572123   275.639458    -420.02670049  -367.16631067
   249.93506431   -12.69886131    22.90497292    57.95997065
  -329.64819221  -233.85070386  -263.85590464   552.14963245
   108.88012488   135.49463303   443.73509558   -93.21275033]
6590.465860110088
root mean squared error: 4577.96
Coefficient of determination: 0.89


In [6]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

model = linear_model.Ridge()

# test this range of alpha
alpha_range = np.arange(0.001, 2, 0.01)

param = {'alpha':alpha_range}

# score base on r2
grid_search_r2 = GridSearchCV(model,param_grid=param,cv=5)
grid_search_r2.fit(X_train, y_train)

# score base on MAE
grid_search_MAE = GridSearchCV(model,param_grid=param,scoring ='neg_mean_absolute_error',cv=5)
grid_search_MAE.fit(X_train, y_train)

print('Scoring R2')
print('Best R2 score   : ', grid_search_r2.best_score_)
print('Best parameters : ', grid_search_r2.best_params_)


print('Scoring RMSE')
print('Best RMSE score  : ', np.sqrt(-grid_search_MAE.best_score_))
print('Best parameters : ', grid_search_MAE.best_params_)

Scoring R2
Best R2 score   :  0.82193889958598
Best parameters :  {'alpha': np.float64(1.0309999999999997)}
Scoring RMSE
Best RMSE score  :  54.90542118665193
Best parameters :  {'alpha': np.float64(0.001)}


In [7]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Lasso(alpha=0.01, tol=0.1)


# test this range of alpha
alpha_range = np.arange(0.001, 2, 0.01)

param = {'alpha':alpha_range}

# score base on r2
grid_search_r2 = GridSearchCV(model,param_grid=param,cv=5)
grid_search_r2.fit(X_train, y_train)

# score base on MAE
grid_search_MAE = GridSearchCV(model,param_grid=param,scoring ='neg_mean_absolute_error',cv=5)
grid_search_MAE.fit(X_train, y_train)

print('Scoring R2')
print('Best R2 score   : ', grid_search_r2.best_score_)
print('Best parameters : ', grid_search_r2.best_params_)


print('Scoring RMSE')
print('Best RMSE score  : ', np.sqrt(-grid_search_MAE.best_score_))
print('Best parameters : ', grid_search_MAE.best_params_)

Scoring R2
Best R2 score   :  0.82193889958598
Best parameters :  {'alpha': np.float64(1.0309999999999997)}
Scoring RMSE
Best RMSE score  :  54.90542118665193
Best parameters :  {'alpha': np.float64(0.001)}


In [8]:
poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.Lasso(alpha=1.67, tol=0.1)

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [ 0.00000000e+00  2.29013262e+04  3.85468421e+03  6.34468967e+02
  1.97814211e+02  1.64855281e+03  2.37422943e+02  5.56914711e+02
  3.54887210e+02  8.62757415e+00  2.49341890e+02  6.53466470e+02
  8.66375447e+03 -2.29884426e+02 -1.69184423e+02  8.14710448e+02
 -1.30002858e+02 -1.66573790e+02  1.86762519e+02 -4.66762139e+02
 -5.02266389e+01 -6.64806101e+02 -6.85855847e+02 -1.41564860e+03
  4.72505080e+02  1.99192259e+02 -5.38611932e+02 -5.06976791e+02
 -1.52360347e+02 -3.79241506e+01 -2.39085431e+02 -4.55367517e+01
 -9.26493664e+01 -0.00000000e+00  3.02312281e+02 -2.63737863e+01]
7331.179361069648
root mean squared error: 4706.97
Coefficient of determination: 0.88


In [9]:
poly = PolynomialFeatures(degree=3)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [-2.61743119e-12  8.80129931e+03  4.45031646e+03  3.75946165e+02
  6.13348855e+01 -1.82787341e+01 -2.62003323e+02  1.58313024e+02
  8.80129931e+03 -5.14245311e+02 -1.13565508e+03 -5.19277887e+02
  6.12906286e+03  1.52000738e+02 -3.19035457e+01  8.71849339e+02
  3.84418314e+02 -4.54113842e+02 -1.88242364e+02 -1.26035918e+02
 -3.63317928e+02  3.75946165e+02 -3.51643406e+02 -2.62275115e+02
  1.24461435e+02 -1.71096350e+02  6.13348855e+01  4.26278274e+02
  2.51453606e+02 -3.79856602e+01  4.94718288e+01  3.99386371e+02
  2.48399182e+02 -2.62003323e+02  9.41129900e+02 -3.16580567e+01
  8.80129931e+03 -5.14245311e+02 -1.13565508e+03 -5.19277887e+02
  6.12906286e+03  1.52000738e+02 -3.19035457e+01 -6.15471603e+02
 -8.61326866e+01  1.36530628e+03  4.29882993e+02  3.23780026e+02
 -1.41735746e+02 -1.13565508e+03  4.82768674e+03 -1.91788547e+03
  2.70534659e+03 -1.02471924e+03 -5.19277887e+02 -1.56087846e+03
 -2.65291248e+03 -4.71943886e+02 -5.37784909e+02  1.65330835e+02
 -4.78245

In [10]:
poly = PolynomialFeatures(degree=4)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(poly_variables, y, test_size = 0.2, random_state = 42)

regression = linear_model.LinearRegression()

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Coefficients: \n", model.coef_)
print(model.intercept_)
print("root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [-3.16296378e-11  5.90593002e+03  1.97422965e+03  1.81249136e+02
  5.37960284e+02 -6.72536421e+02 -3.05556518e+02 -8.76697411e+01
  5.90593002e+03 -5.21933925e+02 -3.48932996e+02 -3.17609582e+02
  5.82498331e+03  1.01531106e+03  1.49667728e+03  1.46705446e+03
  8.12166419e+02  2.42949215e+02 -7.32036495e+02  7.13538732e+02
  7.68730570e+02  1.81249136e+02 -5.68183709e+02 -2.22407889e+02
  2.16442526e+02 -1.49759369e+02  5.37960284e+02  2.62687734e+02
 -1.63732478e+02 -8.99952356e+02  6.49497529e+02  6.19435338e+02
  1.60088169e+03 -3.05556518e+02  5.06164377e+02  7.66563077e+02
  5.90593002e+03 -5.21933925e+02 -3.48932996e+02 -3.17609582e+02
  5.82498331e+03  1.01531106e+03  1.49667728e+03 -2.91553056e+02
  4.90017652e+02  1.72339195e+02  9.42871331e+02  7.11504295e+02
  2.46670278e+02 -3.48932996e+02  1.76030686e+03 -2.02270770e+03
 -2.90793051e+02 -1.24001052e+03 -3.17609582e+02 -1.27124657e+03
 -1.68651758e+03  1.16825037e+02  1.10439470e+03  7.60523559e+02
 -6.10755