In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import Ridge

### **Pre-processing data**

In [41]:
# Read the raw table from disk and display it as the cell output.
df = pd.read_csv(filepath_or_buffer='temp_data.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [42]:
# Columns that are removed from the frame before modelling.
cols = [
    'region',
]

In [43]:
# Remove the columns listed in `cols`.
# The keyword form is required: passing the axis positionally
# (`df.drop(cols, 1)`) was deprecated in pandas 1.3 and raises TypeError
# from pandas 2.0 onward.
df = df.drop(columns=cols)

In [44]:
# One-hot encode 'sex' and 'smoker'; all other columns are carried over as-is.
dum_df = pd.get_dummies(
    data=df,
    columns=['sex', 'smoker'],
)
dum_df

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,1,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0
2,28,33.000,3,4449.46200,0,1,1,0
3,33,22.705,0,21984.47061,0,1,1,0
4,32,28.880,0,3866.85520,0,1,1,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0
1334,18,31.920,0,2205.98080,1,0,1,0
1335,18,36.850,0,1629.83350,1,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0


In [45]:
# Keep a single indicator per two-valued category (sex_male, smoker_yes);
# the complementary columns are fully determined by the ones that remain.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
dum_df = dum_df.drop(columns=['smoker_no', 'sex_female'])

In [46]:
# Display the encoded frame in its current state as the cell output.
dum_df

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [47]:
# Target vector: the 'charges' column.
y = dum_df['charges']
# Feature matrix: every remaining column. `columns=` replaces the positional
# axis argument (`drop('charges', 1)`), which raises TypeError on pandas >= 2.0.
X = dum_df.drop(columns='charges')

In [48]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel
# restarts; without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [49]:
# Standardise 'bmi': learn the scaling from the training rows only, then apply
# that same fitted transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train[['bmi']])
X_train['bmi'] = scaler.transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

In [58]:
# Baseline model: ridge regression with a fixed regularisation strength.
ridge_regression = Ridge(alpha=0.01)
ridge_regression.fit(X_train, y_train)

# Predictions on both partitions.
y_pred_train = ridge_regression.predict(X_train)
y_pred_test = ridge_regression.predict(X_test)

# RMSE on the training rows, then on the test rows.
# mean_squared_error expects (y_true, y_pred) in that order; the value is the
# same either way, but the documented order keeps the call correct if options
# such as sample_weight are added later.
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))

6039.441453488698
6139.450534686704


In [52]:
parameters = {
    'alpha': [1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 5e-1]
}
ridge_regressor = RandomizedSearchCV(ridge_regression, parameters,
                                scoring = make_scorer(mean_squared_error), cv = 5)
ridge_regressor.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=Ridge(alpha=0.01),
                   param_distributions={'alpha': [0.001, 0.002, 0.005, 0.01,
                                                  0.02, 0.05, 0.1, 0.2, 0.5]},
                   scoring=make_scorer(mean_squared_error))

In [57]:
print(ridge_regressor.best_params_)
print(np.sqrt(ridge_regressor.best_score_))

{'alpha': 0.5}
6077.266143003868


In [61]:
# Predict the training rows first, then the test rows, with the fitted search object.
y_ridge_train, y_ridge_test = (
    ridge_regressor.predict(features) for features in (X_train, X_test)
)

In [62]:
# RMSE of the tuned model on the training rows, then on the test rows.
# Arguments follow the documented (y_true, y_pred) order of mean_squared_error.
print(np.sqrt(mean_squared_error(y_train, y_ridge_train)))
print(np.sqrt(mean_squared_error(y_test, y_ridge_test)))

6039.506982722914
6135.265950885455


In [67]:
# The search object itself has no `coef_`; the fitted coefficients live on the
# refit winning model, exposed as `best_estimator_`. Accessing `coef_` directly
# on the search raises AttributeError on a fresh kernel.
print(ridge_regressor.best_estimator_.coef_)

[  257.72205118  1930.1441516    482.56159104  -207.69676472
 24225.84082729]
