In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import Ridge

### **Pre-processing data**

In [2]:
df = pd.read_csv('temp_data.csv')

In [3]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
cols = ['sex', 'region']
cols

['sex', 'region']

In [8]:
df = df.drop(cols, 1)

In [9]:
dum_df = pd.get_dummies(df, columns = ['smoker'])
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,1,0
1335,18,36.850,0,1629.83350,1,0
1336,21,25.800,0,2007.94500,1,0


In [10]:
dum_df = dum_df.drop('smoker_no', 1)

In [11]:
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_yes
0,19,27.900,0,16884.92400,1
1,18,33.770,1,1725.55230,0
2,28,33.000,3,4449.46200,0
3,33,22.705,0,21984.47061,0
4,32,28.880,0,3866.85520,0
...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0
1334,18,31.920,0,2205.98080,0
1335,18,36.850,0,1629.83350,0
1336,21,25.800,0,2007.94500,0


In [12]:
y = np.log1p(dum_df['charges'])
X = dum_df.drop('charges', 1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
scaler = StandardScaler()
X_train['bmi'] = scaler.fit_transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

In [15]:
X_train['age'] = X_train['age'].apply(lambda x: x // 10)
X_test['age'] = X_test['age'].apply(lambda x: x // 10)

In [16]:
ridge_regression = Ridge(alpha = 0.01)
ridge_regression.fit(X_train, y_train)
print(mean_squared_error(X_train, y_train))
print(mean_squared_error(X_test, y_test))

0.7417779725822176
0.7867728746198581


In [37]:
parameters = {
    'alpha': [1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 5e-1]
}
ridge_regressor = GridSearchCV(ridge_regression, parameters,
                                scoring = make_scorer(mean_squared_error), cv = 5)
ridge_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(alpha=0.01),
             param_grid={'alpha': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
                                   0.2, 0.5]},
             scoring=make_scorer(mean_squared_error))

In [38]:
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

{'alpha': 0.001}
0.22007406251556652


In [39]:
y_ridge = ridge_regressor.predict(X_test)
y_ridge

array([ 8.26800594,  8.97989447,  9.22048146,  8.30229336,  9.18093068,
       10.40505266,  9.27078736,  9.2428415 ,  7.99931522,  8.7000247 ,
        9.28218067,  8.48437135, 10.31524372,  8.96559573,  7.80858476,
        9.0160595 ,  7.89537814,  9.50992332,  8.57550176,  9.6657497 ,
        8.8246431 ,  8.62193488,  9.52932419,  9.2903461 ,  8.53176511,
        9.04293051,  8.22012515, 11.16465475,  8.2140523 , 10.29945293,
        9.09429853,  9.00235528, 10.95550348, 10.15704326,  9.70290423,
       10.7080879 ,  7.98749197,  9.74539539,  7.90118227,  8.71641001,
        8.50607042,  9.19899401,  9.32817713,  9.1157416 ,  8.29568309,
        9.21167112,  7.96986123,  8.95635811,  8.83387737, 11.19152576,
        9.6105284 ,  8.59765284,  9.12084709, 10.4336878 ,  8.23028239,
        8.52745974,  9.06872669,  9.29561617,  9.39147588, 10.74107614,
       10.13371586,  9.13105808,  8.36721037, 10.72576302,  8.82366974,
       10.16408011,  9.63127282,  8.31815061,  8.70211799,  9.40

In [40]:
mean_squared_error(y_ridge, y_test)

0.17961198382783342