In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

### **Pre-processing data**

In [2]:
df = pd.read_csv('temp_data.csv')

In [3]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
cols = ['sex', 'region']
cols

['sex', 'region']

In [8]:
df = df.drop(cols, 1)

In [9]:
dum_df = pd.get_dummies(df, columns = ['smoker'])
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,1,0
1335,18,36.850,0,1629.83350,1,0
1336,21,25.800,0,2007.94500,1,0


In [10]:
dum_df = dum_df.drop('smoker_no', 1)

In [11]:
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_yes
0,19,27.900,0,16884.92400,1
1,18,33.770,1,1725.55230,0
2,28,33.000,3,4449.46200,0
3,33,22.705,0,21984.47061,0
4,32,28.880,0,3866.85520,0
...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0
1334,18,31.920,0,2205.98080,0
1335,18,36.850,0,1629.83350,0
1336,21,25.800,0,2007.94500,0


In [12]:
y = np.log1p(dum_df['charges'])
X = dum_df.drop('charges', 1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
scaler = StandardScaler()
X_train['bmi'] = scaler.fit_transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

In [15]:
X_train['age'] = X_train['age'].apply(lambda x: x // 10)
X_test['age'] = X_test['age'].apply(lambda x: x // 10)

In [16]:
ridge_regression = Ridge(alpha = 0.01)
ridge_regression.fit(X_train, y_train)
print(ridge_regression.score(X_train, y_train))
print(ridge_regression.score(X_test, y_test))

0.7417779725822176
0.7867728746198581


In [28]:
parameters = {
    'alpha': [1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 5e-1]
}
ridge_regressor = GridSearchCV(ridge_regression, parameters, cv = 5)
ridge_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(alpha=0.01),
             param_grid={'alpha': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
                                   0.2, 0.5]})

In [29]:
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

{'alpha': 0.5}
0.7381592831932811


In [30]:
y_ridge = ridge_regressor.predict(X_test)
y_ridge

array([ 8.26906117,  8.98073506,  9.22128743,  8.30332698,  9.1817482 ,
       10.40145475,  9.27154826,  9.24362001,  8.00040942,  8.70086118,
        9.2829344 ,  8.48529026, 10.31177926,  8.9664587 ,  7.80979912,
        9.01689067,  7.89653782,  9.5106505 ,  8.5764802 ,  9.66246054,
        8.8254913 ,  8.62288406,  9.53003915,  9.29110805,  8.53268088,
        9.04374476,  8.22119718, 11.16090237,  8.21512815, 10.29590819,
        9.09517063,  9.00319509, 10.9517793 , 10.15356152,  9.70353656,
       10.70441601,  7.98859362,  9.74205605,  7.9023383 ,  8.71731303,
        8.50707923,  9.19972329,  9.32892861,  9.11660019,  8.29672087,
        9.21246928,  7.97098735,  8.95715004,  8.83480997, 11.18775646,
        9.6111922 ,  8.5985271 ,  9.12170247, 10.43016207,  8.23134802,
        8.52845508,  9.06952468,  9.29636144,  9.3922777 , 10.73747369,
       10.13026218,  9.13190702,  8.36821646, 10.72215685,  8.8245954 ,
       10.1606073 ,  9.63192355,  8.31916088,  8.70301665,  9.40

In [31]:
mean_squared_error(y_ridge, y_test)

0.179602879443211