In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge

### **Pre-processing data**

In [3]:
df = pd.read_csv('temp_data.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
df = df.drop(["region"], 1)

In [5]:
df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [6]:
y = np.log10(df['charges'])
X = df.drop('charges', 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [8]:
ohe = OneHotEncoder()
scaler = StandardScaler()
ridge = Ridge(alpha = 0.1)
ct = make_column_transformer(
    (ohe, ["sex", "smoker"]),
    (scaler, ["bmi"]),
    remainder = "passthrough"
)
pipeline = make_pipeline(ct, ridge)

In [9]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['bmi'])])),
                ('ridge', Ridge(alpha=0.1))])

In [10]:
y_pred = pipeline.predict(X_test)
y_predict = 10 ** y_pred

In [11]:
print(mean_squared_error(y_pred, y_test))

0.036655611092324805


In [12]:
param_grid = {
    "ridge__alpha": [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
search = GridSearchCV(pipeline, param_grid, scoring = mean_squared_error)
search.fit(X_train, y_train)
print(search.best_params_)

{'ridge__alpha': 0.01}


In [13]:
y_pred = search.predict(X_test)
y_predict = 10 ** y_pred
print(mean_squared_error(y_pred, y_test))

0.03665686378909146
