In [47]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

### **Pre-processing data**

In [30]:
df = pd.read_csv('temp_data.csv')

In [31]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [32]:
cols = ['sex', 'region']
cols

['sex', 'region']

In [33]:
df = df.drop(cols, 1)

In [34]:
dum_df = pd.get_dummies(df, columns = ['smoker'])
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,1,0
1335,18,36.850,0,1629.83350,1,0
1336,21,25.800,0,2007.94500,1,0


In [35]:
dum_df = dum_df.drop('smoker_no', 1)

In [36]:
dum_df

Unnamed: 0,age,bmi,children,charges,smoker_yes
0,19,27.900,0,16884.92400,1
1,18,33.770,1,1725.55230,0
2,28,33.000,3,4449.46200,0
3,33,22.705,0,21984.47061,0
4,32,28.880,0,3866.85520,0
...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0
1334,18,31.920,0,2205.98080,0
1335,18,36.850,0,1629.83350,0
1336,21,25.800,0,2007.94500,0


In [37]:
y = np.log1p(dum_df['charges'])
X = dum_df.drop('charges', 1)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [39]:
scaler = StandardScaler()
X_train['bmi'] = scaler.fit_transform(X_train[['bmi']])
X_test['bmi'] = scaler.transform(X_test[['bmi']])

In [40]:
X_train['age'] = X_train['age'].apply(lambda x: x // 10)
X_test['age'] = X_test['age'].apply(lambda x: x // 10)

In [41]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
y_pred = linear_regression.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
rmse2 = np.sqrt(mse)
print(rmse2)

0.49315823223323557


In [48]:
lasso_regression = Lasso(alpha = 0.01)
lasso_regression.fit(X_train, y_train)
y_pred = lasso_regression.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
rmse2 = np.sqrt(mse)
print(rmse2)

0.4977096803112698


In [49]:
ridge_regression = Ridge(alpha = 0.005)
ridge_regression.fit(X_train, y_train)
y_pred = ridge_regression.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
rmse2 = np.sqrt(mse)
print(rmse2)

0.49316020563945234


In [50]:
elastic_net = ElasticNet(alpha = 0.01)
elastic_net.fit(X_train, y_train)
y_pred = elastic_net.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
rmse2 = np.sqrt(mse)
print(rmse2)

0.49792385237993975


In [45]:
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
rmse2 = np.sqrt(mse)
print(rmse2)

0.47650974951066055
