In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Read 'cleaned_Insurance.csv' (path relative to the working directory)
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cleaned_Insurance.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,18,33.8,1,1725.55,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,33,22.7,0,21984.47,0,1,1,0,0,1,0,0
4,32,28.9,0,3866.86,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1332,50,31.0,3,10600.55,0,1,1,0,0,1,0,0
1333,18,31.9,0,2205.98,1,0,1,0,1,0,0,0
1334,18,36.9,0,1629.83,1,0,1,0,0,0,1,0
1335,21,25.8,0,2007.95,1,0,1,0,0,0,0,1


In [3]:
# Separate the regression target ('expenses') from the predictor columns.
y = df['expenses']
X = df.drop(columns=['expenses'])

**Train Test Split**

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

**Finding the best alpha value**

In [5]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths: the integers 10 through 19.
param_grid = {'alpha': range(10, 20)}
estimator = Lasso()

# 5-fold cross-validated search scored by negated RMSE
# (scikit-learn maximises scores, so the error is negated).
gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
gs.fit(X_train, y_train)

# Bare last expression: the alpha with the best mean cross-validation score.
gs.best_params_

{'alpha': 16}

In [6]:
#Modelling
# Fit Lasso on the training set with the alpha selected by the grid search.
# The value is read from `gs` instead of being typed in by hand, so this
# cell cannot drift out of sync if the grid or the data change.
model = Lasso(alpha=gs.best_params_['alpha'])
model.fit(X_train, y_train)

print('Coefficients: ', model.coef_.tolist())

# Label each coefficient with its feature so that zero / near-zero ones can
# be identified by name rather than by position in the list printed above.
pd.Series(model.coef_, index=X_train.columns, name='coefficient')

Coefficients:  [255.41672873366147, 335.0881076407792, 505.69015201518704, 212.85702825676304, -1.2251475071766879e-13, -24058.639361029444, 6.680423691270069e-12, 549.9662217524934, 304.5238277974574, -310.1065960633805, -0.0]


**Drop `region_southwest` because its coefficient is exactly 0, and drop `sex_male` and `smoker_yes` as well because their coefficients are effectively 0 (on the order of 1e-12 or smaller).**

In [7]:
# Show the training feature matrix as the cell output.
X_train

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
981,31,25.9,3,0,1,0,1,0,0,0,1
906,44,32.3,1,1,0,1,0,0,0,1,0
22,18,34.1,0,0,1,1,0,0,0,1,0
1260,28,37.1,1,0,1,1,0,0,0,0,1
1064,42,25.3,1,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1180,24,29.9,0,1,0,1,0,0,1,0,0
1147,55,21.5,1,0,1,1,0,0,0,0,1
527,51,25.8,1,1,0,1,0,0,0,0,1
1149,18,30.3,0,1,0,1,0,1,0,0,0


In [8]:
# Columns removed from both splits so train and test keep identical features.
# Note: running this cell a second time raises KeyError, because the
# columns are already gone from X_train / X_test.
dropped_columns = ['region_southwest', 'sex_male', 'smoker_yes']

X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

# **Final Model**

In [9]:
# Show the training feature matrix as it stands at this point in the notebook.
X_train

Unnamed: 0,age,bmi,children,sex_female,smoker_no,region_northeast,region_northwest,region_southeast
981,31,25.9,3,0,0,0,0,0
906,44,32.3,1,1,1,0,0,1
22,18,34.1,0,0,1,0,0,1
1260,28,37.1,1,0,1,0,0,0
1064,42,25.3,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...
1180,24,29.9,0,1,1,0,1,0
1147,55,21.5,1,0,1,0,0,0
527,51,25.8,1,1,1,0,0,0
1149,18,30.3,0,1,1,1,0,0


In [10]:
#Modelling
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Unpenalised least-squares fit on the reduced feature set. This is the same
# objective as Lasso with alpha=0, but scikit-learn advises against using
# alpha=0 with the Lasso object for numerical reasons and recommends
# LinearRegression instead.
model = LinearRegression()
model.fit(X_train, y_train)

print('Coefficients: ', model.coef_.tolist())
print('Intercept: ', model.intercept_)

#Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

#Evaluation
rmse_train = np.sqrt(mean_squared_error(y_train, ypred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, ypred_test))

# The scorer yields negated RMSE per fold; abs() of the mean turns it back
# into a positive error comparable with rmse_train / rmse_test.
cv = abs(
    cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    ).mean()
)

print('\n')
print('RMSE(train): ', rmse_train)
print('CV-Score: ', cv)
print('RMSE(test): ', rmse_test)

# Heuristic verdict: the cross-validated and test RMSE must each lie within
# 5% of the training RMSE. This only checks how closely the errors agree,
# not how small they are.
tolerance = 0.05 * rmse_train
if abs(rmse_train - cv) <= tolerance and abs(rmse_train - rmse_test) <= tolerance:
    print('Good Model')
else:
    print('Bad Model')

Coefficients:  [255.35225841276323, 338.48497920193364, 515.698097472346, 284.8816885129136, -24164.79047204638, 682.1292828874103, 441.5595012827086, -315.7563533813873]
Intercept:  11206.293976968036


RMSE(train):  6058.584392804124
CV-Score:  6123.075028202857
RMSE(test):  5997.974813586831
Good Model
