In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Location of the pre-processed data set, relative to the notebook's working directory.
DATA_PATH = 'cleaned_Insurance.csv'

# Load the cleaned insurance data.
df = pd.read_csv(DATA_PATH)

# Fail early with a readable message if the regression target used below is missing.
assert 'expenses' in df.columns, "expected an 'expenses' column in the input data"

df

Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,18,33.8,1,1725.55,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,33,22.7,0,21984.47,0,1,1,0,0,1,0,0
4,32,28.9,0,3866.86,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1332,50,31.0,3,10600.55,0,1,1,0,0,1,0,0
1333,18,31.9,0,2205.98,1,0,1,0,1,0,0,0
1334,18,36.9,0,1629.83,1,0,1,0,0,0,1,0
1335,21,25.8,0,2007.95,1,0,1,0,0,0,0,1


In [3]:
# Regression target: the `expenses` column.
y = df['expenses']

# Feature matrix: every remaining column of the frame.
# NOTE(review): each one-hot group (sex_*, smoker_*, region_*) appears to keep all
# of its levels, which would make the columns of X linearly dependent — confirm
# this is intended before interpreting individual coefficients.
X = df.drop(columns=['expenses'])

**Train Test Split**

In [4]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# partition reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

In [5]:
# Inspect the training feature matrix returned by train_test_split above.
X_train

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
981,31,25.9,3,0,1,0,1,0,0,0,1
906,44,32.3,1,1,0,1,0,0,0,1,0
22,18,34.1,0,0,1,1,0,0,0,1,0
1260,28,37.1,1,0,1,1,0,0,0,0,1
1064,42,25.3,1,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1180,24,29.9,0,1,0,1,0,0,1,0,0
1147,55,21.5,1,0,1,1,0,0,0,0,1
527,51,25.8,1,1,0,1,0,0,0,0,1
1149,18,30.3,0,1,0,1,0,1,0,0,0


In [6]:
# Modelling: baseline linear regression on every column of X_train.
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction: in-sample predictions on the training data.
ypred_train = model.predict(X_train)

# Evaluation: root-mean-squared error on the training data.
rmse_train = np.sqrt(mean_squared_error(y_train, ypred_train))

# 5-fold cross-validation on the training data. The 'neg_mean_squared_error'
# scorer returns the *negated* MSE of each fold, so the sign of the averaged
# score is removed before taking the square root.
cv_neg_mse = cross_val_score(
    model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
cv = np.sqrt(abs(cv_neg_mse.mean()))

print('RMSE(train): ', rmse_train)
print('CV-Score: ', cv)

RMSE(train):  6058.584392804124
CV-Score:  6139.717783324624


In [7]:
# Fit ordinary least squares with statsmodels to get per-coefficient p-values,
# which drive the column-dropping steps that follow.
# No constant column is added here (there is no sm.add_constant call), so the
# design matrix is exactly the columns of X_train.
# The result gets its own name so the scikit-learn `model` fitted earlier is not
# overwritten by an object of a different type.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.752
Method:,Least Squares,F-statistic:,406.6
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,1.08e-316
Time:,09:20:59,Log-Likelihood:,-10827.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1060,BIC:,21720.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.3523,13.384,19.079,0.000,229.090,281.615
bmi,338.4850,31.713,10.673,0.000,276.258,400.712
children,515.6981,156.339,3.299,0.001,208.930,822.466
sex_female,-70.2301,463.357,-0.152,0.880,-979.431,838.970
sex_male,-355.1118,473.104,-0.751,0.453,-1283.439,573.216
smoker_no,-1.23e+04,466.642,-26.348,0.000,-1.32e+04,-1.14e+04
smoker_yes,1.187e+04,507.364,23.395,0.000,1.09e+04,1.29e+04
region_northeast,373.8107,373.428,1.001,0.317,-358.931,1106.553
region_northwest,133.2409,373.116,0.357,0.721,-598.889,865.371

0,1,2,3
Omnibus:,226.446,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,519.744
Skew:,1.152,Prob(JB):,1.38e-113
Kurtosis:,5.521,Cond. No.,3.75e+17


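**Dropping column `sex_female`: it has the highest p-value (0.880), which is above the 0.05 threshold.**

Note: the condition number of the fit above (about 3.75e+17) indicates a rank-deficient design, because the one-hot groups keep all of their levels. The individual p-values at this step should therefore be read with caution; removing one dummy of a complementary pair mainly removes redundancy.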

In [8]:
# Remove the `sex_female` dummy from both splits so that the training and test
# matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['sex_female']) for features in (X_train, X_test)
)

In [9]:
# Refit OLS on the current X_train (the `sex_female` column was removed in the
# previous cell) and show the updated coefficient table.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.752
Method:,Least Squares,F-statistic:,406.6
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,1.08e-316
Time:,09:20:59,Log-Likelihood:,-10827.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1060,BIC:,21720.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.3523,13.384,19.079,0.000,229.090,281.615
bmi,338.4850,31.713,10.673,0.000,276.258,400.712
children,515.6981,156.339,3.299,0.001,208.930,822.466
sex_male,-284.8817,374.434,-0.761,0.447,-1019.598,449.835
smoker_no,-1.234e+04,733.251,-16.832,0.000,-1.38e+04,-1.09e+04
smoker_yes,1.182e+04,783.269,15.094,0.000,1.03e+04,1.34e+04
region_northeast,350.4007,462.693,0.757,0.449,-557.497,1258.298
region_northwest,109.8309,461.387,0.238,0.812,-795.506,1015.167
region_southeast,-647.4849,536.349,-1.207,0.228,-1699.912,404.942

0,1,2,3
Omnibus:,226.446,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,519.744
Skew:,1.152,Prob(JB):,1.38e-113
Kurtosis:,5.521,Cond. No.,4.85e+17


**Dropping column `region_northwest`: it has the highest p-value (0.812), which is above the 0.05 threshold.**

In [10]:
# Remove the `region_northwest` dummy from both splits so that the training and
# test matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['region_northwest']) for features in (X_train, X_test)
)

In [11]:
# Refit OLS on the current X_train (the `region_northwest` column was removed in
# the previous cell) and show the updated coefficient table.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.752
Method:,Least Squares,F-statistic:,406.6
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,1.08e-316
Time:,09:20:59,Log-Likelihood:,-10827.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1060,BIC:,21720.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.3523,13.384,19.079,0.000,229.090,281.615
bmi,338.4850,31.713,10.673,0.000,276.258,400.712
children,515.6981,156.339,3.299,0.001,208.930,822.466
sex_male,-284.8817,374.434,-0.761,0.447,-1019.598,449.835
smoker_no,-1.223e+04,1092.437,-11.197,0.000,-1.44e+04,-1.01e+04
smoker_yes,1.193e+04,1148.658,10.388,0.000,9678.834,1.42e+04
region_northeast,240.5698,537.550,0.448,0.655,-814.213,1295.352
region_southeast,-757.3159,539.192,-1.405,0.160,-1815.320,300.689
region_southwest,-441.5595,537.264,-0.822,0.411,-1495.782,612.663

0,1,2,3
Omnibus:,226.446,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,519.744
Skew:,1.152,Prob(JB):,1.38e-113
Kurtosis:,5.521,Cond. No.,430.0


**Dropping column `region_northeast`: it has the highest p-value (0.655), which is above the 0.05 threshold.**

In [12]:
# Remove the `region_northeast` dummy from both splits so that the training and
# test matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['region_northeast']) for features in (X_train, X_test)
)

In [13]:
# Refit OLS on the current X_train (the `region_northeast` column was removed in
# the previous cell) and show the updated coefficient table.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,465.0
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,5.33e-318
Time:,09:20:59,Log-Likelihood:,-10827.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1061,BIC:,21710.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.3230,13.379,19.084,0.000,229.071,281.575
bmi,338.7574,31.695,10.688,0.000,276.565,400.949
children,513.7227,156.217,3.289,0.001,207.193,820.253
sex_male,-284.3949,374.291,-0.760,0.448,-1018.831,450.041
smoker_no,-1.212e+04,1062.188,-11.409,0.000,-1.42e+04,-1e+04
smoker_yes,1.206e+04,1115.147,10.810,0.000,9867.091,1.42e+04
region_southeast,-879.1525,465.240,-1.890,0.059,-1792.048,33.743
region_southwest,-561.8482,465.022,-1.208,0.227,-1474.315,350.619

0,1,2,3
Omnibus:,226.538,Durbin-Watson:,2.086
Prob(Omnibus):,0.0,Jarque-Bera (JB):,519.101
Skew:,1.154,Prob(JB):,1.9e-113
Kurtosis:,5.516,Cond. No.,415.0


**Dropping column `sex_male`: it has the highest p-value (0.448), which is above the 0.05 threshold.**

In [14]:
# Remove the `sex_male` dummy from both splits so that the training and test
# matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['sex_male']) for features in (X_train, X_test)
)

In [15]:
# Refit OLS on the current X_train (the `sex_male` column was removed in the
# previous cell) and show the updated coefficient table.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,542.6
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,2.93e-319
Time:,09:20:59,Log-Likelihood:,-10827.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1062,BIC:,21700.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.5250,13.374,19.107,0.000,229.283,281.767
bmi,337.3888,31.637,10.664,0.000,275.310,399.468
children,515.6292,156.166,3.302,0.001,209.200,822.059
smoker_no,-1.222e+04,1052.963,-11.609,0.000,-1.43e+04,-1.02e+04
smoker_yes,1.192e+04,1100.760,10.829,0.000,9760.687,1.41e+04
region_southeast,-881.3891,465.139,-1.895,0.058,-1794.084,31.306
region_southwest,-556.6317,464.879,-1.197,0.231,-1468.816,355.553

0,1,2,3
Omnibus:,227.504,Durbin-Watson:,2.082
Prob(Omnibus):,0.0,Jarque-Bera (JB):,523.714
Skew:,1.156,Prob(JB):,1.89e-114
Kurtosis:,5.532,Cond. No.,410.0


**Dropping column `region_southwest`: it has the highest p-value (0.231), which is above the 0.05 threshold.**

In [16]:
# Remove the `region_southwest` dummy from both splits so that the training and
# test matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['region_southwest']) for features in (X_train, X_test)
)

In [17]:
# Refit OLS on the current X_train (the `region_southwest` column was removed in
# the previous cell) and show the updated coefficient table.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,650.6
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,2.23e-320
Time:,09:20:59,Log-Likelihood:,-10828.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1063,BIC:,21700.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,255.5616,13.376,19.106,0.000,229.315,281.808
bmi,334.0041,31.517,10.597,0.000,272.161,395.847
children,512.9277,156.182,3.284,0.001,206.468,819.387
smoker_no,-1.231e+04,1050.624,-11.718,0.000,-1.44e+04,-1.02e+04
smoker_yes,1.185e+04,1099.279,10.777,0.000,9690.271,1.4e+04
region_southeast,-682.0639,434.418,-1.570,0.117,-1534.478,170.350

0,1,2,3
Omnibus:,228.398,Durbin-Watson:,2.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,527.055
Skew:,1.16,Prob(JB):,3.5599999999999997e-115
Kurtosis:,5.541,Cond. No.,410.0


**Dropping column `region_southeast`: it has the highest p-value (0.117), which is above the 0.05 threshold.**

In [18]:
# Remove the `region_southeast` dummy from both splits so that the training and
# test matrices keep identical columns.
X_train, X_test = (
    features.drop(columns=['region_southeast']) for features in (X_train, X_test)
)

In [19]:
# Refit OLS on the current X_train (the `region_southeast` column was removed in
# the previous cell). The notebook treats the resulting fit as the end point of
# the p-value based elimination.
# A dedicated name is used so the scikit-learn `model` is not overwritten.
ols_results = sm.OLS(y_train, X_train).fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.753
Model:,OLS,Adj. R-squared:,0.752
Method:,Least Squares,F-statistic:,811.5
Date:,"Thu, 27 Feb 2025",Prob (F-statistic):,2.52e-321
Time:,09:20:59,Log-Likelihood:,-10829.0
No. Observations:,1069,AIC:,21670.0
Df Residuals:,1064,BIC:,21690.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,256.5015,13.372,19.182,0.000,230.263,282.740
bmi,320.0288,30.255,10.578,0.000,260.662,379.395
children,517.2952,156.264,3.310,0.001,210.674,823.917
smoker_no,-1.21e+04,1042.958,-11.605,0.000,-1.41e+04,-1.01e+04
smoker_yes,1.202e+04,1094.618,10.979,0.000,9870.493,1.42e+04

0,1,2,3
Omnibus:,228.038,Durbin-Watson:,2.072
Prob(Omnibus):,0.0,Jarque-Bera (JB):,528.048
Skew:,1.156,Prob(JB):,2.1699999999999997e-115
Kurtosis:,5.551,Cond. No.,407.0


Now no remaining column has a p-value above 0.05, so this is our **Final Model**.

# **Final Model**

In [20]:
# Modelling: refit the scikit-learn regression on the columns that remain in
# X_train after the elimination steps above.
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction: on the training data and on the held-out test data.
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation: root-mean-squared error on each split.
rmse_train = np.sqrt(mean_squared_error(y_train, ypred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, ypred_test))

# 5-fold cross-validation on the training data. The 'neg_mean_squared_error'
# scorer returns the *negated* MSE of each fold, so the sign of the averaged
# score is removed before taking the square root.
cv_neg_mse = cross_val_score(
    model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
cv = np.sqrt(abs(cv_neg_mse.mean()))

print('RMSE(train): ', rmse_train)
print('CV-Score: ', cv)
print('RMSE(test): ', rmse_test)

# Heuristic acceptance rule: the model is labelled "good" when both the
# cross-validated RMSE and the test RMSE lie within this fraction of the
# training RMSE, i.e. there is no large gap between in-sample and
# out-of-sample error.
RMSE_TOLERANCE = 0.05

max_gap = RMSE_TOLERANCE * rmse_train
cv_gap = abs(rmse_train - cv)
test_gap = abs(rmse_train - rmse_test)

if cv_gap <= max_gap and test_gap <= max_gap:
    print('Good Model')
else:
    print('Bad Model')

RMSE(train):  6071.922859322231
CV-Score:  6128.69541090626
RMSE(test):  6012.461560078545
Good Model
