In [2]:
import numpy as np
import pandas as pd 

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

import seaborn as sns
import matplotlib.pyplot as plt





In [4]:
medical_df = pd.read_csv(r"E:\Data-py\VVV\raw__data__raw\medical_insurance.csv")
medical_df


Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
medical_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
medical_df['smoker'] = medical_df['smoker'].replace({'yes':1,'no':0})

In [8]:
medical_df['gender'] = medical_df['gender'].replace({'female':0,'male':1})
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [9]:
medical_df.drop(['region'], axis =1, inplace=True)

In [10]:
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 62.8 KB


## Feature Selection

In [11]:
medical_df

Unnamed: 0,age,gender,bmi,children,smoker,charges
0,19,0,27.900,0,1,16884.92400
1,18,1,33.770,1,0,1725.55230
2,28,1,33.000,3,0,4449.46200
3,33,1,22.705,0,0,21984.47061
4,32,1,28.880,0,0,3866.85520
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830
1334,18,0,31.920,0,0,2205.98080
1335,18,0,36.850,0,0,1629.83350
1336,21,0,25.800,0,0,2007.94500


## No Multicolinearity

In [12]:
vif_list = []
for i in range(medical_df.shape[1]-1):
    vif_score = variance_inflation_factor(medical_df.to_numpy(),i)
    vif_list.append(vif_score)


In [13]:
for i in range(medical_df.shape[1]-1):
    vif_score = variance_inflation_factor(medical_df.to_numpy(),i)
    print(f"VIF score for {medical_df.columns[i]}", vif_score)

VIF score for age 9.20696141724788
VIF score for gender 2.0069477927418795
VIF score for bmi 8.78168786760361
VIF score for children 1.803762467317736
VIF score for smoker 3.982120365668009


## Model Training

In [14]:
x = medical_df.drop('charges', axis =1)
y = medical_df['charges']


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=22,test_size=0.25)

In [16]:
x_train

Unnamed: 0,age,gender,bmi,children,smoker
194,18,1,34.430,0,0
879,37,0,29.500,2,0
1230,52,1,34.485,3,1
117,29,0,27.940,1,1
567,41,1,30.590,2,0
...,...,...,...,...,...
356,46,1,43.890,3,0
960,19,0,39.615,1,0
812,54,1,21.010,2,0
132,53,0,35.900,2,0


In [17]:
x_test

Unnamed: 0,age,gender,bmi,children,smoker
1231,20,0,21.80,0,1
768,64,0,39.70,0,0
847,23,1,50.38,1,0
510,56,1,32.11,1,0
363,21,0,26.40,1,0
...,...,...,...,...,...
1091,55,0,29.83,0,0
1001,24,1,32.70,0,1
969,39,0,34.32,5,0
342,60,0,27.55,0,0


In [18]:
y_train

194      1137.46970
879      6311.95200
1230    60021.39897
117     19107.77960
567      7256.72310
           ...     
356      8944.11510
960      2730.10785
812     11013.71190
132     11163.56800
885     19719.69470
Name: charges, Length: 1003, dtype: float64

In [19]:
y_test

1231    20167.33603
768     14319.03100
847      2438.05520
510     11763.00090
363      2597.77900
           ...     
1091    11286.53870
1001    34472.84100
969      8596.82780
342     13217.09450
811      6360.99360
Name: charges, Length: 335, dtype: float64

## Instantiating Linear Regression Model

In [20]:
linear_reg = LinearRegression()
linear_reg.fit(x_train,y_train)

In [21]:
linear_reg.intercept_  # Gradient Descent C and M

-12794.814744005202

In [22]:
linear_reg.coef_

array([  263.0843498 ,  -295.12480401,   344.84099118,   474.65517618,
       23271.43489804])

#### Evaluation on Training Data

In [23]:
y_pred_train = linear_reg.predict(x_train)
y_pred_train

array([ 3518.45407483,  8061.42579083, 37177.68864907, ...,
        9311.03491825, 14477.75773119, 29051.09959472])

In [24]:
mse = mean_squared_error(y_train, y_pred_train)
mse

36922825.74537853

In [25]:
mae = mean_absolute_error(y_train,y_pred_train)
mae

4278.836525087316

In [26]:
root_mse = np.sqrt(mse)
root_mse

6076.415534291457

In [27]:
r2score = r2_score(y_train,y_pred_train)
r2score

0.7462015795625183

#### Evaluation on Testing Data

In [28]:
y_pred_test = linear_reg.predict(x_test)


In [29]:
mse_test = mean_squared_error(y_test,y_pred_test)
mse_test

36287772.396299444

In [30]:
mae_test = mean_absolute_error(y_test, y_pred_test)
mae_test

4161.9214762431

In [31]:
r2_test = r2_score(y_test, y_pred_test)
r2_test 

0.7575405070402109

### Lasso Regression 

##### Evaluation on Training Data 

In [32]:
linear_reg_lasso = Lasso()
linear_reg_lasso.fit(x_train,y_train)

In [33]:
y_pred_train_lasso = linear_reg_lasso.predict(x_train)
mse = mean_squared_error(y_train,y_pred_train_lasso)
mae = mean_absolute_error(y_train,y_pred_train_lasso)
r_square_lasso = r2_score(y_train,y_pred_train_lasso)
r_square_lasso

0.7462015005031226

#### Evaluation on Testing Data

In [34]:
y_pred_test_lasso = linear_reg_lasso.predict(x_test)
mse = mean_squared_error(y_test,y_pred_test_lasso)
mae = mean_absolute_error(y_test,y_pred_test_lasso)
r_square_lasso = r2_score(y_test,y_pred_test_lasso)
r_square_lasso

0.7575220266588686

## RIDGE Regression

In [35]:
ridge_model = Ridge()
ridge_model.fit(x_train,y_train)

#### Gradient Descent

In [36]:
ridge_model.coef_

array([  263.01752074,  -286.64693815,   344.86084723,   474.66214842,
       23130.91532501])

In [37]:
ridge_model.intercept_


-12767.841551954854

## Evaluation on Training Data 

In [38]:
y_pred_train_ridge = ridge_model.predict(x_train)


In [39]:
mse = mean_squared_error(y_train,y_pred_train_ridge)
mae = mean_absolute_error(y_train,y_pred_train_ridge)
r_square_lasso = r2_score(y_train,y_pred_train_ridge)
r_square_lasso

0.746179287601984

## Evaluation on Testing Data

In [40]:
y_pred_test_ridge = linear_reg_lasso.predict(x_test)
mse = mean_squared_error(y_test,y_pred_test_ridge)
mae = mean_absolute_error(y_test,y_pred_test_ridge)
r_square_lasso = r2_score(y_test,y_pred_test_ridge)
r_square_lasso

0.7575220266588686