In [2]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import json

import warnings
warnings.filterwarnings('ignore')

In [None]:
To predict medical insurance charges or amount by uing age, gender, bmi, number of 
children, smoker, and its region

In [4]:
df= pd.read_csv("medical_insurance.csv")
df


Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
df['age']

0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64

In [9]:
df['gender']

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: gender, Length: 1338, dtype: object

In [10]:
df['gender'].value_counts()

gender
male      676
female    662
Name: count, dtype: int64

In [11]:
df['gender'].value_counts().to_dict()

{'male': 676, 'female': 662}

In [14]:
df['gender'].replace({'male':1, 'female':0}, inplace=True)


In [15]:
df['gender'].value_counts().to_dict()

{1: 676, 0: 662}

In [16]:
df['smoker']

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1338, dtype: object

In [17]:
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [18]:
df['smoker'].value_counts().to_dict()

{'no': 1064, 'yes': 274}

In [19]:
df['smoker'].replace({'no':0, 'yes':1}, inplace=True)

In [20]:
df['smoker'].value_counts().to_dict()

{0: 1064, 1: 274}

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [22]:
df['region']

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

In [23]:
df = pd.get_dummies(df, columns=['region'], dtype = int)
df

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [24]:
df.corr()

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.299008,0.002475,-0.000407,-0.011642,0.010016
gender,-0.020856,1.0,0.046371,0.017163,0.076185,0.057292,-0.002425,-0.011156,0.017117,-0.004184
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.198341,-0.138156,-0.135996,0.270025,-0.006205
children,0.042469,0.017163,0.012759,1.0,0.007673,0.067998,-0.022808,0.024806,-0.023066,0.021914
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,0.787251,0.002811,-0.036945,0.068498,-0.036945
charges,0.299008,0.057292,0.198341,0.067998,0.787251,1.0,0.006349,-0.039905,0.073982,-0.04321
region_northeast,0.002475,-0.002425,-0.138156,-0.022808,0.002811,0.006349,1.0,-0.320177,-0.345561,-0.320177
region_northwest,-0.000407,-0.011156,-0.135996,0.024806,-0.036945,-0.039905,-0.320177,1.0,-0.346265,-0.320829
region_southeast,-0.011642,0.017117,0.270025,-0.023066,0.068498,0.073982,-0.345561,-0.346265,1.0,-0.346265
region_southwest,0.010016,-0.004184,-0.006205,0.021914,-0.036945,-0.04321,-0.320177,-0.320829,-0.346265,1.0


In [26]:
df.corr().loc[['charges']]

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
charges,0.299008,0.057292,0.198341,0.067998,0.787251,1.0,0.006349,-0.039905,0.073982,-0.04321


In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [30]:
x= df.drop('charges', axis = 1)
vif_values = []
for i in range (x.shape[1]):
    vif_value = variance_inflation_factor(x.values, i)
    vif_values.append(vif_value)

In [31]:
s1 = pd.Series(vif_values, index = x.columns)
s1

age                  1.016822
gender               1.008900
bmi                  1.106630
children             1.004011
smoker               1.012074
region_northeast     8.603069
region_northwest     8.636205
region_southeast    11.535195
region_southwest     9.218449
dtype: float64

In [33]:
x= df.drop('charges', axis = 1)
y = df['charges'] 
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=0,shuffle=True) 

In [35]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(x_train, y_train)


In [37]:
# To Train Data
y_pred_train = linear_reg_model.predict(x_train)
mse=mean_squared_error(y_train,y_pred_train)
print("MSE:",mse )

rmse = np.sqrt(mse)
print("RMSE:",rmse )

mae = mean_absolute_error(y_train ,y_pred_train)
print("MAE :",mae)

R2 = r2_score(y_train ,y_pred_train)
print('r_squared :',R2)

n = x_train.shape[0] # no of rows
k = x_train.shape[1] # no of columns
adj_r2 = 1 - (((1-R2)*(n-1))/(n-k-1))
print("Adjusted R2 Value: ", adj_r2)

MSE: 37701533.128629126
RMSE: 6140.157418880165
MAE : 4234.551143314699
r_squared : 0.7370262574551634
Adjusted R2 Value:  0.7347934615278959


In [38]:
# Test Data
y_pred_test = linear_reg_model.predict(x_test)

mse = mean_squared_error(y_test ,y_pred_test)
print("MSE :",mse) # squared unit

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test ,y_pred_test)
print("MAE :",mae)

R2 = r2_score(y_test ,y_pred_test)
print('r_squared :',R2)

n = x_test.shape[0] # no of rows
k = x_test.shape[1] # no of columns
adj_r2 = 1 - (((1-R2)*(n-1))/(n-k-1))
print("Adjusted R2 Value: ", adj_r2)

MSE : 31827950.22952383
RMSE : 5641.626558850189
MAE : 3933.272649405234
r_squared : 0.7999876970680434
Adjusted R2 Value:  0.7930105237099518


In [39]:
x.columns


Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [40]:
label_enc_data = {"smoker" : {'no': 0, 'yes': 1},
                "gender" : {'male':1, 'female':0}}

column_names = list(x_train.columns)
print(column_names)

['age', 'gender', 'bmi', 'children', 'smoker', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']


In [41]:
age = 34
gender = 'male'
bmi =35.78
children = 2
smoker = "yes"
region = 'northwest'

In [43]:
test_array=np.zeros((1,x_train.shape[1]))
test_array

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [47]:
linear_reg_model.predict(x_test[45:46])

array([5092.58923328])

In [48]:
linear_reg_model.predict(x_test.head(5))

array([11169.92711879,  9486.70908541, 38181.12305256, 16266.31328948,
        6914.64800729])

In [49]:
linear_reg_model.predict(test_array)

array([-12311.91360565])

In [50]:
test_array

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [65]:
age = 25
gender = 'female'
bmi =29.22
children = 0
smoker = "no"
region = 'northeast'

In [62]:
print(column_names)

['age', 'gender', 'bmi', 'children', 'smoker', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']


In [66]:
test_array=np.zeros((1,x_train.shape[1]))

test_array[0,0] = age
test_array[0,1]= label_enc_data['gender'][gender]
test_array[0,2] = bmi
test_array[0,3] = children
test_array[0,4] =label_enc_data['smoker'][smoker]
region = 'southwest'
region = f"region_{region}"
region_index = column_names.index(region)
print("region index :",region_index)
test_array[0,region_index] = 1
test_array

region index : 8


array([[25.  ,  0.  , 29.22,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ]])

In [67]:
prediction = linear_reg_model.predict(test_array)[0]
print("Predicted Price is:", np.around(prediction,3))

Predicted Price is: 3569.324
