**Steps:**
1. importing all required libraries
2. data loading
3. EDA
   i. treating null values
   ii. deleting duplicates
   iii. deleting outliers
   iv. Checking multicollinearity
   v. label encoding
4. Visualization
5. Feature Selection
6. Model Creation

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
insurance=pd.read_csv("new_insurance_data.csv")
insurance

Unnamed: 0,age,sex,bmi,children,smoker,Claim_Amount,past_consultations,num_of_steps,Hospital_expenditure,NUmber_of_past_hospitalizations,Anual_Salary,region,charges
0,18.0,male,23.210,0.0,no,29087.54313,17.0,715428.0,4.720921e+06,0.0,5.578497e+07,southeast,1121.87390
1,18.0,male,30.140,0.0,no,39053.67437,7.0,699157.0,4.329832e+06,0.0,1.370089e+07,southeast,1131.50660
2,18.0,male,33.330,0.0,no,39023.62759,19.0,702341.0,6.884861e+06,0.0,7.352311e+07,southeast,1135.94070
3,18.0,male,33.660,0.0,no,28185.39332,11.0,700250.0,4.274774e+06,0.0,7.581968e+07,southeast,1136.39940
4,18.0,male,34.100,0.0,no,14697.85941,16.0,711584.0,3.787294e+06,0.0,2.301232e+07,southeast,1137.01100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,33.0,female,35.530,0.0,yes,63142.25346,32.0,1091267.0,1.703805e+08,2.0,3.101107e+09,northwest,55135.40209
1334,31.0,female,38.095,1.0,yes,43419.95227,31.0,1107872.0,2.015152e+08,2.0,3.484216e+09,northeast,58571.07448
1335,52.0,male,34.485,3.0,yes,52458.92353,25.0,1092005.0,2.236450e+08,2.0,3.640807e+09,northwest,60021.39897
1336,45.0,male,30.360,0.0,yes,69927.51664,34.0,1106821.0,2.528924e+08,3.0,4.006359e+09,southeast,62592.87309


In [8]:
insurance.shape

(1338, 13)

In [9]:
insurance.isnull().sum()

age                                 9
sex                                 0
bmi                                 3
children                            5
smoker                              0
Claim_Amount                       14
past_consultations                  6
num_of_steps                        3
Hospital_expenditure                4
NUmber_of_past_hospitalizations     2
Anual_Salary                        6
region                              0
charges                             0
dtype: int64

In [10]:
insurance.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'Claim_Amount',
       'past_consultations', 'num_of_steps', 'Hospital_expenditure',
       'NUmber_of_past_hospitalizations', 'Anual_Salary', 'region', 'charges'],
      dtype='object')

In [11]:
for i in insurance.columns:
    if insurance[i].dtypes=='object':
        insurance[i]= insurance[i].fillna(insurance[i].mode()[0])
    else:
        insurance[i]= insurance[i].fillna(insurance[i].mean())

In [12]:
insurance.isnull().sum()

age                                0
sex                                0
bmi                                0
children                           0
smoker                             0
Claim_Amount                       0
past_consultations                 0
num_of_steps                       0
Hospital_expenditure               0
NUmber_of_past_hospitalizations    0
Anual_Salary                       0
region                             0
charges                            0
dtype: int64

In [16]:
for i in insurance.columns:
    if ((insurance[i].dtypes=='int64') & ( insurance[i].dtypes=='float64')):
        plt.histplot(insurance[i])
        plt.xlabel(i)
        plt.ylabel("count")
        plt.show()

In [17]:
for i in insurance.columns:
     if ((insurance[i].dtypes=='int64') & ( insurance[i].dtypes=='float64')):
        plt.boxplot(insurance[i])
        plt.xlabel(i)
        plt.ylabel("count")
        plt.show()
    

In [18]:
insurance.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'Claim_Amount',
       'past_consultations', 'num_of_steps', 'Hospital_expenditure',
       'NUmber_of_past_hospitalizations', 'Anual_Salary', 'region', 'charges'],
      dtype='object')

In [19]:
col_list=[ 'bmi', 
       'past_consultations',  'Hospital_expenditure',
       'Anual_Salary']

In [20]:
for i in col_list:
    Q1= insurance[i].quantile(0.25)
    Q3= insurance[i].quantile(0.75)
    IQR=Q3-Q1
    insurance = insurance[(insurance[i]>Q1-1.5*IQR)&(insurance[i]<Q3+1.5*IQR)]

In [21]:
insurance.shape

(1030, 13)

In [22]:
insurance.duplicated().sum()

0

In [28]:
#to check Multi Collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [30]:
col_list=[]

In [31]:
for i in insurance.columns:
    if((insurance[i].dtypes!="object") & ( i!="charges")):
        col_list.append(i)

In [32]:
#creating new dataframe
X=insurance[col_list]

In [36]:
#creating new dataframe to store the VIF values
vif_data=pd.DataFrame()
vif_data["features"]=X.columns
vif_data["VIF"]=[variance_inflation_factor(X.values,i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,features,VIF
0,age,15.452095
1,bmi,26.330788
2,children,2.029618
3,Claim_Amount,5.67866
4,past_consultations,6.258017
5,num_of_steps,61.574692
6,Hospital_expenditure,5.204376
7,NUmber_of_past_hospitalizations,12.05206
8,Anual_Salary,5.481823


In [38]:
insurance=insurance.drop(["num_of_steps"],axis=1)

In [40]:
col_list=[]
for i in insurance.columns:
    if((insurance[i].dtypes!="object") & ( i!="charges")):
        col_list.append(i)

X=insurance[col_list]
#creating new dataframe to store the VIF values
vif_data=pd.DataFrame()
vif_data["features"]=X.columns
vif_data["VIF"]=[variance_inflation_factor(X.values,i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,features,VIF
0,age,14.484893
1,bmi,12.261123
2,children,2.017441
3,Claim_Amount,5.24364
4,past_consultations,5.842169
5,Hospital_expenditure,4.929952
6,NUmber_of_past_hospitalizations,10.687768
7,Anual_Salary,5.380679


In [41]:
insurance=insurance.drop(["age"],axis=1)

In [42]:
col_list=[]
for i in insurance.columns:
    if((insurance[i].dtypes!="object") & ( i!="charges")):
        col_list.append(i)

X=insurance[col_list]
#creating new dataframe to store the VIF values
vif_data=pd.DataFrame()
vif_data["features"]=X.columns
vif_data["VIF"]=[variance_inflation_factor(X.values,i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,features,VIF
0,bmi,10.323093
1,children,1.978843
2,Claim_Amount,5.228284
3,past_consultations,5.821933
4,Hospital_expenditure,4.929287
5,NUmber_of_past_hospitalizations,8.31057
6,Anual_Salary,4.723495


In [43]:
insurance=insurance.drop(["bmi"],axis=1)

In [44]:
col_list=[]
for i in insurance.columns:
    if((insurance[i].dtypes!="object") & ( i!="charges")):
        col_list.append(i)

X=insurance[col_list]
#creating new dataframe to store the VIF values
vif_data=pd.DataFrame()
vif_data["features"]=X.columns
vif_data["VIF"]=[variance_inflation_factor(X.values,i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,features,VIF
0,children,1.977037
1,Claim_Amount,4.547332
2,past_consultations,5.049165
3,Hospital_expenditure,4.49263
4,NUmber_of_past_hospitalizations,7.245972
5,Anual_Salary,4.688833


In [45]:
insurance=insurance.drop(["NUmber_of_past_hospitalizations"],axis=1)

In [46]:
col_list=[]
for i in insurance.columns:
    if((insurance[i].dtypes!="object") & ( i!="charges")):
        col_list.append(i)

X=insurance[col_list]
#creating new dataframe to store the VIF values
vif_data=pd.DataFrame()
vif_data["features"]=X.columns
vif_data["VIF"]=[variance_inflation_factor(X.values,i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,features,VIF
0,children,1.713101
1,Claim_Amount,4.152224
2,past_consultations,4.670212
3,Hospital_expenditure,4.320876
4,Anual_Salary,4.279483


In [47]:
X

Unnamed: 0,children,Claim_Amount,past_consultations,Hospital_expenditure,Anual_Salary
0,0.0,29087.54313,17.0,4.720921e+06,5.578497e+07
1,0.0,39053.67437,7.0,4.329832e+06,1.370089e+07
2,0.0,39023.62759,19.0,6.884861e+06,7.352311e+07
3,0.0,28185.39332,11.0,4.274774e+06,7.581968e+07
4,0.0,14697.85941,16.0,3.787294e+06,2.301232e+07
...,...,...,...,...,...
1046,1.0,51168.25474,23.0,1.665982e+07,3.712188e+08
1048,3.0,46619.40230,27.0,1.361938e+07,3.812895e+08
1050,0.0,24382.58056,21.0,1.028991e+07,3.745310e+08
1062,2.0,21596.43846,10.0,1.083030e+07,3.696849e+08


In [49]:
y=insurance.iloc[:,-1] #to consider all rows and only the last column
y

0        1121.87390
1        1131.50660
2        1135.94070
3        1136.39940
4        1137.01100
           ...     
1046    19107.77960
1048    19199.94400
1050    19350.36890
1062    19798.05455
1069    20234.85475
Name: charges, Length: 1030, dtype: float64

In [53]:
#Dividing the data into train and test

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [57]:
x_train,x_test, y_train, y_test= train_test_split(X,y, test_size=0.2)

In [58]:
#now we need to build the model
#Creating an object for the model
l_model=LinearRegression()

In [59]:
#to train the model
l_model.fit(x_train,y_train)

In [60]:
prediction=l_model.predict(x_test)
prediction

array([12714.05291195,  6617.79465929,  5333.52678118, 15595.14207213,
        4767.28538247,  9079.74902113,  1559.08540644,  5290.98553852,
        6286.01349693, 13632.45650978, 10746.7641256 ,  6349.29255322,
        8282.36830003, 17495.16008697, 10395.50406079, 12985.80057969,
       13977.9580911 ,  6000.46768654,  4070.78966147,  9568.6427117 ,
       11520.65753108, 10389.99013846,  9793.42624902,  4276.26588942,
        4136.2499325 ,  3294.56094233, 10033.28010997,  7564.51062346,
       10521.30038095,  1659.44198798, 19654.52821976,  2777.27695798,
        4492.70838755,  5388.14456649,  6772.00393788, 11275.02689564,
        9047.34418189,  6317.66399485,  8328.9281831 ,  4819.84285492,
       10780.00031734,  5765.52094641,  4185.76769945,  5839.91405344,
        8251.3021314 ,  5045.87177078,  5089.91665511, 10670.12486167,
       12915.16783794, 18658.6861881 ,  9149.32093596,  4606.19557457,
        7646.04133626,  1988.80306209,  8292.62051395,  5270.1862607 ,
      

In [61]:
from sklearn.metrics import * # to import all methods available

In [63]:
r2score=r2_score(y_test,prediction)
r2score

0.7419084827253821

In [65]:
X.head(3)

Unnamed: 0,children,Claim_Amount,past_consultations,Hospital_expenditure,Anual_Salary
0,0.0,29087.54313,17.0,4720920.992,55784970.05
1,0.0,39053.67437,7.0,4329831.676,13700885.19
2,0.0,39023.62759,19.0,6884860.774,73523107.27


In [66]:
l_model.predict([[2,37000,20,4500000,120000]])



array([2304.75607199])