In [None]:
# Import all required libraries

import pandas as pd
import numpy as np

In [None]:
# Create a dataframe

df = pd.read_csv("new_insurance_data.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,Claim_Amount,past_consultations,num_of_steps,Hospital_expenditure,NUmber_of_past_hospitalizations,Anual_Salary,region,charges
0,18.0,male,23.21,0.0,no,29087.54313,17.0,715428.0,4720920.992,0.0,55784970.05,southeast,1121.8739
1,18.0,male,30.14,0.0,no,39053.67437,7.0,699157.0,4329831.676,0.0,13700885.19,southeast,1131.5066
2,18.0,male,33.33,0.0,no,39023.62759,19.0,702341.0,6884860.774,0.0,73523107.27,southeast,1135.9407
3,18.0,male,33.66,0.0,no,28185.39332,11.0,700250.0,4274773.55,0.0,75819679.6,southeast,1136.3994
4,18.0,male,34.1,0.0,no,14697.85941,16.0,711584.0,3787293.921,0.0,23012320.01,southeast,1137.011


In [None]:
# Replace null values with the correct corresponding values

for i in df.columns:
    if df[i].dtypes == "object":
        df[i] = df[i].fillna(df[i].mode()[0])
    else:
        df[i] = df[i].fillna(df[i].mean())

In [None]:
# Check if values between IQR and extract only those values

def remove_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] > lower_bound) & (df[column] < upper_bound)]

columns_to_check = ['bmi', 'Hospital_expenditure', 'Anual_Salary', 'charges']
for column in columns_to_check:
    df = remove_outliers(column)

In [None]:
# Get VIF for each column , if vif > 5 , we should not use it

from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df.columns:
    if (df[col].dtypes != "object") & (col !="charges"):
        col_list.append(col)

x = df[col_list]
vif_data = pd.DataFrame()
vif_data['features'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values , i) for i in range(len(x.columns))]
print(vif_data)

                          features        VIF
0                              age  15.452095
1                              bmi  26.330788
2                         children   2.029618
3                     Claim_Amount   5.678660
4               past_consultations   6.258017
5                     num_of_steps  61.574692
6             Hospital_expenditure   5.204376
7  NUmber_of_past_hospitalizations  12.052060
8                     Anual_Salary   5.481823


In [None]:
df.drop(columns="num_of_steps" , inplace = True)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df.columns:
    if (df[col].dtypes != "object") & (col !="charges"):
        col_list.append(col)

x = df[col_list]
vif_data = pd.DataFrame()
vif_data['features'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values , i) for i in range(len(x.columns))]
print(vif_data)

                          features        VIF
0                              age  14.484893
1                              bmi  12.261123
2                         children   2.017441
3                     Claim_Amount   5.243640
4               past_consultations   5.842169
5             Hospital_expenditure   4.929952
6  NUmber_of_past_hospitalizations  10.687768
7                     Anual_Salary   5.380679


In [None]:
df.drop(columns="bmi" , inplace = True)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df.columns:
    if (df[col].dtypes != "object") & (col !="charges"):
        col_list.append(col)

x = df[col_list]
vif_data = pd.DataFrame()
vif_data['features'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values , i) for i in range(len(x.columns))]
print(vif_data)

                          features        VIF
0                              age  12.195367
1                         children   2.003728
2                     Claim_Amount   4.742503
3               past_consultations   5.279585
4             Hospital_expenditure   4.549780
5  NUmber_of_past_hospitalizations  10.576371
6                     Anual_Salary   5.137503


In [None]:
df.drop(columns=["age",'NUmber_of_past_hospitalizations'] , inplace = True)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df.columns:
    if (df[col].dtypes != "object") & (col !="charges"):
        col_list.append(col)

x = df[col_list]
vif_data = pd.DataFrame()
vif_data['features'] = x.columns
vif_data['VIF'] = [variance_inflation_factor(x.values , i) for i in range(len(x.columns))]
print(vif_data)

               features       VIF
0              children  1.713101
1          Claim_Amount  4.152224
2    past_consultations  4.670212
3  Hospital_expenditure  4.320876
4          Anual_Salary  4.279483


In [None]:
df.columns

Index(['sex', 'children', 'smoker', 'Claim_Amount', 'past_consultations',
       'Hospital_expenditure', 'Anual_Salary', 'region', 'charges'],
      dtype='object')

In [None]:
# Define X and y

X = df.drop(columns=['sex', 'smoker', 'region', 'charges'])
y = df['charges']

In [None]:
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [None]:
# Create a pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error

In [None]:
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [None]:
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ]
)

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)


In [None]:
pipeline.fit(X_train, y_train)


In [None]:
y_pred = pipeline.predict(X_test)


In [None]:
# Metrics
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Absolute Percentage Error:", mean_absolute_percentage_error(y_test, y_pred))


R2 Score: 0.8436003409636821
Mean Absolute Percentage Error: 0.281574393932055
