In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV




### PROBLEM STATEMENT

Predict medical insurance charges using supervised machine learning,
treating the task as a linear regression problem.

### DATA GATHERING 

In [2]:
# Load the raw dataset. The path is relative, so the CSV has to be in the
# kernel's working directory; otherwise read_csv raises FileNotFoundError.
df = pd.read_csv("medical_insurance.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: 'medical_insurance.csv'

### EXPLORATORY DATA ANALYSIS

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of rows in the loaded frame.
row_count = df.shape[0]
row_count

In [None]:
# Number of columns in the loaded frame.
columns_count = df.shape[1]
columns_count

In [None]:
df.describe()        # summary statistics of the numeric columns

In [None]:
# Column-by-column inspection: distinct values first, then how often each occurs.
df["age"].unique()

In [None]:
df["age"].value_counts()

In [None]:
df["bmi"].unique()

In [None]:
df["bmi"].value_counts()

In [None]:
df["children"].unique()

In [None]:
df["children"].value_counts()

In [None]:
df["smoker"].unique()

In [None]:
df["smoker"].value_counts()

In [None]:
df["region"].unique()

In [None]:
df["region"].value_counts()

In [None]:
df["charges"].unique()

In [None]:
df["charges"].value_counts()

In [None]:
df.isna().sum()     # missing values per column (expected to be zero everywhere)

#### Converting categorical data into numerical format using label encoding and one-hot encoding

In [None]:
# Label-encode "sex". The result is assigned back to the frame instead of
# calling replace(..., inplace=True) on the column selection: that selection
# is a temporary object, so under pandas copy-on-write the in-place form
# leaves `df` itself unchanged.
df["sex"] = df["sex"].replace({"male": 0, "female": 1})

In [None]:
# Encoding table for the "sex" column; kept as a name so it can be reused
# when a single user input is encoded later in the notebook.
sex_values = dict(male=0, female=1)
sex_values

In [None]:
# Label-encode "smoker". Assign the result back rather than using
# replace(..., inplace=True) on the column selection, which operates on a
# temporary object and does not update `df` under pandas copy-on-write.
df["smoker"] = df["smoker"].replace({"no": 0, "yes": 1})

In [None]:
# Encoding table for the "smoker" column; kept as a name so it can be reused
# when a single user input is encoded later in the notebook.
smoker_values = dict(no=0, yes=1)
smoker_values

In [None]:
# One-hot encode "region". dtype=int keeps the indicator columns numeric:
# pandas >= 2.0 would otherwise create bool columns, which turn
# `DataFrame.to_numpy()` into an object array in the later VIF computation.
df = pd.get_dummies(df, columns=["region"], dtype=int)

In [None]:
df

In [None]:
df.info()       # dtypes after converting categorical data into numerical format

### Detect outliers

In [None]:
# Box plot of every numeric column on one wide axes, to spot outliers.
fig, ax = plt.subplots(figsize=(30, 10))
df.boxplot(ax=ax)

### FEATURE ENGINEERING

#### Detect and imputing outliers

In [None]:
# Quartiles of "bmi" in a single call; the Series unpacks in the order requested.
q1, q2, q3 = df["bmi"].quantile([0.25, 0.50, 0.75])

# Tukey fences: anything more than 1.5 * IQR outside the quartiles is an outlier.
iqr = q3 - q1
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

for label, value in (
    ("q1", q1),
    ("q2", q2),
    ("q3", q3),
    ("upper_tail", upper_tail),
    ("lower_tail", lower_tail),
):
    print(label, value)

In [None]:
# "bmi" values lying outside the IQR fences.
is_outlier = (df["bmi"] < lower_tail) | (df["bmi"] > upper_tail)
df.loc[is_outlier, "bmi"]

In [None]:
# Median of the "bmi" values inside the fences (both bounds inclusive);
# this is the value used to replace the outliers.
within_fences = df["bmi"].between(lower_tail, upper_tail)
median_bmi = df.loc[within_fences, "bmi"].median()
median_bmi

In [None]:
# Suppress every warning from here on (no category or module filter is given).
# NOTE(review): this also hides pandas chained-assignment warnings — confirm
# that a blanket filter is really intended.
warnings.filterwarnings ("ignore")

In [None]:
# Replace "bmi" outliers with the in-range median. A single df.loc[rows, col]
# assignment writes into `df` itself; the previous chained form
# (df["bmi"].loc[...] = ...) assigns into a temporary selection and does not
# update `df` under pandas copy-on-write.
bmi_outlier_mask = (df["bmi"] > upper_tail) | (df["bmi"] < lower_tail)
df.loc[bmi_outlier_mask, "bmi"] = median_bmi

In [None]:
# Re-check after imputation: this selection should now be empty.
still_outlier = (df["bmi"] < lower_tail) | (df["bmi"] > upper_tail)
df.loc[still_outlier, "bmi"]

### FEATURE SELECTION

### Linearity

In [None]:
# Split predictors from the target, then rebuild a frame with "charges" as
# the last column so its correlations end up in the last row of corr().
df1 = df.drop(columns="charges")
df2 = df["charges"]
df3 = df1.assign(charges=df2)
df3

In [None]:
df3.corr()

In [None]:
# Heatmap of the last row of the correlation matrix only, i.e. each column's
# correlation with "charges" (the last column of df3).
fig, ax = plt.subplots(figsize=(30, 2))
target_corr = df3.corr().tail(1)
sns.heatmap(target_corr, annot=True, ax=ax)

### No Multicollinearity

In [None]:
df1

In [None]:
# Variance inflation factor of every predictor, used to check multicollinearity.
# The design matrix is converted once, outside the loop, and cast to float so
# integer/boolean indicator columns cannot produce an object array.
# NOTE(review): no intercept column is added and all one-hot "region" columns
# are kept — confirm this is the intended VIF setup.
feature_matrix = df1.to_numpy(dtype=float)

Vif_list = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]

# Dedicated name: the previous code reused `df2`, silently replacing the
# "charges" target series created in the linearity section.
vif_by_feature = pd.Series(Vif_list, index=df1.columns)

vif_by_feature.plot(kind="barh")
    
    

### MODEL BUILDING

In [None]:
# Target is "charges"; every other column is a predictor.
y = df["charges"]
x = df.drop(columns=["charges"])

In [None]:
# 80 % of the rows go to training; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=11,
)

In [None]:
model = LinearRegression()     # create an instance of the model
model

In [None]:
model.fit(x_train , y_train)    # fit the model on the training split only

In [None]:
model.coef_           # fitted slope coefficients, one per feature column

### MODEL EVALUATION

#### Training dataset

In [None]:
# Predictions of the linear model on the training split.
y_pred = model.predict(x_train)
y_pred

In [None]:
# Training residuals: actual charges minus predicted charges.
residual = y_train - y_pred 
residual

In [None]:
sns.kdeplot(residual)           # density plot of the training residuals

In [None]:
# Error metrics and goodness of fit of the linear model on the training split.
n_obs, n_features = x_train.shape

MSE = mean_squared_error(y_train, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_train, y_pred)
R2_training = model.score(x_train, y_train)

# Adjusted R2 penalises R2 for the number of predictors in the model.
adj_r2_training = 1 - (((1 - R2_training) * (n_obs - 1)) / (n_obs - n_features - 1))

training_report = (
    ("MSE on training dataset --->", MSE),
    ("RMSE on training dataset --->", RMSE),
    ("MAE on training dataset --->", MAE),
    ("R2 on training dataset --->", R2_training),
    ("Adjusted R Squared Value --->", adj_r2_training),
)
for label, value in training_report:
    print(label, value)

#### Testing dataset

In [None]:
# Predictions of the linear model on the held-out test split.
y_pred_test = model.predict(x_test)
y_pred_test

In [None]:
# Test residuals: actual charges minus predicted charges.
test_residual = y_test - y_pred_test
test_residual

In [None]:
sns.kdeplot(test_residual)           # density plot of the test residuals

In [None]:
# Error metrics and goodness of fit of the linear model on the held-out test split.
MSE = mean_squared_error(y_test, y_pred_test)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred_test)
R2_testing = model.score(x_test, y_test)

# Adjusted R2 penalises R2 for the number of predictors in the model.
n_obs, n_features = x_test.shape
adj_r2_testing = 1 - (((1 - R2_testing) * (n_obs - 1)) / (n_obs - n_features - 1))

# The labels previously said "training dataset" although every value here is
# computed on the test split.
print("MSE on testing dataset --->", MSE)
print("RMSE on testing dataset --->", RMSE)
print("MAE on testing dataset --->", MAE)
print("R2 on testing dataset --->", R2_testing)
print("Adjusted R Squared Value :", adj_r2_testing)



In [None]:
# The values reported as "accuracy" are the R2 scores returned by model.score.
print("Accuracy on training dataset", R2_training )
print("Accuracy on testing dataset", R2_testing)

## Normality of residual

#### Hypothesis testing

In [None]:
from scipy.stats import shapiro

#### shapiro test

In [None]:
# Shapiro-Wilk normality test on the training residuals; element 1 of the
# result is the p-value, compared against a 0.05 significance level.
p_val = shapiro(residual)[1]
print("P_Value:",p_val)

accepted_lines = (
    "Null Hypothesis is Accepted",
    "Data is Normally Distributed",
)
rejected_lines = (
    "Null Hypothesis is Rejected and Alternate Hypothesis is Accepted",
    "Data is not Normally Distributed",
)
for line in (accepted_lines if p_val >= 0.05 else rejected_lines):
    print(line)

#### QQ plot

In [None]:
import statsmodels.api as sm

In [None]:
sm.qqplot(residual, line = "45", fit = True)

#### homoscedasticity

In [None]:
# Homoscedasticity check: residuals against the FITTED values. Plotting them
# against the actual target (as before) always shows a trend, because the
# residual is by construction actual minus fitted, so it cannot reveal
# whether the error variance is constant.
ax = sns.scatterplot(x=y_pred, y=residual.to_numpy())
ax.set(xlabel="Fitted charges (training)", ylabel="Residual")
ax.axhline(0, linestyle="--", linewidth=1)   # reference line at zero error

### Regularization

### reduce overfitting

### Ridge regression

In [None]:
#by default value of alpha

In [None]:
# Ridge regression with the regularisation strength spelled out explicitly.
ridge_reg_model = Ridge(alpha=1.0)
ridge_reg_model.fit(X=x_train, y=y_train)

### Training Dataset

In [None]:
# Ridge (alpha=1.0) evaluated on the training split.
y_pred_train_ridge = ridge_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_ridge)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_ridge)
ridge_r2_train = r2_score(y_train, y_pred_train_ridge)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", ridge_r2_train),
):
    print(label, value)

### Testing Dataset

In [None]:
# Ridge (alpha=1.0) evaluated on the held-out test split.
# The predictions get their own name: the previous code stored them in
# `y_pred_test`, overwriting the plain linear model's test predictions that
# the earlier residual cells depend on.
y_pred_test_ridge = ridge_reg_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_test_ridge)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :", rmse)

mae = mean_absolute_error(y_test, y_pred_test_ridge)
print("MAE :", mae)

ridge_r2_test = r2_score(y_test, y_pred_test_ridge)
print("R2 Scored :", ridge_r2_test)

## lasso regression

In [None]:
# Lasso regression with the regularisation strength spelled out explicitly.
lasso_reg_model = Lasso(alpha=1.0)
lasso_reg_model.fit(X=x_train, y=y_train)

### Training Dataset

In [None]:
# Lasso (alpha=1.0) evaluated on the training split.
y_pred_train_lasso = lasso_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_lasso)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_lasso)
lasso_r2_tarin = r2_score(y_train, y_pred_train_lasso)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", lasso_r2_tarin),
):
    print(label, value)

### Testing Dataset

In [None]:
# Lasso (alpha=1.0) evaluated on the held-out test split.
y_pred_test_lasso = lasso_reg_model.predict(x_test)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_test, y_pred_test_lasso)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_test_lasso)
lasso_r2_test = r2_score(y_test, y_pred_test_lasso)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", lasso_r2_test),
):
    print(label, value)

## Hyperparameter Tuning

### GridSearch cv

In [None]:
# Exhaustive search over alpha from 0.01 up to (but excluding) 3 in steps of
# 0.01 for Ridge, with GridSearchCV's default cross-validation and scoring.
ridge_model = Ridge()
param_grid = dict(alpha=np.arange(0.01, 3, 0.01))

g_s_cv_ridge_model = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    n_jobs=-1,
)
g_s_cv_ridge_model.fit(x_train, y_train)

In [None]:
g_s_cv_ridge_model.best_estimator_

In [None]:
# Refit Ridge with the alpha selected by the grid search, read from the
# search object instead of a number copied by hand from the previous output
# (which goes stale whenever the data or the split changes).
best_ridge_alpha = g_s_cv_ridge_model.best_params_["alpha"]
ridge_reg_model = Ridge(alpha=best_ridge_alpha)
ridge_reg_model.fit(x_train, y_train)

### Training Dataset

#### gridsearch cv

#### ridge regression

In [None]:
# Grid-search-tuned Ridge evaluated on the training split.
y_pred_train_gs_cv = ridge_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_gs_cv)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_gs_cv)
ridge_gscv_train_r2 = r2_score(y_train, y_pred_train_gs_cv)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", ridge_gscv_train_r2),
):
    print(label, value)

### Testing Dataset

In [None]:
# Grid-search-tuned Ridge evaluated on the held-out test split.
y_pred_test_gs_cv = ridge_reg_model.predict(x_test)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_test, y_pred_test_gs_cv)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_test_gs_cv)
ridge_gscv_test_r2 = r2_score(y_test, y_pred_test_gs_cv)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", ridge_gscv_test_r2),
):
    print(label, value)

#### lasso regression

In [None]:
# Exhaustive search over alpha from 0.01 up to (but excluding) 3 in steps of
# 0.01 for Lasso, with GridSearchCV's default cross-validation and scoring.
lasso_model = Lasso()
param_grid = dict(alpha=np.arange(0.01, 3, 0.01))

lasso_model_gs_cv = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid,
    n_jobs=-1,
)
lasso_model_gs_cv.fit(x_train, y_train)

In [None]:
lasso_model_gs_cv.best_estimator_

In [None]:
# Refit Lasso with the alpha selected by the grid search, read from the
# search object instead of a float literal pasted from the previous output
# (which goes stale whenever the data or the split changes).
best_lasso_alpha = lasso_model_gs_cv.best_params_["alpha"]
lasso_reg_model = Lasso(alpha=best_lasso_alpha)
lasso_reg_model.fit(x_train, y_train)

### Training Dataset

In [None]:
# Grid-search-tuned Lasso evaluated on the training split.
y_pred_train_lasso_gscv = lasso_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_lasso_gscv)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_lasso_gscv)
gscv_lasso__train_r2 = r2_score(y_train, y_pred_train_lasso_gscv)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", gscv_lasso__train_r2),
):
    print(label, value)

### Testing Dataset

In [None]:
# Grid-search-tuned Lasso evaluated on the held-out test split.
# The predictions get their own name (mirroring y_pred_train_lasso_gscv):
# the previous code reused `y_pred_test_lasso`, overwriting the default-alpha
# Lasso predictions computed earlier.
y_pred_test_lasso_gscv = lasso_reg_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_test_lasso_gscv)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :", rmse)

mae = mean_absolute_error(y_test, y_pred_test_lasso_gscv)
print("MAE :", mae)

lasso_r2_gscv_test = r2_score(y_test, y_pred_test_lasso_gscv)
print("R2 Scored :", lasso_r2_gscv_test)

### Randomized search CV

#### ridge regression

In [None]:
# Randomized search over the same alpha range for Ridge. It samples only a
# subset of the candidates, so random_state is fixed (same seed as the
# train/test split) to make the selected alpha reproducible across runs.
ridge_model = Ridge()

param_grid = {"alpha": np.arange(0.01, 3, 0.01)}

ridge_model_rscv = RandomizedSearchCV(
    ridge_model,
    param_grid,
    n_jobs=-1,
    random_state=11,
)

ridge_model_rscv.fit(x_train, y_train)


In [None]:
ridge_model_rscv.best_estimator_ 

In [None]:
# Refit Ridge with the alpha selected by the randomized search, read from
# the search object: a hand-copied value cannot track a search whose sampled
# candidates may differ from run to run.
best_ridge_alpha_rscv = ridge_model_rscv.best_params_["alpha"]
ridge_reg_model = Ridge(alpha=best_ridge_alpha_rscv)
ridge_reg_model.fit(x_train, y_train)

### Training Dataset

In [None]:
# Randomized-search-tuned Ridge evaluated on the training split.
y_pred_train_randm = ridge_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_randm)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_randm)
ridge_r2_randm_train = r2_score(y_train, y_pred_train_randm)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", ridge_r2_randm_train),
):
    print(label, value)

### Testing Dataset

In [None]:
# Randomized-search-tuned Ridge evaluated on the held-out test split.
y_pred_test_randm = ridge_reg_model.predict(x_test)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_test, y_pred_test_randm)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_test_randm)
ridge_r2_randm_test = r2_score(y_test, y_pred_test_randm)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", ridge_r2_randm_test),
):
    print(label, value)

### lasso regression

In [None]:
# Randomized search over the same alpha range for Lasso. It samples only a
# subset of the candidates, so random_state is fixed (same seed as the
# train/test split) to make the selected alpha reproducible across runs.
lasso_model = Lasso()

param_grid = {"alpha": np.arange(0.01, 3, 0.01)}

lasso_model_rscv = RandomizedSearchCV(
    lasso_model,
    param_grid,
    n_jobs=-1,
    random_state=11,
)

lasso_model_rscv.fit(x_train, y_train)


In [None]:
lasso_model_rscv.best_estimator_

In [None]:
# Refit Lasso with the alpha selected by the randomized search, read from
# the search object: a pasted float literal cannot track a search whose
# sampled candidates may differ from run to run.
best_lasso_alpha_rscv = lasso_model_rscv.best_params_["alpha"]
lasso_reg_model = Lasso(alpha=best_lasso_alpha_rscv)
lasso_reg_model.fit(x_train, y_train)

### Training Dataset

In [None]:
# Randomized-search-tuned Lasso evaluated on the training split.
y_pred_train_lasso_randm = lasso_reg_model.predict(x_train)

# Compute every metric first, then report them in the usual order.
mse = mean_squared_error(y_train, y_pred_train_lasso_randm)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train_lasso_randm)
lasso_r2_randm_train = r2_score(y_train, y_pred_train_lasso_randm)

for label, value in (
    ("MSE :", mse),
    ("RMSE :", rmse),
    ("MAE :", mae),
    ("R2 Scored :", lasso_r2_randm_train),
):
    print(label, value)

### Testing Dataset

In [None]:
# Randomized-search-tuned Lasso evaluated on the held-out test split.
# The predictions get their own name (mirroring y_pred_train_lasso_randm):
# the previous code reused `y_pred_test_randm`, overwriting the Ridge
# predictions stored under that name.
y_pred_test_lasso_randm = lasso_reg_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_test_lasso_randm)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :", rmse)

mae = mean_absolute_error(y_test, y_pred_test_lasso_randm)
print("MAE :", mae)

lasso_r2_randm_test = r2_score(y_test, y_pred_test_lasso_randm)
print("R2 Scored :", lasso_r2_randm_test)

## ACCURACY

In [None]:
# Side-by-side summary of the R2 scores collected above. Within each pair of
# "R2 Scored" lines the first value is the training score and the second the
# test score.
separator = "*" * 50

# Plain linear regression.
print("Accuracy on training dataset", R2_training)
print("Accuracy on testing dataset", R2_testing)
print(separator)

# Default alpha.
print("After taking by default value of alpha \n Accuracy is")
print("Ridge Regression")
print("R2 Scored :", ridge_r2_train)
print("R2 Scored :", ridge_r2_test)
print(separator)
print("lasso regression")
print("R2 Scored :", lasso_r2_tarin)
print("R2 Scored :", lasso_r2_test)
print(separator)

# Alpha tuned with GridSearchCV.
print("after hyperparameter tunning")
print("By using gridsearch cv")
print("accuracy")
print("ridge regression")
print("R2 Scored :", ridge_gscv_train_r2)
print("R2 Scored :", ridge_gscv_test_r2)
print(separator)
print("lasso regression")
print("R2 Scored :", gscv_lasso__train_r2)
print("R2 Scored :", lasso_r2_gscv_test)
print(separator)

# Alpha tuned with RandomizedSearchCV. This heading previously repeated
# "By using gridsearch cv", mislabelling the scores printed below it.
print("By using randomized search cv")
print("accuracy")
print("ridge regression")
print("R2 Scored :", ridge_r2_randm_train)
print("R2 Scored :", ridge_r2_randm_test)
print(separator)
print("lasso regression")
print("R2 Scored :", lasso_r2_randm_train)
print("R2 Scored :", lasso_r2_randm_test)

### single user input testing

In [None]:
x.head(1).T

In [None]:
# age = 19.000
# sex = "male"
# bmi = 27.900
# children = 0.000
# smoker = "no"
# charges = 16884.924
# region = "southest "

In [None]:
import pickle

In [None]:
# Persist the fitted LinearRegression object. The previous code passed the
# string literal "model" to pickle.dump, so the file contained only that
# five-character string and no estimator.
with open("linear_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
import json

In [None]:
# Everything needed to encode a raw user input the same way the training
# data was encoded: the two label maps and the feature column order.
# The columns are read straight from `x`; the previous code used
# `column_names`, which is only defined in a later cell, so this cell raised
# NameError on a fresh Restart & Run All.
json_data = {"sex": sex_values,
            "smoker": smoker_values,
            "columns": list(x.columns)}
json_data

In [None]:
# Serialise the encoding metadata to disk next to the pickled model.
with open("json_file.json", "w") as f:
    f.write(json.dumps(json_data))

In [None]:
# region1 = "region_" + region
# region1

In [None]:
# Feature columns of `x`, in the order used to build the training data.
column_names = x.columns
column_names

In [None]:
# Raw, human-readable inputs for one prospective customer.
age, sex, bmi, children, smoker = 19.0, "female", 27.9, 0.0, "no"
region = "southeast"

# The one-hot columns are named "region_<name>", so build that column name
# and find its position among the feature columns.
region = "region_" + region
region_index = np.flatnonzero(column_names == region)[0]
region_index

In [None]:
list(column_names).index(region)

In [None]:
# Feature vector for the single input, starting from all zeros so that the
# region indicators that are not selected stay 0.
# NOTE(review): assumes the first five feature columns are
# age, sex, bmi, children, smoker in that order — confirm against column_names.
array = np.zeros(len(column_names))
array[:5] = [
    age,
    json_data['sex'][sex],
    bmi,
    children,
    json_data['smoker'][smoker],
]
array[region_index] = 1

array

In [None]:
# Predict for the single row (reshaped to one sample by n features) and
# round the result to two decimals for display.
single_row = array.reshape(1, -1)
predicted_charge = model.predict(single_row)[0]
charges = round(predicted_charge, 2)
print("Predicted Medical Insurance Charges is :", charges, "/- Rs. Only")