In [62]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

import pickle
import json

import warnings
warnings.filterwarnings('ignore')

# Problem Statement

In [None]:
# Goal: predict medical insurance charges using the K-Nearest Neighbors (KNN) regression algorithm.

# DATA Gathering

In [2]:
# Load the raw insurance dataset from the CSV file in the working directory.
df  = pd.read_csv('medical_insurance.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# EDA 

In [3]:
# Column dtypes and non-null counts. The object-dtype columns must be converted
# to numeric (int/float) values before they can be fed to the model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
# Number of rows per distinct age, as a plain dict.
df['age'].value_counts().to_dict()

{18: 69,
 19: 68,
 50: 29,
 51: 29,
 47: 29,
 46: 29,
 45: 29,
 20: 29,
 48: 29,
 52: 29,
 22: 28,
 49: 28,
 54: 28,
 53: 28,
 21: 28,
 26: 28,
 24: 28,
 25: 28,
 28: 28,
 27: 28,
 23: 28,
 43: 27,
 29: 27,
 30: 27,
 41: 27,
 42: 27,
 44: 27,
 31: 27,
 40: 27,
 32: 26,
 33: 26,
 56: 26,
 34: 26,
 55: 26,
 57: 26,
 37: 25,
 59: 25,
 58: 25,
 36: 25,
 38: 25,
 35: 25,
 39: 25,
 61: 23,
 60: 23,
 63: 23,
 62: 23,
 64: 22}

In [8]:
# Number of rows per sex category; the keys are the labels that get encoded later.
df['sex'].value_counts().to_dict()

{'male': 676, 'female': 662}

In [10]:
# Number of rows per children count.
df['children'].value_counts().to_dict()

{0: 574, 1: 324, 2: 240, 3: 157, 4: 25, 5: 18}

In [12]:
# Number of rows per smoker category; the keys are the labels that get encoded later.
df['smoker'].value_counts().to_dict()

{'no': 1064, 'yes': 274}

In [13]:
# Number of rows per region; each region becomes its own indicator column later.
df['region'].value_counts().to_dict()

{'southeast': 364, 'southwest': 325, 'northwest': 325, 'northeast': 324}

# Feature Engineering

In [15]:
# Encode sex as a binary integer feature (male -> 1, female -> 0).
# The result is assigned back rather than using replace(..., inplace=True) on
# the df['sex'] selection: that chained in-place call mutates an intermediate
# object and does not update df under pandas Copy-on-Write. This also matches
# how the smoker column is encoded.
df['sex'] = df['sex'].replace({'male': 1, 'female': 0})

In [19]:
# Encode smoker as a binary integer feature (no -> 0, yes -> 1).
smoker_codes = {'no': 0, 'yes': 1}
df['smoker'] = df['smoker'].replace(smoker_codes)

In [21]:
# One-hot encode region: the column is replaced by one region_<name> indicator
# column per category.
# NOTE(review): df is overwritten here, so re-running this cell without
# reloading the data will fail because the region column no longer exists.
df = pd.get_dummies(df,columns=['region'])

In [22]:
# Inspect the frame after encoding.
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [23]:
# Confirm that every column is numeric after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


# Preprocessing

In [None]:
# Two feature-scaling approaches are compared:
#   1) Normalization (min-max scaling)
#   2) Standardization (z-score scaling)

In [None]:
# 1) First model: built on features scaled with the normalization (min-max) method

In [26]:
# Separate the target (charges) from the predictor columns.
y = df['charges']
xdf = df.drop(columns='charges')
xdf

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


In [28]:
# Min-max normalization of every predictor column.
# NOTE(review): the scaler is fitted on all rows of xdf, i.e. before the
# train/test split, so test rows contribute to the min/max statistics.
# Consider fitting on the training split only.
normal_scalar = MinMaxScaler().fit(xdf)
array = normal_scalar.transform(xdf)

In [29]:
# Wrap the scaled array in a DataFrame so the original column names are kept.
Normal_df = pd.DataFrame(array, columns = xdf.columns)

Normal_df

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.0,0.321227,0.0,1.0,0.0,0.0,0.0,1.0
1,0.000000,1.0,0.479150,0.2,0.0,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,0.0,0.0,1.0,0.0
3,0.326087,1.0,0.181464,0.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,1.0,0.347592,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,0.695652,1.0,0.403820,0.6,0.0,0.0,1.0,0.0,0.0
1334,0.000000,0.0,0.429379,0.0,0.0,1.0,0.0,0.0,0.0
1335,0.000000,0.0,0.562012,0.0,0.0,0.0,0.0,1.0,0.0
1336,0.065217,0.0,0.264730,0.0,0.0,0.0,0.0,0.0,1.0


# Train test Split and model building

In [30]:
# 80/20 train/test split of the normalized features; the seed is fixed so the
# split is reproducible.
x = Normal_df
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

In [31]:
# Inspect the training features (row labels are the shuffled original indices).
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
1075,0.304348,0.0,0.366694,0.2,0.0,0.0,0.0,1.0,0.0
761,0.108696,1.0,0.517622,0.2,0.0,0.0,0.0,0.0,1.0
6,0.608696,0.0,0.470272,0.2,0.0,0.0,0.0,1.0,0.0
1171,0.543478,0.0,0.288943,0.4,1.0,0.0,0.0,0.0,1.0
127,0.739130,0.0,0.576809,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
580,0.891304,1.0,0.255582,0.2,0.0,1.0,0.0,0.0,0.0
163,0.304348,0.0,0.372343,0.4,0.0,0.0,0.0,0.0,1.0
607,0.891304,0.0,0.207022,0.0,1.0,0.0,1.0,0.0,0.0
414,0.021739,0.0,0.516277,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
# Inspect the held-out test features.
x_test

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
910,0.086957,1.0,0.332257,0.2,0.0,0.0,1.0,0.0,0.0
823,0.565217,0.0,0.372612,0.4,0.0,0.0,0.0,1.0,0.0
677,0.913043,1.0,0.414044,0.6,1.0,0.0,1.0,0.0,0.0
559,0.021739,1.0,0.526500,0.0,0.0,0.0,1.0,0.0,0.0
1169,0.413043,0.0,0.488162,0.2,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1080,0.000000,1.0,0.156578,0.4,0.0,0.0,0.0,1.0,0.0
699,0.108696,0.0,0.627119,0.4,0.0,0.0,0.0,1.0,0.0
1127,0.369565,0.0,0.535378,0.4,0.0,0.0,0.0,1.0,0.0
1032,0.260870,0.0,0.322034,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Baseline KNN regressor with scikit-learn's default hyperparameters, fitted on
# the normalized training split.
KNN_rg = KNeighborsRegressor()
KNN_rg.fit(x_train,y_train)

# Evaluation

In [35]:
#Testing data evaluation

# Predict charges for the test split and show a small slice of the predictions
# to compare against the true values in the next cell.
y_pred = KNN_rg.predict(x_test)
y_pred[40:45]

array([34368.557  ,  2746.2515 , 16744.01162,  5874.76118,  2320.2006 ])

In [36]:
# True charges at the same positions as the predictions shown above.
y_test[40:45]

1218    41661.60200
882      2585.85065
686      7729.64575
1153     5630.45785
1331    10795.93733
Name: charges, dtype: float64

In [37]:
# Test-split metrics for the baseline KNN on normalized features.
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 38860149.76414058
RMSE is :
 6233.790962499511
MAE is :
 3859.226989962687
r2_value is :
 0.6997306611844084


In [39]:
# Training-split metrics for the baseline KNN on normalized features
# (compare with the test metrics to judge overfitting).
y_pred_train = KNN_rg.predict(x_train)

MSE = mean_squared_error(y_train, y_pred_train)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_train, y_pred_train)
r2_value = r2_score(y_train, y_pred_train)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 21414995.483676966
RMSE is :
 4627.633896893419
MAE is :
 2750.2373465723367
r2_value is :
 0.8580126201461273


# Now let's find the best value of K - Hyperparameter Tuning

In [43]:
# Exhaustive grid search with 5-fold cross-validation on the training split:
# n_neighbors from 2 to 19 and Minkowski power p (1 = Manhattan, 2 = Euclidean).
hyperparameter = {
    'n_neighbors': np.arange(2, 20),
    'p': [1, 2],
}
KNN_model = KNeighborsRegressor()
GSCV_KNN_Model = GridSearchCV(estimator=KNN_model, param_grid=hyperparameter, cv=5)
GSCV_KNN_Model.fit(x_train, y_train)

GSCV_KNN_Model.best_estimator_

In [46]:
# Training-split metrics for the best estimator found by the grid search
# (normalized features).
KNN_reg = GSCV_KNN_Model.best_estimator_
y_pred_train = KNN_reg.predict(x_train)

MSE = mean_squared_error(y_train, y_pred_train)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_train, y_pred_train)
r2_value = r2_score(y_train, y_pred_train)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 23447875.572582867
RMSE is :
 4842.300648718837
MAE is :
 2963.9190391042052
r2_value is :
 0.8445340594057869


In [48]:
# Test-split metrics for the best estimator found by the grid search
# (normalized features).
KNN_reg = GSCV_KNN_Model.best_estimator_
y_pred = KNN_reg.predict(x_test)

MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 36149292.532096535
RMSE is :
 6012.428172718285
MAE is :
 3761.1470308208955
r2_value is :
 0.7206772430588937


# Now let's create a model having standardized data

In [51]:
# Standardization (zero mean, unit variance) of every predictor column.
# NOTE(review): the scaler is fitted on all rows of xdf, i.e. before the
# train/test split, so test rows contribute to the mean/std statistics.
# Consider fitting on the training split only.
std_scalar = StandardScaler().fit(xdf)
array2 = std_scalar.transform(xdf)

In [52]:
# Wrap the standardized array in a DataFrame so the original column names are kept.
std_df = pd.DataFrame(array2,columns=xdf.columns)
std_df

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587,-0.565267,-0.566418,-0.611324,1.765481
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
...,...,...,...,...,...,...,...,...,...
1333,0.768473,0.989591,0.050297,1.580926,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463,1.769076,-0.566418,-0.611324,-0.566418
1335,-1.509965,-1.010519,1.014878,-0.908614,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
1336,-1.296362,-1.010519,-0.797813,-0.908614,-0.507463,-0.565267,-0.566418,-0.611324,1.765481


# TRAIN_TEST_SPLIT and MODEL BUILDING

In [53]:
# Same 80/20 split and seed as for the normalized data, so both scaling
# approaches are evaluated on the same rows. This overwrites x and the
# x_train/x_test/y_train/y_test variables from the normalized experiment.
x = std_df
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

In [55]:
# Baseline KNN regressor with default hyperparameters, fitted on the
# standardized training split. This is the model that is pickled further down.
KNN_regressor = KNeighborsRegressor()
KNN_regressor.fit(x_train,y_train)

In [56]:
# Test-split metrics for the baseline KNN on standardized features.
y_pred = KNN_regressor.predict(x_test)

MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 31735587.779542595
RMSE is :
 5633.434811865901
MAE is :
 3473.515361544776
r2_value is :
 0.7547815945814802


In [57]:
# Training-split metrics for the baseline KNN on standardized features.
y_pred_train = KNN_regressor.predict(x_train)

MSE = mean_squared_error(y_train, y_pred_train)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_train, y_pred_train)
r2_value = r2_score(y_train, y_pred_train)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 20176580.229438942
RMSE is :
 4491.834839955599
MAE is :
 2737.2580016809343
r2_value is :
 0.8662236579329134


# Hyperparameter Tuning for standardized data

In [58]:
# Same grid search as for the normalized data (n_neighbors 2..19, p in {1, 2},
# 5-fold CV), this time on the standardized training split.
hyperparameter = {
    'n_neighbors': np.arange(2, 20),
    'p': [1, 2],
}
KNN_std_model = KNeighborsRegressor()
GSCV_KNN_std_Model = GridSearchCV(estimator=KNN_std_model, param_grid=hyperparameter, cv=5)
GSCV_KNN_std_Model.fit(x_train, y_train)

GSCV_KNN_std_Model.best_estimator_

In [59]:
# Training-split metrics for the best estimator found by the grid search
# (standardized features).
KNN_reg_std = GSCV_KNN_std_Model.best_estimator_
y_pred_train = KNN_reg_std.predict(x_train)

MSE = mean_squared_error(y_train, y_pred_train)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_train, y_pred_train)
r2_value = r2_score(y_train, y_pred_train)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 23098074.14316827
RMSE is :
 4806.045582718528
MAE is :
 2948.856426578772
r2_value is :
 0.8468533402326064


In [60]:
# Test-split metrics for the tuned KNN on standardized features.
y_pred = KNN_reg_std.predict(x_test)

MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print('MSE is :\n', MSE)
print('RMSE is :\n', RMSE)
print('MAE is :\n', MAE)
print('r2_value is :\n', r2_value)

MSE is :
 33329204.64160566
RMSE is :
 5773.145125631752
MAE is :
 3605.4652119083153
r2_value is :
 0.7424678416906301


In [None]:
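# KNN_regressor (default hyperparameters, standardized features) gives the best test-set scores of all the models above (R2 ≈ 0.755), so it is the model saved below.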

In [61]:
# Display the selected model before saving it.
KNN_regressor

In [63]:
# Serialize the selected model to disk with pickle.
with open("KNN_model.pkl", "wb") as model_file:
    pickle.dump(KNN_regressor, model_file)

In [64]:
# Display the fitted scaler before saving it; inputs must be scaled with it
# before being passed to the saved model.
std_scalar

In [65]:
# Serialize the fitted StandardScaler alongside the model.
# The file name spelling ('Std_sclar.pkl') is kept as-is because other code may
# load the file by this exact name.
with open('Std_sclar.pkl', 'wb') as scaler_file:
    pickle.dump(std_scalar, scaler_file)

# Testing on single row

In [82]:
# Raw, human-readable inputs for one prediction. The categorical values are
# encoded and the whole row is scaled in the cells below.
age= 25
sex= 'female'
bmi= 11.22
children= 5
smoker= 'yes'
region = 'northwest'


In [67]:
# Feature column order used for training; the single-row input must follow it.
column_names = x.columns
column_names

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [68]:
# Start from an all-zero feature vector with one slot per training column;
# the region indicators that are not selected stay at 0.
test_array = np.zeros(x.shape[1])
test_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [69]:
# Lookup tables mirroring the encodings applied to the training data.
label_encoded_data = {'sex':{'male':1,'female':0},
                     'smoker':{'yes':1,'no':0}
                     }

# Sanity-check both lookups; only the last expression is displayed by the notebook.
label_encoded_data['smoker'][smoker]
label_encoded_data['sex'][sex]

0

In [75]:
# Fill the five leading (non-region) slots in training column order:
# age, sex, bmi, children, smoker.
test_array[:5] = [
    age,
    label_encoded_data['sex'][sex],
    bmi,
    children,
    label_encoded_data['smoker'][smoker],
]

In [83]:
# Build the one-hot column name for the chosen region (e.g. 'region_northwest').
# The prefix is only added when missing, so re-running this cell does not
# produce 'region_region_...' and break the column lookup in the next cell.
if not region.startswith('region_'):
    region = 'region_' + region
region

'region_northwest'

In [84]:
# Position of the region indicator column within the feature columns.
# np.where returns a tuple of index arrays; [0][0] takes the first match and
# raises IndexError if the region name matches no column.
region_index = np.where(column_names == region)[0][0]
region_index

6

In [85]:
# Switch on the indicator of the selected region; the other region slots stay 0.
test_array[region_index] = 1

In [86]:
# Final unscaled feature vector for the single row.
test_array

array([25.  ,  0.  , 11.22,  5.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ])

In [94]:
# Scale the single row with the same fitted scaler as the training data.
# transform() expects 2-D input, so the vector is reshaped to one row.
single_row = test_array.reshape(1, -1)
scaled_test_array = std_scalar.transform(single_row)
scaled_test_array

array([[-1.01155712, -1.0105187 , -3.18958193,  3.24061871,  1.97058663,
        -0.56526686,  1.76548098, -0.61132367, -0.56641788]])

In [93]:
# Predicted charges for the single row, rounded to 2 decimals.
predicted_charges = KNN_regressor.predict(scaled_test_array)[0]
np.around(predicted_charges, 2)

14091.11

In [88]:
# Metadata needed to rebuild a model input at inference time: the categorical
# encodings and the feature column order.
project_data = dict(
    sex={'male': 1, 'female': 0},
    smoker={'yes': 1, 'no': 0},
    columns=x.columns.tolist(),
)

In [89]:
# Persist the encoding/column metadata as JSON next to the pickled model.
# json is already imported in the notebook's import cell, so the duplicate
# mid-notebook import has been dropped.
with open("Project_data.json",'w') as f:
    json.dump(project_data,f)