In [1]:
import pandas as pd 
import numpy as np 
import os

In [2]:
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 0
    elif 18.5 <= bmi < 24.9:
        return 1
    elif 25 <= bmi < 29.9:
        return 2
    else:
        return 3

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
cat_data = df.select_dtypes(include='O')
cat_data.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [6]:
num_data = df.select_dtypes(exclude="O")
num_data.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [7]:
cat_data.columns = ['gender','smoking','region']

In [8]:
cat_data['gender'].value_counts()

gender
male      676
female    662
Name: count, dtype: int64

In [9]:
cat_data['smoking'].value_counts()

smoking
no     1064
yes     274
Name: count, dtype: int64

In [10]:
cat_data['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
cat_data['gender'] = cat_data['gender'].map({'male':1,'female':0})
cat_data['smoking'] = cat_data['smoking'].map({'no':0,'yes':1})


In [13]:
cat_data = cat_data.drop('region',axis=1)

In [14]:
cat_data

Unnamed: 0,gender,smoking
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
1333,1,0
1334,0,0
1335,0,0
1336,0,0


In [15]:
num_data

Unnamed: 0,age,bmi,children,charges
0,19,27.900,0,16884.92400
1,18,33.770,1,1725.55230
2,28,33.000,3,4449.46200
3,33,22.705,0,21984.47061
4,32,28.880,0,3866.85520
...,...,...,...,...
1333,50,30.970,3,10600.54830
1334,18,31.920,0,2205.98080
1335,18,36.850,0,1629.83350
1336,21,25.800,0,2007.94500


In [16]:
num_data['bmi'] = num_data['bmi'].apply(categorize_bmi)

In [17]:
num_data['charges'] = num_data['charges'].astype(int)

In [18]:
complete_df = pd.concat([cat_data,num_data],axis=1)
complete_df.head()

Unnamed: 0,gender,smoking,age,bmi,children,charges
0,0,1,19,2,0,16884
1,1,0,18,3,1,1725
2,1,0,28,3,3,4449
3,1,0,33,1,0,21984
4,1,0,32,2,0,3866


In [19]:
x = complete_df.drop('charges',axis=1)
x

Unnamed: 0,gender,smoking,age,bmi,children
0,0,1,19,2,0
1,1,0,18,3,1
2,1,0,28,3,3
3,1,0,33,1,0
4,1,0,32,2,0
...,...,...,...,...,...
1333,1,0,50,3,3
1334,0,0,18,3,0
1335,0,0,18,3,0
1336,0,0,21,2,0


In [20]:
y = complete_df[['charges']]
y

Unnamed: 0,charges
0,16884
1,1725
2,4449
3,21984
4,3866
...,...
1333,10600
1334,2205
1335,1629
1336,2007


In [21]:
from sklearn.model_selection import  train_test_split

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
rdf = RandomForestRegressor()

In [25]:
rdf.fit(x_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [26]:
rdf.score(x_train,y_train)

0.9442122766387708

In [27]:
rdf.score(x_test,y_test)


0.827421921716156

In [28]:
pred = rdf.predict(x_test)

In [29]:
compare = y_test
compare['prediction'] = pred

In [30]:
y_test=y_test.drop('prediction',axis=1)

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
st = StandardScaler()

In [33]:
x_train_scaled = st.fit_transform(x_train)
x_test_scaled = st.fit_transform(x_test)


In [34]:
rdf.fit(x_train_scaled,y_train)

  return fit_method(estimator, *args, **kwargs)


In [35]:
rdf.score(x_train_scaled,y_train)

0.9438572547424117

In [36]:
rdf.score(x_test_scaled,y_test['charges'])

0.8278938331348743

In [37]:
compare.iloc[0:10]

Unnamed: 0,charges,prediction
764,9095,9303.943
887,5272,5113.047851
890,29330,28532.64
1293,9301,9656.411667
259,33750,35378.938119
1312,4536,11878.353833
899,2117,1749.68669
752,14210,14152.085333
1286,3732,5893.464667
707,10264,14368.898


## KNN ALGORITHM

In [38]:
from sklearn.neighbors import KNeighborsRegressor

In [39]:
knn=KNeighborsRegressor()

In [40]:
knn.fit(x_train,y_train)

In [41]:
knn.score(x_train,y_train)

0.6467363893864511

In [42]:
round(knn.score(x_train,y_train)*100)

65

In [43]:
knn.score(x_test,y_test)

0.46483202973872817

In [44]:
round(knn.score(x_test,y_test)*100)

46

In [45]:
import joblib

In [46]:
joblib.dump(knn,'knn_model.lb')

['knn_model.lb']