In [1]:
# Imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,charges,insuranceclaim
0,19,0,27.9,0,1,16884.924,1
1,18,1,33.77,1,0,1725.5523,1
2,28,1,33.0,3,0,4449.462,0
3,33,1,22.705,0,0,21984.47061,0
4,32,1,28.88,0,0,3866.8552,1


In [5]:
df.shape

(1338, 7)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1338 non-null   int64  
 1   gender          1338 non-null   int64  
 2   bmi             1338 non-null   float64
 3   children        1338 non-null   int64  
 4   smoker          1338 non-null   int64  
 5   charges         1338 non-null   float64
 6   insuranceclaim  1338 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


In [7]:
df.isnull().sum()

age               0
gender            0
bmi               0
children          0
smoker            0
charges           0
insuranceclaim    0
dtype: int64

In [8]:
X=df.drop("insuranceclaim",axis=1)
y=df["insuranceclaim"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [10]:
# model
# gini index
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [11]:
# train score
model.score(X_train,y_train)

1.0

In [12]:
# predict
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       157
           1       0.99      0.99      0.99       245

    accuracy                           0.99       402
   macro avg       0.98      0.98      0.98       402
weighted avg       0.99      0.99      0.99       402



In [13]:
# entropy

model2 = DecisionTreeClassifier(criterion="entropy")
model2.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy')

In [14]:
# predict
y_pred = model2.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       157
           1       0.99      0.99      0.99       245

    accuracy                           0.99       402
   macro avg       0.98      0.98      0.98       402
weighted avg       0.99      0.99      0.99       402



In [15]:
# depth
model.get_depth()

9

In [16]:
# prunning
# max depth

model3 = DecisionTreeClassifier(max_depth=8)
model3.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=8)

In [17]:
# predict
y_pred = model3.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       157
           1       0.98      1.00      0.99       245

    accuracy                           0.99       402
   macro avg       0.99      0.99      0.99       402
weighted avg       0.99      0.99      0.99       402



In [18]:
# min samples leaf
model4 = DecisionTreeClassifier(min_samples_leaf=15)
model4.fit(X_train,y_train)

DecisionTreeClassifier(min_samples_leaf=15)

In [19]:
# predict
y_pred = model4.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       157
           1       0.95      0.95      0.95       245

    accuracy                           0.94       402
   macro avg       0.93      0.94      0.93       402
weighted avg       0.94      0.94      0.94       402



In [20]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

In [21]:
rf = RandomForestClassifier(n_estimators=75, max_depth=18)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=18, n_estimators=75)

In [22]:
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       157
           1       0.97      0.95      0.96       245

    accuracy                           0.96       402
   macro avg       0.95      0.96      0.95       402
weighted avg       0.96      0.96      0.96       402

