In [1]:
pip install seaborn


[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install scipy


[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install sklearn


[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [5]:
df = pd.read_csv("Resultant_CSV.csv")

# 1. Selecting Important Features

In [6]:
#Selecting top 6 Important Features using chi2 score

X = df.iloc[:,0:11]
Y = df.iloc[:,-1]

bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X,Y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Columns','Chi-Score']  

X_new = fit.fit_transform(X,Y)
cols=featureScores[featureScores['Chi-Score']>10]['Columns']

X_new = pd.DataFrame(X_new,columns=cols)


# 2. Splitting Data into Training and Testing Dataset

In [7]:
#Splitting Data into Training and Testing Dataset

train_x,test_x,train_y,test_y  = train_test_split(df[cols],df['stroke'],random_state=1200,test_size=0.25)

train_x.shape,test_x.shape,train_y.shape,test_y.shape


((3292, 6), (1098, 6), (3292,), (1098,))

# 3. Training the classifier model on training dataset

In [8]:
#The model used here is Decision Tree for classification
#Training Phase

model = DecisionTreeClassifier(criterion='entropy')
model.fit(train_x,train_y)

# 4. Testing the classifier model on testing dataset

In [9]:
#Model predicting on Test Dataset
#Testing Phase

pred_y=model.predict(test_x)

# 5. Evaluating the model on various metrics

In [10]:
#Calculating Accuracy of the model

accuracy=accuracy_score(pred_y,test_y)*100
print("Accuracy of the model is {:.2f}".format(accuracy))

Accuracy of the model is 91.89


In [11]:
#Confusion Matrix of the model

confusion_matrix(pred_y,test_y)

array([[999,  43],
       [ 46,  10]], dtype=int64)

In [12]:
#All classes are treated equally to evaluate the overall performance of the classifier. Hence used Macro Average Precision

precision_score(test_y, pred_y, average='macro')

0.5686523169728543

In [13]:
f1_score(test_y, pred_y, average='macro')

0.5704206468175644

# 6. Hyperparameter Tuning

In [14]:
#Creating a model on criterion based on Gini Impurity and random state as 6.

model = DecisionTreeClassifier(criterion='gini', random_state=6)
model.fit(train_x,train_y)

In [15]:
pred_y=model.predict(test_x)

In [16]:
#After setting these hyperparameters, the Accuracy of the model has increased from 92.26 to 93.08

accuracy=accuracy_score(pred_y,test_y)*100
print("Accuracy of the model is {:.2f}".format(accuracy))

Accuracy of the model is 93.08


# 7. Saving Model to a pickle file

In [17]:
pickle.dump(model, open('classifier_model.pkl', 'wb'))