In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
df=pd.read_csv('/content/drive/MyDrive/datasets/stroke.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [None]:
df.isnull().sum()
# df.describe()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

#Data Preprocessing

In [None]:
df.bmi.fillna(df.bmi.mean(),inplace=True)

In [None]:
lst=['gender','ever_married','work_type','Residence_type','smoking_status']
dict1={}
for col in lst:
  dict1[col]=LabelEncoder()
  df[col]=dict1[col].fit_transform(df[col])

In [None]:
df.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [None]:
df_0=df[df.stroke==0]
df_0
df_1=df[df.stroke==1]
df_1
df_0.count()
df_1.count()

id                   249
gender               249
age                  249
hypertension         249
heart_disease        249
ever_married         249
work_type            249
Residence_type       249
avg_glucose_level    249
bmi                  249
smoking_status       249
stroke               249
dtype: int64

#Under Sampling

In [None]:
df_0_under=df[df.stroke==0].sample(249)
df_0_under.count()

id                   249
gender               249
age                  249
hypertension         249
heart_disease        249
ever_married         249
work_type            249
Residence_type       249
avg_glucose_level    249
bmi                  249
smoking_status       249
stroke               249
dtype: int64

In [None]:
df_under=pd.concat([df_0_under,df_1])
df_under.stroke.value_counts()

0    249
1    249
Name: stroke, dtype: int64

In [None]:
x=df_under.iloc[:,:-1]
x
y=df_under.iloc[:,-1]
y

4332    0
767     0
1115    0
2962    0
3951    0
       ..
244     1
245     1
246     1
247     1
248     1
Name: stroke, Length: 498, dtype: int64

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)
sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train)
x_test_scaled=sc.transform(x_test)

In [None]:
model_tree=DecisionTreeClassifier(max_depth=20)
model_neighbor=KNeighborsClassifier(n_neighbors=17)
model_svc=SVC(kernel='linear',C=0.01)
model_bayes=GaussianNB()

In [None]:
models=[model_tree,model_neighbor,model_svc,model_bayes]
accuracy={}
for model in models:
  model.fit(x_train_scaled,y_train)
  y_pred=model.predict(x_test_scaled)
  accuracy[f'{model}_acc']=accuracy_score(y_test,y_pred)
  print(f'Classification report {model}')
  print(classification_report(y_test,y_pred))

Classification report DecisionTreeClassifier(max_depth=20)
              precision    recall  f1-score   support

           0       0.67      0.62      0.64        68
           1       0.70      0.74      0.72        82

    accuracy                           0.69       150
   macro avg       0.68      0.68      0.68       150
weighted avg       0.69      0.69      0.69       150

Classification report KNeighborsClassifier(n_neighbors=17)
              precision    recall  f1-score   support

           0       0.69      0.68      0.68        68
           1       0.73      0.74      0.74        82

    accuracy                           0.71       150
   macro avg       0.71      0.71      0.71       150
weighted avg       0.71      0.71      0.71       150

Classification report SVC(C=0.01, kernel='linear')
              precision    recall  f1-score   support

           0       0.73      0.65      0.69        68
           1       0.73      0.80      0.77        82

    accuracy 

#Over Sampling

In [None]:
df_1_over=df[df.stroke==1].sample( 4861,replace=True)
df_1_over.count()

id                   4861
gender               4861
age                  4861
hypertension         4861
heart_disease        4861
ever_married         4861
work_type            4861
Residence_type       4861
avg_glucose_level    4861
bmi                  4861
smoking_status       4861
stroke               4861
dtype: int64

In [None]:
df_over=pd.concat([df_0,df_1_over])
df_over.count()
df_over.stroke.value_counts()

0    4861
1    4861
Name: stroke, dtype: int64

In [None]:
x1=df_over.iloc[:,:-1]
y1=df_over.iloc[:,-1]


In [None]:
x_train,x_test,y_train,y_test=train_test_split(x1,y1,test_size=0.3,random_state=1)
sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train)
x_test_scaled=sc.transform(x_test)

In [None]:
model_tree=DecisionTreeClassifier(max_depth=20)
model_neighbor=KNeighborsClassifier(n_neighbors=17)
model_svc=SVC(kernel='linear',C=0.01)
model_bayes=GaussianNB()

In [None]:
models=[model_tree,model_neighbor,model_svc,model_bayes]
accuracy={}
for model in models:
  model.fit(x_train_scaled,y_train)
  y_pred=model.predict(x_test_scaled)
  accuracy[f'{model}_acc']=accuracy_score(y_test,y_pred)
  print(f'Classification report {model}')
  print(classification_report(y_test,y_pred))

Classification report DecisionTreeClassifier(max_depth=20)
              precision    recall  f1-score   support

           0       1.00      0.93      0.97      1454
           1       0.94      1.00      0.97      1463

    accuracy                           0.97      2917
   macro avg       0.97      0.97      0.97      2917
weighted avg       0.97      0.97      0.97      2917

Classification report KNeighborsClassifier(n_neighbors=17)
              precision    recall  f1-score   support

           0       0.96      0.69      0.80      1454
           1       0.76      0.97      0.85      1463

    accuracy                           0.83      2917
   macro avg       0.86      0.83      0.83      2917
weighted avg       0.86      0.83      0.83      2917

Classification report SVC(C=0.01, kernel='linear')
              precision    recall  f1-score   support

           0       0.79      0.74      0.76      1454
           1       0.75      0.80      0.78      1463

    accuracy 

#SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
over_sampler=SMOTE()


In [None]:
x2=df.drop(columns='stroke')
y2=df.stroke
x2
y2


0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x2,y2,test_size=0.3,random_state=1)

In [None]:
x_train_sampled,y_train_sampled=over_sampler.fit_resample(x_train,y_train)
y_train_sampled.value_counts()
y_test.value_counts()

0    1450
1      83
Name: stroke, dtype: int64

In [None]:
y_test.value_counts()

0    1450
1      83
Name: stroke, dtype: int64

In [None]:
sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train_sampled)
x_test_scaled=sc.transform(x_test)

In [None]:
smote_model=KNeighborsClassifier(n_neighbors=17,metric='euclidean')
smote_model.fit(x_train_scaled,y_train_sampled)
y_pred=smote_model.predict(x_test_scaled)


In [None]:
print(accuracy_score(y_test,y_pred))

0.7606001304631441


In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.78      0.86      1450
           1       0.09      0.40      0.15        83

    accuracy                           0.76      1533
   macro avg       0.53      0.59      0.51      1533
weighted avg       0.91      0.76      0.82      1533



In [None]:
model_tree=DecisionTreeClassifier(max_depth=20)
model_neighbor=KNeighborsClassifier(n_neighbors=17)
model_svc=SVC(kernel='linear',C=0.01)
model_bayes=GaussianNB()

In [None]:

models=[model_tree,model_neighbor,model_svc,model_bayes]
accuracy={}
for model in models:
  model.fit(x_train_scaled,y_train_sampled)
  y_pred=model.predict(x_test_scaled)
  accuracy[f'{model}_acc']=accuracy_score(y_test,y_pred)
  print(f'Classification report {model}')
  print(classification_report(y_test,y_pred))

Classification report DecisionTreeClassifier(max_depth=20)
              precision    recall  f1-score   support

           0       0.95      0.89      0.92      1450
           1       0.11      0.25      0.16        83

    accuracy                           0.85      1533
   macro avg       0.53      0.57      0.54      1533
weighted avg       0.91      0.85      0.88      1533

Classification report KNeighborsClassifier(n_neighbors=17)
              precision    recall  f1-score   support

           0       0.96      0.78      0.86      1450
           1       0.09      0.40      0.15        83

    accuracy                           0.76      1533
   macro avg       0.53      0.59      0.51      1533
weighted avg       0.91      0.76      0.82      1533

Classification report SVC(C=0.01, kernel='linear')
              precision    recall  f1-score   support

           0       0.97      0.79      0.87      1450
           1       0.14      0.60      0.23        83

    accuracy 