In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle

In [None]:
# Load the raw stroke records from CSV into a DataFrame.
csv_path = '/content/drive/MyDrive/ML/strokes.csv'  # absolute path; adjust when running elsewhere
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


## Part A: Data preprocessing

In [None]:
# One-hot encode gender: every distinct value becomes its own indicator column,
# and the original text column is removed.
one_hot = pd.get_dummies(df['gender'])
df = df.drop(columns='gender').join(one_hot)
df.head(n=5)

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other
0,30669,3.0,0,0,No,children,Rural,95.12,18.0,,0,0,1,0
1,30468,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0,0,1,0
2,16523,8.0,0,0,No,Private,Urban,110.89,17.6,,0,1,0,0
3,56543,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0,1,0,0
4,46136,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0,0,1,0


In [None]:
# Bucket ages into ten-year intervals: (0, 10], (10, 20], ..., (90, 100].
bins = list(range(0, 101, 10))
# pop() removes the raw 'age' column and hands it to pd.cut in one step.
df['Age_group'] = pd.cut(df.pop('age'), bins=bins)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other,Age_group
0,30669,0,0,No,children,Rural,95.12,18.0,,0,0,1,0,"(0, 10]"
1,30468,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0,0,1,0,"(50, 60]"
2,16523,0,0,No,Private,Urban,110.89,17.6,,0,1,0,0,"(0, 10]"
3,56543,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0,1,0,0,"(60, 70]"
4,46136,0,0,No,Never_worked,Rural,161.28,19.1,,0,0,1,0,"(10, 20]"


In [None]:
# Replace each age interval with an integer code produced by LabelEncoder.
label_encoder = LabelEncoder()
age_codes = label_encoder.fit_transform(df['Age_group'])
df = df.assign(Age_group=age_codes)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other,Age_group
0,30669,0,0,No,children,Rural,95.12,18.0,,0,0,1,0,0
1,30468,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0,0,1,0,5
2,16523,0,0,No,Private,Urban,110.89,17.6,,0,1,0,0,0
3,56543,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0,1,0,0,6
4,46136,0,0,No,Never_worked,Rural,161.28,19.1,,0,0,1,0,1


In [None]:
# Turn the Yes/No 'ever_married' column into two 0/1 indicator columns.
change_marr = dict(Yes=1, No=0)
df['Married'] = df.pop('ever_married').map(change_marr)
# 'Single' is the exact complement of 'Married'.
df['Single'] = 1 - df['Married']
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single
0,30669,0,0,children,Rural,95.12,18.0,,0,0,1,0,0,0,1
1,30468,1,0,Private,Urban,87.96,39.2,never smoked,0,0,1,0,5,1,0
2,16523,0,0,Private,Urban,110.89,17.6,,0,1,0,0,0,0,1
3,56543,0,0,Private,Rural,69.04,35.9,formerly smoked,0,1,0,0,6,1,0
4,46136,0,0,Never_worked,Rural,161.28,19.1,,0,0,1,0,1,0,1


In [None]:
# One-hot encode work_type and drop the original text column.
one_hot = pd.get_dummies(df['work_type'])
df = df.drop(columns='work_type').join(one_hot)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children
0,30669,0,0,Rural,95.12,18.0,,0,0,1,0,0,0,1,0,0,0,0,1
1,30468,1,0,Urban,87.96,39.2,never smoked,0,0,1,0,5,1,0,0,0,1,0,0
2,16523,0,0,Urban,110.89,17.6,,0,1,0,0,0,0,1,0,0,1,0,0
3,56543,0,0,Rural,69.04,35.9,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0
4,46136,0,0,Rural,161.28,19.1,,0,0,1,0,1,0,1,0,1,0,0,0


In [None]:
# One-hot encode Residence_type and drop the original text column.
one_hot = pd.get_dummies(df['Residence_type'])
df = df.drop(columns='Residence_type').join(one_hot)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban
0,30669,0,0,95.12,18.0,,0,0,1,0,0,0,1,0,0,0,0,1,1,0
1,30468,1,0,87.96,39.2,never smoked,0,0,1,0,5,1,0,0,0,1,0,0,0,1
2,16523,0,0,110.89,17.6,,0,1,0,0,0,0,1,0,0,1,0,0,0,1
3,56543,0,0,69.04,35.9,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0,1,0
4,46136,0,0,161.28,19.1,,0,0,1,0,1,0,1,0,1,0,0,0,1,0


In [None]:
def normalization(col):
  """Min-max scale a column so its values fall in the range [0, 1].

  Parameters
  ----------
  col : pandas.Series
      Numeric column to rescale. Missing values are ignored when the
      minimum and maximum are computed and stay missing in the result.

  Returns
  -------
  pandas.Series
      ``(col - min) / (max - min)``. A column whose values are all equal
      has a zero range; it maps to all zeros instead of dividing 0 by 0
      (which would turn every row into NaN).
  """
  lowest = col.min()
  span = col.max() - lowest
  # Only guard the scalar case; a non-scalar span (e.g. a DataFrame was
  # passed) keeps the plain element-wise division.
  if getattr(span, 'ndim', 0) == 0 and span == 0:
    span = 1  # constant column: (col - lowest) is already all zeros
  return (col - lowest) / span

In [None]:
# Min-max scale avg_glucose_level into a new column and drop the raw one.
glucose_scaled = normalization(df['avg_glucose_level'])
df = (
    df.assign(**{'Normalized avg_glu_lvl': glucose_scaled})
      .drop(columns='avg_glucose_level')
)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,bmi,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl
0,30669,0,0,18.0,,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0.169964
1,30468,1,0,39.2,never smoked,0,0,1,0,5,1,0,0,0,1,0,0,0,1,0.139631
2,16523,0,0,17.6,,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0.236772
3,56543,0,0,35.9,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0,1,0,0.059479
4,46136,0,0,19.1,,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0.450244


In [None]:
# Min-max scale bmi into a new column; pop() removes the raw column as it is read.
df['Normalized BMI'] = normalization(df.pop('bmi'))
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl,Normalized BMI
0,30669,0,0,,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0.169964,0.090286
1,30468,1,0,never smoked,0,0,1,0,5,1,0,0,0,1,0,0,0,1,0.139631,0.332571
2,16523,0,0,,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0.236772,0.085714
3,56543,0,0,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0,1,0,0.059479,0.294857
4,46136,0,0,,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0.450244,0.102857


In [None]:
# Fill missing BMI values with the column mean.
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
mean_bmi = df['Normalized BMI'].mean()
df = df.fillna({'Normalized BMI': mean_bmi})
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl,Normalized BMI
0,30669,0,0,,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0.169964,0.090286
1,30468,1,0,never smoked,0,0,1,0,5,1,0,0,0,1,0,0,0,1,0.139631,0.332571
2,16523,0,0,,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0.236772,0.085714
3,56543,0,0,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0,1,0,0.059479,0.294857
4,46136,0,0,,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0.450244,0.102857


In [None]:
# Fill missing smoking_status values with the most frequent category.
# mode() returns a Series (ties are possible); its first entry is used.
mode_smoke = df['smoking_status'].mode()
df = df.fillna({'smoking_status': mode_smoke[0]})
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,smoking_status,stroke,Female,Male,Other,Age_group,Married,Single,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl,Normalized BMI
0,30669,0,0,never smoked,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0.169964,0.090286
1,30468,1,0,never smoked,0,0,1,0,5,1,0,0,0,1,0,0,0,1,0.139631,0.332571
2,16523,0,0,never smoked,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0.236772,0.085714
3,56543,0,0,formerly smoked,0,1,0,0,6,1,0,0,0,1,0,0,1,0,0.059479,0.294857
4,46136,0,0,never smoked,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0.450244,0.102857


In [None]:
# One-hot encode smoking_status and drop the original text column.
one_hot = pd.get_dummies(df['smoking_status'])
df = df.drop(columns='smoking_status').join(one_hot)
df.head(n=5)

Unnamed: 0,id,hypertension,heart_disease,stroke,Female,Male,Other,Age_group,Married,Single,...,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl,Normalized BMI,formerly smoked,never smoked,smokes
0,30669,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0.169964,0.090286,0,1,0
1,30468,1,0,0,0,1,0,5,1,0,...,1,0,0,0,1,0.139631,0.332571,0,1,0
2,16523,0,0,0,1,0,0,0,0,1,...,1,0,0,0,1,0.236772,0.085714,0,1,0
3,56543,0,0,0,1,0,0,6,1,0,...,1,0,0,1,0,0.059479,0.294857,1,0,0
4,46136,0,0,0,0,1,0,1,0,1,...,0,0,0,1,0,0.450244,0.102857,0,1,0


In [None]:
# Drop the id column and move the target column 'stroke' to the first position.
df = df.drop(columns='id')
isStroke = df.pop('stroke')
df.insert(0, 'stroke', isStroke)
df.head(n=5)

Unnamed: 0,stroke,hypertension,heart_disease,Female,Male,Other,Age_group,Married,Single,Govt_job,...,Private,Self-employed,children,Rural,Urban,Normalized avg_glu_lvl,Normalized BMI,formerly smoked,never smoked,smokes
0,0,0,0,0,1,0,0,0,1,0,...,0,0,1,1,0,0.169964,0.090286,0,1,0
1,0,1,0,0,1,0,5,1,0,0,...,1,0,0,0,1,0.139631,0.332571,0,1,0
2,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0.236772,0.085714,0,1,0
3,0,0,0,1,0,0,6,1,0,0,...,1,0,0,1,0,0.059479,0.294857,1,0,0
4,0,0,0,0,1,0,1,0,1,0,...,0,0,0,1,0,0.450244,0.102857,0,1,0


## Part B: Train/test split and class balancing

In [None]:
# Separate the features from the target. Selecting by column name (rather than
# by position) keeps 'stroke' out of X even if the column order changes.
X = df.drop(columns='stroke')
Y = df['stroke']

In [None]:
# Hold out 20% of the rows for testing.
# - stratify=Y keeps the stroke / no-stroke ratio the same in both partitions
#   (stroke cases are rare in this data).
# - random_state fixes the split so results are reproducible after a kernel
#   restart; 42 is an arbitrary seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [None]:
# Balance the training set only (the test set keeps its original class ratio)
# by randomly discarding rows of the majority class.
# random_state is fixed in both calls so the resampled training set is the
# same on every run; 42 is an arbitrary seed.
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, Y_resampled = undersampler.fit_resample(x_train, y_train)
# Shuffle features and labels together so rows stay aligned.
X_resampled, Y_resampled = shuffle(X_resampled, Y_resampled, random_state=42)

## Part C: SVM training and evaluation

In [None]:
# Fit a linear-kernel SVM on the balanced training set, then predict on the
# untouched test set. fit() returns the estimator itself.
clf = svm.SVC(kernel='linear').fit(X_resampled, Y_resampled)
y_pred = clf.predict(x_test)

In [None]:
# Confusion matrix with rows = true class, columns = predicted class.
# labels=[0, 1] pins the layout to 2x2 in the order (no stroke, stroke), so the
# four-way unpacking of this matrix later in the notebook cannot fail if one
# class happens to be absent from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[6164 2336]
 [  28  152]]


In [None]:
# Unpack the 2x2 matrix row by row: first row is the true-negative class,
# second row is the true-positive class.
(tn, fp), (fn, tp) = conf_matrix
for caption, count in (
    ("True Negative:", tn),
    ("False Positive:", fp),
    ("False Negative:", fn),
    ("True Positive:", tp),
):
    print(caption, count)

True Negative: 6164
False Positive: 2336
False Negative: 28
True Positive: 152


In [None]:
# Accuracy = (TP + TN) / all predictions, shown as a percentage with 2 decimals.
acc = round(100 * accuracy_score(y_test, y_pred), 2)
print("Accuracy: {} %".format(acc))

Accuracy: 72.76 %


In [None]:
# Precision = TP / (TP + FP), shown as a percentage with 2 decimals.
precision = round(100 * precision_score(y_test, y_pred), 2)
print("Precision: {} %".format(precision))

Precision: 6.11 %


In [None]:
# Recall = TP / (TP + FN), shown as a percentage with 2 decimals.
recall = round(100 * recall_score(y_test, y_pred), 2)
print("Recall: {} %".format(recall))

Recall: 84.44 %


In [None]:
# F1 = 2 * precision * recall / (precision + recall), the harmonic mean of
# precision and recall, shown as a percentage with 2 decimals.
f1 = round(100 * f1_score(y_test, y_pred), 2)
print("F1 Score: {} %".format(f1))

F1 Score: 11.39 %
