# Reading dataset

The original dataset comes from Kaggle: <a href="https://www.kaggle.com/fedesoriano/stroke-prediction-dataset">download it here</a>.

The dataset was modified so that the output classes are balanced, and the result was saved as `stroke-balanced.csv`.

In [24]:
import pandas as pd

# Load the class-balanced stroke dataset; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='stroke-balanced.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,44912,Male,12.0,0,0,No,children,Urban,67.06,16.1,Unknown,0
1,66972,Female,52.0,0,0,Yes,Govt_job,Urban,80.88,23.8,smokes,0
2,1451,Female,17.0,0,0,No,Private,Urban,78.46,23.5,Unknown,0
3,49797,Female,28.0,0,0,No,Private,Rural,75.53,34.9,never smoked,0
4,70241,Female,22.0,0,0,No,Private,Urban,66.29,20.5,smokes,0


# Handling missing values

In [25]:
# Flag every row that has a missing value in at least one column ...
mask = data.isna().any(axis='columns')
# ... and keep only the complete rows. `data` itself is not modified.
data_clean = data.loc[~mask]

# Input and output

In [26]:
# Features: every column except the row identifier and the target.
data_input = data_clean.drop(['id', 'stroke'], axis='columns')
# Target: the `stroke` label column.
data_output = data_clean.loc[:, 'stroke']

# Handling categorical data
## Numeric encoding

In [27]:
# Recode the three two-valued categorical columns as 0/1 integers.
binary_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'ever_married': {'Yes': 1, 'No': 0},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}

data_input_encoded_1 = (
    data_input
    .replace(binary_codes)
    # Pin the dtype explicitly. `replace` on a string column only yields an
    # integer column through pandas' implicit downcasting of object results,
    # which is deprecated. Without it the columns stay object-typed and the
    # later `pd.get_dummies` call would one-hot encode them as well.
    # This also fails loudly if a value outside the mappings above is present.
    .astype({column: 'int64' for column in binary_codes})
)

In [28]:
# Preview the first five rows after the 0/1 recoding.
data_input_encoded_1.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,12.0,0,0,0,children,1,67.06,16.1,Unknown
1,1,52.0,0,0,1,Govt_job,1,80.88,23.8,smokes
2,1,17.0,0,0,0,Private,1,78.46,23.5,Unknown
3,1,28.0,0,0,0,Private,0,75.53,34.9,never smoked
4,1,22.0,0,0,0,Private,1,66.29,20.5,smokes


## One-hot encoding

In [29]:
# One-hot encode the remaining multi-valued string columns.
# - `columns` is spelled out so the result does not depend on which columns
#   happen to have an object/string dtype at this point.
# - `dtype` is spelled out because the default dummy dtype differs between
#   pandas versions (uint8 before 2.0, bool afterwards); uint8 gives the
#   0/1 indicator columns shown in the preview below.
data_input_encoded_2 = pd.get_dummies(
    data_input_encoded_1,
    columns=['work_type', 'smoking_status'],
    dtype='uint8',
)

In [30]:
# Preview the first five rows of the fully numeric feature table.
data_input_encoded_2.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,12.0,0,0,0,1,67.06,16.1,0,0,0,0,1,1,0,0,0
1,1,52.0,0,0,1,1,80.88,23.8,1,0,0,0,0,0,0,0,1
2,1,17.0,0,0,0,1,78.46,23.5,0,0,1,0,0,1,0,0,0
3,1,28.0,0,0,0,0,75.53,34.9,0,0,1,0,0,0,0,1,0
4,1,22.0,0,0,0,1,66.29,20.5,0,0,1,0,0,0,0,0,1


# Split into (train - validation - test)

In [31]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all samples as the final test set.
X, X_test, y, y_test = train_test_split(
    data_input_encoded_2,
    data_output,
    random_state=0,
    test_size=0.20,
)

# Stage 2: take 25% of the remaining 80% as the validation set
# (0.25 * 0.80 = 0.20 of the total), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

In [32]:
# Report the feature/label shapes of the train, validation and test splits,
# one value per line, with a dashed rule between splits.
print(
    X_train.shape, y_train.shape,
    '---------------------',
    X_val.shape, y_val.shape,
    '---------------------',
    X_test.shape, y_test.shape,
    sep='\n',
)

(544, 17)
(544,)
---------------------
(182, 17)
(182,)
---------------------
(182, 17)
(182,)


# Feature scaling (Normalization)

In [33]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, so the validation and
# test splits are rescaled with statistics they did not contribute to.
scaler = MinMaxScaler().fit(X_train)

# Apply the same fitted transformation to all three splits.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Support Vector Machine
## Linear SVM

In [34]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [44]:
# Linear-kernel SVM fitted on the scaled training split.
svc = SVC(C=0.4, kernel='linear', random_state=0).fit(X_train_scaled, y_train)

# Predict on the split the model was fitted on and on the validation split.
y_pred_train, y_pred_val = (
    svc.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Training accuracy on the first line, validation accuracy on the second.
print(
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_val, y_pred_val),
    sep='\n',
)

0.7867647058823529
0.7857142857142857


## Poly SVM

In [56]:
# Degree-2 polynomial-kernel SVM fitted on the scaled training split.
svc = SVC(C=0.5, degree=2, kernel='poly', random_state=0).fit(X_train_scaled, y_train)

# Predict on the split the model was fitted on and on the validation split.
y_pred_train, y_pred_val = (
    svc.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Training accuracy on the first line, validation accuracy on the second.
print(
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_val, y_pred_val),
    sep='\n',
)

0.7904411764705882
0.7637362637362637


## RBF SVM

In [63]:
# RBF-kernel SVM fitted on the scaled training split.
svc = SVC(C=120, gamma=0.01, kernel='rbf', random_state=0).fit(X_train_scaled, y_train)

# Predict on the split the model was fitted on and on the validation split.
y_pred_train, y_pred_val = (
    svc.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Training accuracy on the first line, validation accuracy on the second.
print(
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_val, y_pred_val),
    sep='\n',
)

0.7941176470588235
0.7912087912087912


# Testing

In [64]:
# `svc` is rebound by each of the three SVM cells, so the model evaluated
# here is whichever of them was executed most recently. Guard against
# scoring the wrong model after out-of-order execution: the test set is
# meant to be scored once, with the RBF model.
assert svc.kernel == 'rbf', (
    "Expected `svc` to be the RBF model; re-run the RBF SVM cell before testing."
)

# Single final evaluation on the held-out test split.
y_pred_test = svc.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred_test))

0.7582417582417582
