<a href="https://colab.research.google.com/github/AkhilByju/Stroke-Predictor/blob/main/Stroke_Predictor_CS97.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph
import os
import seaborn as sns # used for plot interactive graph.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import sklearn.metrics.cluster as smc
from sklearn.model_selection import KFold


from matplotlib import pyplot
import itertools

%matplotlib inline

import random

random.seed(42)

In [None]:
# Helper function allowing you to export a graph
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>``.

    Args:
        fig_id: File name without extension; the file is written relative to
            the current working directory.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: Output format, also used as the file extension.
        resolution: DPI passed to ``plt.savefig``.
    """
    target_file = ".".join((fig_id, fig_extension))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_file, format=fig_extension, dpi=resolution)

In [None]:
# Helper function that allows you to draw nicely formatted confusion matrices
def draw_confusion_matrix(y, yhat, classes):
    '''
        Plot the confusion matrix of targets `y` against predictions `yhat`.

        Rows are true labels and columns are predicted labels. `classes[k]`
        is used as the tick text for row/column k of the matrix returned by
        `confusion_matrix`, so the names must be given in that matrix's order.
        Adapted from scikit-learn and discussion example.
    '''
    # Reset the current axes and figure so earlier plots do not bleed through.
    plt.cla()
    plt.clf()

    counts = confusion_matrix(y, yhat)
    plt.imshow(counts, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=90)
    plt.yticks(tick_positions, classes)

    # Cells darker than half the maximum count get white text for contrast.
    midpoint = counts.max() / 2.
    for row, col in np.ndindex(counts.shape):
        cell = counts[row, col]
        text_color = "white" if cell > midpoint else "black"
        plt.text(col, row, f"{cell:d}",
                 horizontalalignment="center",
                 color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [None]:
def print_4_metrics(target_test, predicted):
    """Print accuracy, precision, recall and F1 for binary predictions.

    Args:
        target_test: Ground-truth labels.
        predicted: Predicted labels, aligned with `target_test`.

    Precision, recall and F1 are computed for the positive class (label 1).
    """
    binary_options = dict(labels=None, pos_label=1, average='binary', sample_weight=None)
    scorers = (
        ('Accuracy:', metrics.accuracy_score, {}),
        ('Precision:', metrics.precision_score, binary_options),
        ('Recall:', metrics.recall_score, binary_options),
        ('F1 Score:', metrics.f1_score, binary_options),
    )
    # Each score is computed right before it is printed, one line per metric.
    for caption, scorer, options in scorers:
        print("%-12s %f" % (caption, scorer(target_test, predicted, **options)))


**Data Preprocessing**

In [None]:
# Turn the Google Drive "share" link into a direct-download URL by pulling the
# file id out of the path (the second-to-last path segment).
share_link = 'https://drive.google.com/file/d/1_U-J7E30JmE8Q7KuMT723dI7QxJAxhAB/view?usp=sharing'
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id

df = pd.read_csv(url)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Column dtypes and non-null counts (shows which columns have missing values).
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Deleting ID because this column is not needed in the model

In [None]:
# `id` carries no predictive information; work on a frame without it.
data = df.drop(columns='id')

In [None]:
# Preview the frame after dropping `id`.
data.head()

Filling in Null Values

Median filled into BMI (Median Imputation)

In [None]:
# Median imputation for missing BMI values.
median = data["bmi"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data["bmi"] = data["bmi"].fillna(median)

# Rows that still contain at least one null value (expected to be empty).
null_rows = data[data.isnull().any(axis=1)]
null_rows

Removing rows with unknown smoking status

In [None]:
# Class distribution of the target column.
data["stroke"].value_counts()

In [None]:
# Keep only the rows whose smoking status is actually recorded.
has_smoking_info = data["smoking_status"].ne("Unknown")
known_smoking = data[has_smoking_info]

In [None]:
# Sorting on `stroke` in descending order puts the stroke == 1 rows first;
# head(404) then keeps the top 404 rows of that ordering.
# NOTE(review): 404 is presumably 2 x the number of stroke cases among the
# known-smoking rows, giving a 50/50 class balance — confirm against
# value_counts(). The non-stroke rows kept are whichever ones the sort places
# next, not a random sample.
balanced_data = known_smoking.sort_values('stroke', ascending=False).head(404)
balanced_data

In [None]:
# Sanity check: list any rows of the balanced subset that still hold a null.
row_has_null = balanced_data.isnull().any(axis=1)
null_rows = balanced_data[row_has_null]
null_rows

Preprocessing the Dataset

In [None]:
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# `balanced_data` was obtained by slicing another frame. Assigning columns to
# such a slice raises SettingWithCopyWarning and may not write through, so
# detach it into an independent copy before encoding.
balanced_data = balanced_data.copy()

# Categorical Features
categorical_features = ["gender", 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
# Numerical Features
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# Integer-encode every categorical column. One fitted encoder per column is
# kept so the codes can be mapped back to the original category names.
label_encoders = {}
for c in categorical_features:
    le = preprocessing.LabelEncoder()
    balanced_data[c] = le.fit_transform(balanced_data[c])
    label_encoders[c] = le

# Standardise the continuous columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features — consider fitting on the
# training rows only.
scaler = StandardScaler()
scale_data = balanced_data[numerical_features]
binary_data = balanced_data.drop(numerical_features, axis=1)
scaled_data = scaler.fit_transform(scale_data)

df_scaled_data = pd.DataFrame(scaled_data, columns=scale_data.columns)
# `df_scaled_data` has a fresh 0..n-1 index while `binary_data` still carries
# the original row labels. pd.concat(axis=1) aligns on the index, so the labels
# must be reset first or the rows would be mismatched and padded with NaN.
# reset_index() keeps the old labels in an 'index' column, which the next cell
# drops.
binary_data = binary_data.reset_index()
all_data = pd.concat([df_scaled_data, binary_data], axis=1)

# Column-wise concatenation must not add or lose rows.
assert len(all_data) == len(balanced_data)


In [None]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
all_data = all_data.drop(columns=['index'], errors='ignore')

In [None]:
# Pairwise correlations, then each feature's correlation with the target,
# strongest positive first.
df_correlation = all_data.corr()
stroke_correlation = df_correlation['stroke']
stroke_correlation.sort_values(ascending=False)

In [None]:
# Separate the label column from the feature matrix.
data_prepared = all_data.drop(columns='stroke')
data_target = all_data["stroke"]

In [None]:
# Display the prepared feature matrix (label column removed).
data_prepared

# **MODELS**

Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train, test, target, target_test = train_test_split(
    data_prepared,
    data_target,
    test_size=0.2,
    random_state=0,
)

Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression (liblinear solver), fitted on the training split.
log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=0.15).fit(train, target)

# Hard class predictions and the predicted probability of the positive class.
log_predicted = log_reg.predict(test)
log_score = log_reg.predict_proba(test)[:, 1]
print_4_metrics(target_test, log_predicted)

In [None]:
# Candidate inverse-regularisation strengths to compare.
C_values = [0.10, 0.15, 0.20, 0.25, 0.30]

# NOTE(review): accuracy is measured on the test split, so picking C from this
# table makes the final test score optimistic; cross-validation on the training
# split would avoid that.
for c in C_values:
    # Use the loop value here. The previous version hard-coded C=0.15, so every
    # iteration fitted the same model and reported the same accuracy.
    # Sweep-local names keep `log_reg` / `log_predicted` (the model the ROC
    # scores were computed from) from being overwritten.
    candidate = LogisticRegression(penalty='l1', solver='liblinear', C=c)
    candidate.fit(train, target)
    candidate_predicted = candidate.predict(test)

    print(f"C value: {c}")
    print(f"Accuracy: {metrics.accuracy_score(target_test, candidate_predicted)}", '\n')

In [None]:
# Accuracy / precision / recall / F1 of the logistic-regression predictions on the test split.
print_4_metrics(target_test, log_predicted)

In [None]:
# Plot ROC curve and report area under ROC
# use metrics.roc_curve(your y_test, predicted probabilities for y_test)

# ROC curve from the positive-class probabilities, plus the area under it.
fpr_log_reg, tpr_log_reg, thresholds = metrics.roc_curve(target_test, log_score)
aucroc = metrics.auc(fpr_log_reg, tpr_log_reg)

print("Logistic Model Performance Results:\n")

pyplot.figure(1)
pyplot.plot(fpr_log_reg, tpr_log_reg, color='orange', lw=1)
pyplot.title("ROC curve with Logistic Regression")
pyplot.xlabel('FPR')
pyplot.ylabel('TPR')

print('AUC of ROC: ', aucroc)


In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix orders rows/columns by ascending label value, so position 0
# is label 0 (no stroke) and position 1 is label 1 (stroke). The class names
# must be passed in that order; they were previously reversed, which swapped
# the tick labels on both axes.
draw_confusion_matrix(target_test, log_predicted, ['No Stroke', 'Stroke'])

K-Nearest Neighbor

In [None]:
# Fit one KNN classifier per neighbourhood size and report its test accuracy.
for k in range(1, 300):
    KNN = KNeighborsClassifier(n_neighbors=k).fit(train, target)
    KNN_predicted = KNN.predict(test)
    knn_accuracy = metrics.accuracy_score(target_test, KNN_predicted)

    print(f"K value: {k}")
    print(f"Accuracy: {knn_accuracy}", '\n')

In [None]:
# Final KNN model: 81 neighbours, Manhattan distance (p=1), ball-tree search.
KNN = KNeighborsClassifier(n_neighbors=81, algorithm='ball_tree', p=1).fit(train, target)

# Hard class predictions and the predicted probability of the positive class.
KNN_predicted = KNN.predict(test)
KNN_score = KNN.predict_proba(test)[:, 1]

In [None]:
# Accuracy / precision / recall / F1 of the KNN predictions on the test split.
print_4_metrics(target_test, KNN_predicted)

In [None]:
# Plot ROC curve and report area under ROC
# use metrics.roc_curve(your y_test, predicted probabilities for y_test)

# ROC curve from the positive-class probabilities, plus the area under it.
fpr_KNN, tpr_KNN, thresholds = metrics.roc_curve(target_test, KNN_score)
aucroc = metrics.auc(fpr_KNN, tpr_KNN)

print("KNN Model Performance Results:\n")

pyplot.figure(1)
pyplot.plot(fpr_KNN, tpr_KNN, color='orange', lw=1)
pyplot.title("ROC curve with KNN")
pyplot.xlabel('FPR')
pyplot.ylabel('TPR')

print('AUC of ROC: ', aucroc)


In [None]:
# Class names follow the matrix order: label 0 (no stroke) first, label 1
# (stroke) second. They were previously reversed, mislabelling both axes.
draw_confusion_matrix(target_test, KNN_predicted, ['No Stroke', 'Stroke'])

Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Sigmoid-kernel SVM. probability=True fits an extra internal, randomised
# cross-validation to calibrate predict_proba; random_state pins that
# randomness so the probabilities (and the ROC curve built from them) are the
# same on every run, matching the seeded tree models further down.
svm = SVC(probability=True, C=.25, kernel="sigmoid", gamma='auto', random_state=0)

svm.fit(train, target)

# Hard class predictions and the predicted probability of the positive class.
svm_predicted = svm.predict(test)

svm_score = svm.predict_proba(test)[:, 1]

In [None]:
# Candidate regularisation strengths to compare.
C_values = [0.10, 0.15, 0.20, 0.25, 0.30]

for c in C_values:
    # Sweep-local names: reusing `svm` / `svm_predicted` here would overwrite
    # the fitted model above, leaving later metrics and the confusion matrix
    # describing a different model than the one `svm_score` came from.
    # probability=True is omitted because only hard predictions are needed.
    candidate_svm = SVC(C=c, kernel="sigmoid", gamma='auto')
    candidate_svm.fit(train, target)
    candidate_predicted = candidate_svm.predict(test)

    print(f"C value: {c}")
    # print_4_metrics prints its own lines and returns None, so it must be
    # called as a statement; embedding it in an f-string printed
    # "Accuracy: None".
    print_4_metrics(target_test, candidate_predicted)
    print()

In [None]:
# Accuracy / precision / recall / F1 of the SVM predictions on the test split.
print_4_metrics(target_test, svm_predicted)

In [None]:
# ROC curve from the positive-class probabilities, plus the area under it.
fpr_svm, tpr_svm, thresholds = metrics.roc_curve(target_test, svm_score)
aucroc = metrics.auc(fpr_svm, tpr_svm)

print("Support Vector Machine Performance Results:\n")

pyplot.figure(1)
pyplot.plot(fpr_svm, tpr_svm, color='orange', lw=1)
pyplot.title("ROC curve with SVM")
pyplot.xlabel('FPR')
pyplot.ylabel('TPR')

print('AUC of ROC: ', aucroc)

In [None]:
# Class names follow the matrix order: label 0 (no stroke) first, label 1
# (stroke) second. They were previously reversed, mislabelling both axes.
draw_confusion_matrix(target_test, svm_predicted, ['No Stroke', 'Stroke'])

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Depth-limited decision tree with a fixed seed, fitted on the training split.
tree = DecisionTreeClassifier(max_depth=5, random_state=0).fit(train, target)

# Hard class predictions and the predicted probability of the positive class.
tree_predicted = tree.predict(test)
tree_score = tree.predict_proba(test)[:, 1]

In [None]:
# Compare test accuracy across maximum tree depths 1..24.
for depth in range(1, 25):
    # Sweep-local names: reusing `tree` / `tree_predicted` would leave the
    # depth-24 model in place after the loop, so the metrics, confusion matrix
    # and exported tree would no longer match `tree_score` (max_depth=5).
    candidate_tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    candidate_tree.fit(train, target)
    candidate_predicted = candidate_tree.predict(test)

    print(f"max_depth: {depth}")
    print(f"Accuracy: {metrics.accuracy_score(target_test, candidate_predicted)}", '\n')

In [None]:
# Accuracy / precision / recall / F1 of the decision-tree predictions on the test split.
print_4_metrics(target_test, tree_predicted)

In [None]:
# ROC curve from the positive-class probabilities, plus the area under it.
fpr_tree, tpr_tree, thresholds = metrics.roc_curve(target_test, tree_score)
aucroc = metrics.auc(fpr_tree, tpr_tree)

print("Decision Tree Performance Results:\n")

pyplot.figure(1)
pyplot.plot(fpr_tree, tpr_tree, color='orange', lw=1)
pyplot.title("ROC curve with Decision Tree")
pyplot.xlabel('FPR')
pyplot.ylabel('TPR')

print('AUC of ROC: ', aucroc)

In [None]:
# Class names follow the matrix order: label 0 (no stroke) first, label 1
# (stroke) second. They were previously reversed, mislabelling both axes.
draw_confusion_matrix(target_test, tree_predicted, ['No Stroke', 'Stroke'])

Decision Tree (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random forest: an ensemble of 50 decision trees with a fixed seed.
rf_tree = RandomForestClassifier(n_estimators=50, random_state=0).fit(train, target)

# Hard class predictions and the predicted probability of the positive class.
rf_tree_predicted = rf_tree.predict(test)
rf_tree_score = rf_tree.predict_proba(test)[:, 1]
print_4_metrics(target_test, rf_tree_predicted)

In [None]:
# ROC curve from the positive-class probabilities, plus the area under it.
fpr_rf_tree, tpr_rf_tree, thresholds = metrics.roc_curve(target_test, rf_tree_score)
aucroc = metrics.auc(fpr_rf_tree, tpr_rf_tree)

print("Decision Tree (with Random Forest) Performance Results:\n")

pyplot.figure(1)
pyplot.plot(fpr_rf_tree, tpr_rf_tree, color='orange', lw=1)
pyplot.title("ROC curve with Decision Tree + Random Forest")
pyplot.xlabel('FPR')
pyplot.ylabel('TPR')

print('AUC of ROC: ', aucroc)

In [None]:
# Class names follow the matrix order: label 0 (no stroke) first, label 1
# (stroke) second. They were previously reversed, mislabelling both axes.
draw_confusion_matrix(target_test, rf_tree_predicted, ['No Stroke', 'Stroke'])

# Visual Model of Trees

Links to Generated Images:

Decision Tree: https://drive.google.com/file/d/1oR19aWImKc_sSxXKpNyUQL4sQfjck2dB/view?usp=sharing

Random Forest Decision Tree (1/100): https://drive.google.com/file/d/1fdtxiOqqent3u3pRRCILexspCsrcX00E/view?usp=sharing

In [None]:
# Decision Tree Model
import graphviz
from sklearn.tree import export_graphviz

# Render the fitted decision tree to decision_tree.pdf.
# The DOT source gets its own name: it used to be stored in `data`, which
# silently replaced the DataFrame of that name and broke any re-run of the
# earlier preprocessing cells.
tree_dot = export_graphviz(tree, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(tree_dot, format='pdf')
graph.render("decision_tree", format='pdf', cleanup=True)

In [None]:
# The first random forest tree models
# Render only the first of the forest's fitted trees.
first_rf_tree = rf_tree.estimators_[0]
rf_data = export_graphviz(
    first_rf_tree,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
)
rf_graph = graphviz.Source(rf_data, format='pdf')
rf_graph.render("rf_decision_tree_#1", format='pdf', cleanup=True)

Neural Network

Preprocessing




In [None]:
# Independent copy: `data_nn = df` only created a second name for the same
# object, so every change below also modified the raw frame.
data_nn = df.copy()

# Median imputation for BMI. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to update the frame.
median = data_nn["bmi"].median()
data_nn["bmi"] = data_nn["bmi"].fillna(median)
null_rows = data_nn[data_nn.isnull().any(axis=1)]

# Same subset as for the classical models: rows with a known smoking status,
# stroke cases first, top 404 rows.
balanced_data_nn = (
    data_nn[data_nn["smoking_status"] != "Unknown"]
    .sort_values('stroke', ascending=False)
    .head(404)
)

# Category -> integer code tables used by StrokeDataset.
gender_map = {'Male': 0,'Female': 1,'Other': 2}
ever_married_map = {'No': 0,'Yes': 1}
# 'Govt_job' was misspelled 'Govt_jov'; the unmatched category mapped to NaN,
# so every government-job row was later discarded by dropna().
work_type_map = {'children': 0,'Govt_job': 1,'Never_worked': 2,'Private': 3,'Self-employed': 4}
residence_type_map = {'Rural': 0,'Urban': 1}
smoking_status_map = {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3}

In [None]:
import pandas as pd
from torch.utils.data import Dataset

class StrokeDataset(Dataset):
    """Torch dataset over the stroke table.

    Categorical columns are converted to integer codes with the module-level
    ``*_map`` dictionaries, rows with any missing value are removed and,
    optionally, every column is divided by its largest absolute value.

    Args:
        dataset: DataFrame holding the feature columns plus ``stroke`` and
            ``id``. It is copied, never modified.
        normalize: When true, scale each column by its maximum absolute value.
        feature_transform: Optional callable applied to the feature Series of
            a row before it is returned.
        label_transform: Optional callable applied to the row's label.
    """

    def __init__(self, dataset, normalize=True, feature_transform=None, label_transform=None):
        self.feature_transform = feature_transform
        self.label_transform = label_transform

        # Work on a private copy. Mapping the caller's frame in place made
        # construction non-repeatable: a second dataset built from the same
        # frame would map the already-encoded integers to NaN and drop every row.
        self.df = dataset.copy()

        # map strings to numbers
        categorical_maps = {
            'gender': gender_map,
            'ever_married': ever_married_map,
            'work_type': work_type_map,
            'Residence_type': residence_type_map,
            'smoking_status': smoking_status_map,
        }
        for column, mapping in categorical_maps.items():
            self.df[column] = self.df[column].map(mapping, na_action='ignore')

        # remove rows with missing data (including categories missing from a map)
        self.df = self.df.dropna()

        # normalize: per-column scale factors, in column order
        normalize_data = []

        if normalize:
            for col in self.df.columns:
                col_max = self.df[col].abs().max()
                normalize_data.append(col_max)
                # An all-zero column has nothing to scale; dividing by its
                # maximum would turn every value into NaN (0 / 0).
                if col_max != 0:
                    self.df[col] = self.df[col] / col_max
        print(normalize_data)

    def __len__(self):
        """Number of rows left after dropping incomplete rows."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row at position `index`.

        `features` is the row without the ``stroke`` and ``id`` entries and
        `label` is the row's ``stroke`` value, each passed through its
        transform when one was supplied.
        """
        # https://stackoverflow.com/a/29763653
        features = self.df.iloc[index].drop(['stroke', 'id'])
        label = self.df['stroke'].iloc[index]

        if self.feature_transform:
            features = self.feature_transform(features)
        if self.label_transform:
            label = self.label_transform(label)
        return features, label

In [None]:
import os
import pandas as pd
import time
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split

DATASET_SPLIT = 0.8 # n% training data, 1 - n% testing data

LEARNING_RATE = 0.01
BATCH_SIZE = 128
EPOCHS = 100
NORMALIZE = True
SEED = 42

# Seed torch's global RNG so everything that draws from it afterwards (weight
# initialisation, dataset splitting, batch shuffling) is repeatable.
torch.manual_seed(SEED)

start_time = time.time()

# Load dataset

dataset = StrokeDataset(
    balanced_data_nn,
    normalize=NORMALIZE,
    # Convert the pandas row to a float32 tensor through an explicit NumPy
    # array instead of relying on torch treating a Series as a generic sequence.
    feature_transform=lambda feature: torch.tensor(feature.to_numpy(dtype='float32')),
    # Labels become 1-element tensors so a batch has shape (batch, 1),
    # matching the model output.
    label_transform=lambda label: torch.reshape(torch.tensor(label, dtype=torch.float32), (-1,))
)

# Number of samples that go to the training split.
split_pos = int(len(dataset) * DATASET_SPLIT)


In [None]:
# Split with a dedicated, seeded generator so the train/test membership is the
# same on every run.
train_dataset, test_dataset = random_split(
    dataset,
    [split_pos, len(dataset) - split_pos],
    generator=torch.Generator().manual_seed(42),
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# The test loader must NOT shuffle. Predictions and true labels are collected
# in two separate passes over this loader further down; with shuffling each
# pass sees a different order, so the two lists would not line up and every
# metric computed from them would be meaningless.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [None]:
# Build model
# Fully connected network: 10 input features -> 16 -> 32 -> 16 -> 1 output.
model = nn.Sequential(
    nn.Linear(10, 16),
    nn.ReLU(),
    nn.Linear(16, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    # Sigmoid keeps the output in [0, 1], which suits the 0/1 targets and the
    # 0.5 decision threshold. A ReLU here can drive the single output unit to
    # a constant 0 with zero gradient, after which training cannot recover.
    nn.Sigmoid(),
)

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

def train_loop():
    """Run one optimisation pass over `train_dataloader`.

    Uses the module-level `model`, `loss_fn` and `optimizer`; prints the
    batch loss for every 30th batch, starting with the first.
    """
    total_samples = len(train_dataloader.dataset)
    for step, (features, labels) in enumerate(train_dataloader):
        batch_loss = loss_fn(model(features), labels)

        # Standard update: clear old gradients, backpropagate, apply the step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 30 != 0:
            continue
        seen = step * BATCH_SIZE
        print(f'Training MSE Loss: {batch_loss.item():>7f} [{seen:>4d}/{total_samples:>4d}]')

def test_loop():
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in test_dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

            pred_answers = (pred > 0.5).float() # 0 or 1 only
            correct += torch.sum(torch.logical_not(torch.logical_xor(pred_answers, y))).item()

    avg_loss = test_loss / num_batches
    accuracy = correct / size
    print(f'Test Accuracy: {100 * accuracy:>0.1f}%')
    print(f'Test Average loss: {avg_loss:>8f}')

# One training pass followed by one evaluation pass per epoch.
# Interrupting the kernel stops training early without a traceback.
separator = '-' * 20

try:
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}')
        print(separator)
        train_loop()
        test_loop()

    print('Finished')

except KeyboardInterrupt:
    print('Finished early')

In [None]:
# Hard 0/1 predictions for the test split, in loader order.
# NOTE: the true labels are gathered in a separate pass over `test_dataloader`,
# so the loader must yield the same order on every pass for the two lists to
# line up.
predicted_labels = []
# Inference only: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for inputs, _ in test_dataloader:
        # Forward pass, then threshold the whole batch at once.
        outputs = model(inputs)
        predicted_labels.extend((outputs >= 0.5).int().flatten().tolist())

predicted_df = pd.DataFrame({'Predicted Labels': predicted_labels})

In [None]:
# Ground-truth labels of the test split as plain ints, in loader order.
true_labels = []
for _, labels in test_dataloader:
    true_labels += [int(label) for label in labels.flatten().tolist()]
true_df = pd.DataFrame({'True Labels': true_labels})

In [None]:
# Accuracy / precision / recall / F1 of the network's test-split predictions.
print_4_metrics(true_df, predicted_df)

In [None]:
# Raw label lists, for a quick visual comparison.
print(true_labels)
print(predicted_labels)
# Class names follow the matrix order: label 0 (no stroke) first, label 1
# (stroke) second. They were previously reversed, mislabelling both axes.
draw_confusion_matrix(true_df, predicted_df, ['No Stroke', 'Stroke'])

In [None]:
# Raw confusion-matrix counts for the network's test-split predictions.
cm = confusion_matrix(true_df, predicted_df)
print(cm)

In [None]:
# Same metrics call as a few cells above, repeated after the confusion matrix.
print_4_metrics(true_df, predicted_df)