<a href="https://colab.research.google.com/github/Belkatux/Stroke-Prediction/blob/main/Stroke_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import relevant libraries and data for the project.

*   Pandas and Numpy - Data wrangling and treatment;
*   Matplotlib and Seaborn - Data visualization;
*   Sklearn - Machine Learning models (Logistic Regression, Random Forest)





In [18]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from collections import Counter
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import KNNImputer

pd.options.mode.chained_assignment = None

In [3]:
#CSV
url = 'https://raw.githubusercontent.com/Belkatux/Stroke-Prediction/main/Stroke_Prediction.csv'
dataset = pd.read_csv(url)
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [None]:
#Analysis
sex = dataset['gender'].value_counts()
totalsex = dataset['gender'].count()

sex = round(sex/totalsex * 100, 2)
sex #More women than men in the dataset

Female    58.59
Male      41.39
Other      0.02
Name: gender, dtype: float64

##Analysis of the number of positives and negatives.

Highly unbalanced dataset: This indicates that accuracy will not be a good metric to evaluate our trained algorithm since by just guessing all negatives, we would obtain a high accuracy but would not correctly predict new positive cases.

This unbalance will also influence how we train our algorithm.

In [4]:
stroke = dataset['stroke'].value_counts()
totalstroke = dataset['stroke'].count()

stroke = round(stroke/totalstroke * 100, 2)
stroke

0    95.13
1     4.87
Name: stroke, dtype: float64

In [5]:
#Number of NaN
dataset.isna().sum().to_frame(name="Null count")

Unnamed: 0,Null count
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


##Treatment of features.

Separating features into 'Categorical' and 'Numerical' given their type and 'Target'.

In [6]:
# features
features = ['gender', 'age', 'hypertension', 'heart_disease',
            'ever_married','work_type','Residence_type','avg_glucose_level',
            'bmi','smoking_status']

#target
target = 'stroke'

numerical_features = ['age', 'avg_glucose_level', 'bmi']

categorical_features = ['gender', 'hypertension', 'heart_disease',
                        'ever_married', 'work_type', 'Residence_type', 
                        'smoking_status']

In [7]:
# Converting features into required datatypes
dataset[numerical_features] = dataset[numerical_features].astype(np.float64)

dataset[categorical_features] = dataset[categorical_features].astype('category')

# data types
dataset[features+[target]].dtypes.to_frame(name="Data type")

Unnamed: 0,Data type
gender,category
age,float64
hypertension,category
heart_disease,category
ever_married,category
work_type,category
Residence_type,category
avg_glucose_level,float64
bmi,float64
smoking_status,category


## Transform categorical features.

Categorical data needs to be converted since machine learning algorithms usually cannot operate on label data directly. So, in order to work with this data, we use One-Hot Encoding to convert the categorical data to a numerical form. 



In [19]:
#Train-Test split
train, test = train_test_split(dataset, test_size=0.33, stratify=dataset.stroke ,random_state=42)

imputer = KNNImputer(n_neighbors = 5)

train[numerical_features] = imputer.fit_transform(train[numerical_features])
test[numerical_features] = imputer.transform(test[numerical_features])

#OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(train[categorical_features])
cols = encoder.get_feature_names(categorical_features)
train.loc[:, cols] = encoder.transform(train[categorical_features])
test.loc[:, cols] = encoder.transform(test[categorical_features])

train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_Male,gender_Other,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
469,5835,Male,68.0,0,0,Yes,Private,Urban,92.21,27.3,Unknown,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1705,29104,Female,19.0,0,0,No,Private,Urban,110.7,38.5,never smoked,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3674,26539,Male,69.0,0,0,Yes,Self-employed,Urban,202.51,30.8,formerly smoked,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4181,42760,Female,27.0,0,0,Yes,Private,Urban,57.46,23.0,smokes,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3447,24355,Female,1.88,0,0,No,children,Rural,97.26,16.7,Unknown,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
train.drop(categorical_features, axis=1, inplace=True)
test.drop(categorical_features, axis=1, inplace=True)

##Scaling of the numerical features.

Some algorithms need that all numerical features are in the same range in order to properly be trained. For instance, this is specially tru for the K Nearest Neighbors algorithm since it is a distance based algorithm.

In [21]:
scaler = StandardScaler()
scaler.fit(train[numerical_features])

train.loc[:, numerical_features] = scaler.transform(train[numerical_features])
test.loc[:, numerical_features] = scaler.transform(test[numerical_features])

train.head()

Unnamed: 0,id,age,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
469,5835,1.103695,-0.303031,-0.212188,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1705,29104,-1.081359,0.109946,1.217999,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3674,26539,1.148288,2.160533,0.234745,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4181,42760,-0.724616,-1.079176,-0.761278,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3447,24355,-1.84479,-0.190238,-1.565758,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Define train and test data.

In [22]:
# Inputs and Target 
X_train = train.drop(target, axis=1)
y_train = train[target]


X_test = test.drop(target, axis=1)
y_test = test[target]

## Fitting the model to the transformed data.

Here, we used Random Forest with Bagging in order to obtain a more precise result while avoiding overfitting.

In [26]:
model = BaggingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1597    8]
 [  78    4]]


## F1 Score

Here, we make use of the F1 score to measure the success of our trained algorithm since we agreed that accuracy would not be a good metric to do so.

The F1 score is highly advised for cases where there is unbalanced data since and we want to measure our success of obtaining positives while minimizing our false negative count.

In [27]:
score = f1_score(y_test, y_pred)
print(f'The score is {score*100:.2f}%')

The score is 8.51%
