In [1]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
stroke_df = pd.read_csv('data/stroke.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
print(pd.isna(stroke_df).sum())  # checking missing values

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [5]:
stroke_df['bmi'].fillna(stroke_df['bmi'].mean(), inplace=True)  # replace missing values with mean

In [6]:
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
for column in stroke_df.columns:
    if stroke_df[column].dtype == object:
        print(f'{column}:{stroke_df[column].unique()}')

gender:['Male' 'Female' 'Other']
ever_married:['Yes' 'No']
work_type:['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type:['Urban' 'Rural']
smoking_status:['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [8]:
print(stroke_df['gender'].value_counts())
stroke_df = stroke_df[stroke_df['gender']!='Other']

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64


In [9]:
mapping = {}
for column in stroke_df.columns:
    if stroke_df[column].dtype == object:
        mapping[column] = stroke_df[column].unique()
        stroke_df[column] = stroke_df[column].replace(mapping[column],range(0,len(mapping[column])))
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,0,0,0,228.69,36.6,0,1
1,51676,1,61.0,0,0,0,1,1,202.21,28.893237,1,1
2,31112,0,80.0,0,1,0,0,1,105.92,32.5,1,1
3,60182,1,49.0,0,0,0,0,0,171.23,34.4,2,1
4,1665,1,79.0,1,0,0,1,1,174.12,24.0,1,1


In [10]:
X = stroke_df.drop(["stroke"], axis=1)
y = stroke_df["stroke"]
X.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,0,67.0,0,1,0,0,0,228.69,36.6,0
1,51676,1,61.0,0,0,0,1,1,202.21,28.893237,1
2,31112,0,80.0,0,1,0,0,1,105.92,32.5,1
3,60182,1,49.0,0,0,0,0,0,171.23,34.4,2
4,1665,1,79.0,1,0,0,1,1,174.12,24.0,1


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42,
    n_estimators=30,
    criterion="gini",
    max_depth=5,
    warm_start=False,
    oob_score=True,
    class_weight=None,
)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(y_pred[0])

[0.99604742 0.00395258]


In [13]:
y_pred_df = pd.DataFrame(data = y_pred, columns=['0', '1'])

y_pred_df.head()

Unnamed: 0,0,1
0,0.996047,0.003953
1,0.980889,0.019111
2,0.971463,0.028537
3,0.779024,0.220976
4,0.8713,0.1287


In [14]:
print(classification_report(model.predict(X_val), y_val))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1022
           1       0.00      0.00      0.00         0

    accuracy                           0.96      1022
   macro avg       0.50      0.48      0.49      1022
weighted avg       1.00      0.96      0.98      1022



In [15]:
pickle.dump(model, open('models/stroke_model', 'wb'))