### Module Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

### Dataset Definition

In [None]:
# Read the stroke dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='stroke_dataset.csv')
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


### Stratification of the Dataset

In [None]:
# Separate the target column ('stroke') from the predictor columns.
y = df['stroke']
X = df.drop(columns='stroke')

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# stroke rate comparable in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Class counts for the full label vector, then for the train and test splits.
for labels in (y, y_train, y_test):
    print(Counter(labels))

Counter({0: 4733, 1: 248})
Counter({0: 3786, 1: 198})
Counter({0: 947, 1: 50})


In [None]:
# Share of positive (stroke == 1) labels in the full set and in each split.
# The counts are derived from the label series themselves instead of being
# hand-copied from the previous cell's output, so this check cannot go stale
# when the dataset, the split ratio, or the seed changes.
for labels in (y, y_train, y_test):
    print(Counter(labels)[1] / len(labels))

0.049789198956032926
0.04969879518072289
0.05015045135406219


In [None]:
# Re-attach the target to the held-out features, side by side on their row index.
data_test = pd.concat(objs=[X_test, y_test], axis='columns')

In [None]:
# Write the held-out split to disk; with pandas defaults the row index is
# written as the first CSV column.
data_test.to_csv(path_or_buf='test.csv')

In [None]:
# Distribution of smoking status in the training features before imputation.
X_train['smoking_status'].value_counts()

never smoked       1455
Unknown            1197
formerly smoked     712
smokes              620
Name: smoking_status, dtype: int64

### Null Imputation

In [None]:
def impute_smokers_age(X_train, min_age=12):
    """Fill in unknown smoking statuses and impute remaining missing values.

    Rows whose ``smoking_status`` is ``'Unknown'`` and whose ``age`` is at
    most ``min_age`` are relabelled ``'never smoked'``. Every remaining
    ``'Unknown'`` is converted to NaN, and the whole frame is then passed
    through ``SimpleImputer(strategy='most_frequent')``, which fills missing
    entries in each column with that column's most frequent value.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame containing ``age`` and ``smoking_status`` columns.
        It is not modified; the function works on a copy.
    min_age : int or float, default 12
        Inclusive age threshold at or below which an unknown smoking status
        is treated as ``'never smoked'``.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``X_train`` carrying the same row index and, when
        the imputer keeps every column, the same column labels.
    """
    # Work on a copy: the caller's frame (typically a slice produced by a
    # train/test split) must not be altered as a side effect.
    features = X_train.copy()

    is_unknown = features['smoking_status'] == 'Unknown'
    is_young = features['age'] <= min_age
    features.loc[is_unknown & is_young, 'smoking_status'] = 'never smoked'

    # Whatever is still 'Unknown' becomes a real missing value for the imputer.
    features.loc[features['smoking_status'] == 'Unknown', 'smoking_status'] = np.nan

    imputer = SimpleImputer(strategy='most_frequent')
    imputed = imputer.fit_transform(features)

    # Keep the original labels so the result stays aligned with the caller's
    # target series and columns can still be addressed by name. If the imputer
    # returned fewer columns than it was given, fall back to positional labels
    # rather than mislabel them.
    columns = features.columns if imputed.shape[1] == features.shape[1] else None
    result = pd.DataFrame(imputed, index=features.index, columns=columns)

    # The imputer's output for a mixed-type frame may come back as generic
    # objects; let pandas restore numeric dtypes where possible.
    return result.infer_objects()

In [None]:
# Replace the training features with the output of the imputation step.
X_train = impute_smokers_age(X_train=X_train)

In [None]:
# Value distribution of the column at position 9 after imputation
# (per the rendered output this is the smoking-status column).
X_train[X_train.columns[9]].value_counts()

never smoked       2652
formerly smoked     712
smokes              620
Name: 9, dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=af12788f-aecc-4989-a302-f8b336f386d1' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>