# Exercise 1 B

## Import Necessary Libraries

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
print("Setup Complete")

Setup Complete


In [68]:
# Path of the file to read
stroke_filepath = "../datasets/healthcare-dataset-stroke-data.csv"

# Read the file into a variable stroke_data
stroke_data = pd.read_csv(stroke_filepath, index_col="id")

# Set seed for reproducibility
np.random.seed(0)

## Handling Missing Values

In [69]:
stroke_data = stroke_data.sort_index()
stroke_data

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
67,Female,17.0,0,0,No,Private,Urban,92.97,,formerly smoked,0
77,Female,13.0,0,0,No,children,Rural,85.81,18.6,Unknown,0
84,Male,55.0,0,0,Yes,Private,Urban,89.17,31.5,never smoked,0
91,Female,42.0,0,0,No,Private,Urban,98.53,18.5,never smoked,0
99,Female,31.0,0,0,No,Private,Urban,108.89,52.3,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
72911,Female,57.0,1,0,Yes,Private,Rural,129.54,60.9,smokes,0
72914,Female,19.0,0,0,No,Private,Urban,90.57,24.2,Unknown,0
72915,Female,45.0,0,0,Yes,Private,Urban,172.33,45.3,formerly smoked,0
72918,Female,53.0,1,0,Yes,Private,Urban,62.55,30.3,Unknown,1


In [70]:
# Categorical columns
cat_col = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke']

# Numerical columns
num_col = ['age', 'avg_glucose_level', 'bmi']

In [71]:
# Treat "Unknown" smoking_status as missing data
# Replace "Unknown" smoking_status with NaN value
stroke_data['smoking_status'].replace('Unknown', np.nan, inplace=True)

In [72]:
# Check if there are missing data
stroke_data.isnull().sum()

gender                  0
age                     0
hypertension            0
heart_disease           0
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64

In [73]:
# Apply Lable Encoding in categorical data
le = LabelEncoder()

le_stroke_data = stroke_data[cat_col].apply(lambda series: pd.Series(
    le.fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))

### Drop Columns With Missing Values

In [74]:
# Drop columns with missing data
dropped_columns_data = stroke_data.drop(["bmi", "smoking_status"], axis=1)

In [75]:
# Replace label encoded data with original data
dropped_columns_data = dropped_columns_data.drop(cat_col, axis=1, errors="ignore")
dropped_columns_data = pd.concat([le_stroke_data.drop(["smoking_status"], axis=1), dropped_columns_data], axis=1)

In [76]:
dropped_columns_data

Unnamed: 0_level_0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,stroke,age,avg_glucose_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
67,0,0,0,0,2,1,0,17.0,92.97
77,0,0,0,0,4,0,0,13.0,85.81
84,1,0,0,1,2,1,0,55.0,89.17
91,0,0,0,0,2,1,0,42.0,98.53
99,0,0,0,0,2,1,0,31.0,108.89
...,...,...,...,...,...,...,...,...,...
72911,0,1,0,1,2,0,0,57.0,129.54
72914,0,0,0,0,2,1,0,19.0,90.57
72915,0,0,0,1,2,1,0,45.0,172.33
72918,0,1,0,1,2,1,1,53.0,62.55


### Fill Missing Values With Column Mean

In [77]:
# Drop smoking_status column since mean method cannot be applied in categorical data
fill_mean_data = stroke_data.drop(["smoking_status"], axis=1)

In [78]:
# Fill missing numerical data with mean value
fill_mean_data = stroke_data.fillna(stroke_data.mean())

In [79]:
# Replace label encoded data with original data
fill_mean_data = fill_mean_data.drop(cat_col, axis=1, errors="ignore")
fill_mean_data = pd.concat([le_stroke_data.drop(["smoking_status"], axis=1), fill_mean_data], axis=1)

In [80]:
fill_mean_data

Unnamed: 0_level_0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,stroke,age,avg_glucose_level,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
67,0,0,0,0,2,1,0,17.0,92.97,28.893237
77,0,0,0,0,4,0,0,13.0,85.81,18.600000
84,1,0,0,1,2,1,0,55.0,89.17,31.500000
91,0,0,0,0,2,1,0,42.0,98.53,18.500000
99,0,0,0,0,2,1,0,31.0,108.89,52.300000
...,...,...,...,...,...,...,...,...,...,...
72911,0,1,0,1,2,0,0,57.0,129.54,60.900000
72914,0,0,0,0,2,1,0,19.0,90.57,24.200000
72915,0,0,0,1,2,1,0,45.0,172.33,45.300000
72918,0,1,0,1,2,1,1,53.0,62.55,30.300000


### Fill Missing Values With Linear Regression

In [81]:
# Drop smoking_status column since linear interpolation method cannot be applied in categorical data
lin_reg_data = stroke_data.drop(["smoking_status"], axis=1)

In [82]:
# Fill missing data with linear interpolation method
lin_reg_data["bmi"].interpolate(method="linear", inplace=True, limit_direction="both")

In [83]:
# Replace label encoded data with original data
lin_reg_data = lin_reg_data.drop(cat_col, axis=1, errors="ignore")
lin_reg_data = pd.concat([le_stroke_data.drop(["smoking_status"], axis=1), lin_reg_data], axis=1)

In [84]:
lin_reg_data

Unnamed: 0_level_0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,stroke,age,avg_glucose_level,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
67,0,0,0,0,2,1,0,17.0,92.97,18.6
77,0,0,0,0,4,0,0,13.0,85.81,18.6
84,1,0,0,1,2,1,0,55.0,89.17,31.5
91,0,0,0,0,2,1,0,42.0,98.53,18.5
99,0,0,0,0,2,1,0,31.0,108.89,52.3
...,...,...,...,...,...,...,...,...,...,...
72911,0,1,0,1,2,0,0,57.0,129.54,60.9
72914,0,0,0,0,2,1,0,19.0,90.57,24.2
72915,0,0,0,1,2,1,0,45.0,172.33,45.3
72918,0,1,0,1,2,1,1,53.0,62.55,30.3


### Fill Missing Values With k-Nearest Neighbors

In [85]:
# Drop bmi column since knn method should not be applied in categorical data
knn_data = stroke_data.drop(["bmi"], axis=1)

In [86]:
# Fill missing values with knn imputer
imputer = KNNImputer(n_neighbors=2)
le_knn_imp_np_array = imputer.fit_transform(le_stroke_data)

In [87]:
# Convert numpy array to dataframe
le_knn_imp = pd.DataFrame(data=le_knn_imp_np_array, index=le_stroke_data.index, columns=le_stroke_data.columns)

# Convert float64 to int64
le_knn_imp = le_knn_imp.astype(np.int64)

In [88]:
# Replace label encoded data with original data
knn_data = knn_data.drop(cat_col, axis=1, errors="ignore")
knn_data = pd.concat([le_knn_imp, knn_data], axis=1)

In [89]:
knn_data

Unnamed: 0_level_0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,stroke,age,avg_glucose_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
67,0,0,0,0,2,1,0,0,17.0,92.97
77,0,0,0,0,4,0,1,0,13.0,85.81
84,1,0,0,1,2,1,1,0,55.0,89.17
91,0,0,0,0,2,1,1,0,42.0,98.53
99,0,0,0,0,2,1,0,0,31.0,108.89
...,...,...,...,...,...,...,...,...,...,...
72911,0,1,0,1,2,0,2,0,57.0,129.54
72914,0,0,0,0,2,1,0,0,19.0,90.57
72915,0,0,0,1,2,1,0,0,45.0,172.33
72918,0,1,0,1,2,1,1,1,53.0,62.55


### Fill Missing Values With Linear Regression and k-Nearest Neighbors

In [90]:
# Add bmi column from linear regression to knn data
lin_reg_knn_data = pd.concat([knn_data, lin_reg_data["bmi"]], axis=1)

In [91]:
lin_reg_knn_data

Unnamed: 0_level_0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,stroke,age,avg_glucose_level,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
67,0,0,0,0,2,1,0,0,17.0,92.97,18.6
77,0,0,0,0,4,0,1,0,13.0,85.81,18.6
84,1,0,0,1,2,1,1,0,55.0,89.17,31.5
91,0,0,0,0,2,1,1,0,42.0,98.53,18.5
99,0,0,0,0,2,1,0,0,31.0,108.89,52.3
...,...,...,...,...,...,...,...,...,...,...,...
72911,0,1,0,1,2,0,2,0,57.0,129.54,60.9
72914,0,0,0,0,2,1,0,0,19.0,90.57,24.2
72915,0,0,0,1,2,1,0,0,45.0,172.33,45.3
72918,0,1,0,1,2,1,1,1,53.0,62.55,30.3


## Create And Train The Models

In [92]:
# Separate target from predictors
X = dropped_columns_data.drop(["stroke"], axis=1)
y = dropped_columns_data["stroke"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Classifier
clf = RandomForestClassifier(random_state=0)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       974
           1       0.17      0.02      0.04        48

    accuracy                           0.95      1022
   macro avg       0.56      0.51      0.51      1022
weighted avg       0.92      0.95      0.93      1022



In [93]:
# Separate target from predictors
X = fill_mean_data.drop(["stroke"], axis=1)
y = fill_mean_data["stroke"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Classifier
clf = RandomForestClassifier(random_state=0)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       974
           1       0.33      0.02      0.04        48

    accuracy                           0.95      1022
   macro avg       0.64      0.51      0.51      1022
weighted avg       0.92      0.95      0.93      1022



In [94]:
# Separate target from predictors
X = lin_reg_data.drop(["stroke"], axis=1)
y = lin_reg_data["stroke"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Classifier
clf = RandomForestClassifier(random_state=0)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       974
           1       1.00      0.00      0.00        48

    accuracy                           0.95      1022
   macro avg       0.98      0.50      0.49      1022
weighted avg       0.96      0.95      0.93      1022



In [95]:
# Separate target from predictors
X = knn_data.drop(["stroke"], axis=1)
y = knn_data["stroke"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Classifier
clf = RandomForestClassifier(random_state=0)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       974
           1       0.50      0.02      0.04        48

    accuracy                           0.95      1022
   macro avg       0.73      0.51      0.51      1022
weighted avg       0.93      0.95      0.93      1022



In [96]:
# Separate target from predictors
X = lin_reg_knn_data.drop(["stroke"], axis=1)
y = lin_reg_knn_data["stroke"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Classifier
clf = RandomForestClassifier(random_state=0)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       974
           1       0.00      0.00      0.00        48

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.91      0.95      0.93      1022

