This file contains the code for the following steps:
- downloading the dataset from Kaggle;
- removing unnecessary columns and rows with missing values;
- balancing the values of the target variable;
- binarization of features (4 strategies).

In [1]:
# To comply with the code style
%load_ext jupyter_black

### Libraries

In [2]:
import os
import time

import numpy as np
import pandas as pd

# Dataset downloader
import opendatasets as od

# Preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Visualization
import matplotlib.pyplot as plt
import seaborn as sb

plt.rcParams["figure.facecolor"] = (1, 1, 1, 1)

In [3]:
# Single random seed passed to resample() and train_test_split() below
# so that downsampling and splitting are reproducible.
SEED = 42

# Preprocessing

### Dataset import

In [4]:
# Fetch the Kaggle stroke-prediction dataset into ./stroke-prediction-dataset.
# The download is skipped when the files are already present (see the output below).
od.download("https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset")

Skipping, found downloaded files in ".\stroke-prediction-dataset" (use force=True to force download)


In [5]:
# Load the raw CSV from the folder created by the download step.
df = pd.read_csv("stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
# Quick sanity check: dimensions first, then the first five rows.
print(df.shape)
df.head()

(5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# Keep only rows without missing values in any column.
df = df.dropna()

In [7]:
# Drop the "id" column (treated as unnecessary for the analysis).
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=["id"], errors="ignore")

### Balancing classes

In [8]:
# Count rows per target class to see how imbalanced the data is.
df["stroke"].value_counts()

stroke
0    4700
1     209
Name: count, dtype: int64

There is a **huge** imbalance. If left like this, the model will tend to always predict the majority class. Let's downsample the majority class to 500 rows, which brings the stroke / no-stroke ratio to roughly 1 to 2.4 (209 vs. 500).

In [9]:
# Split by target value: class_a holds the stroke rows, class_b the non-stroke majority.
class_a = df.loc[df["stroke"] == 1]
class_b = df.loc[df["stroke"] == 0]

# Downsample the majority class without replacement; SEED keeps the draw reproducible.
class_b_downsampled = resample(
    class_b,
    n_samples=500,
    replace=False,
    random_state=SEED,
)

df_balanced = pd.concat([class_a, class_b_downsampled], axis=0)

# Let's check the class distribution
print(df_balanced["stroke"].value_counts())

stroke
0    500
1    209
Name: count, dtype: int64


In [10]:
# Summary statistics of the numeric columns after balancing.
df_balanced.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,709.0,709.0,709.0,709.0,709.0,709.0
mean,47.928406,0.132581,0.090268,113.162454,29.10268,0.294781
std,23.678472,0.339361,0.286768,52.493273,7.808042,0.456266
min,0.24,0.0,0.0,55.12,12.3,0.0
25%,30.0,0.0,0.0,77.52,23.9,0.0
50%,51.0,0.0,0.0,93.02,28.1,0.0
75%,69.0,0.0,0.0,125.2,32.9,1.0
max,82.0,1.0,1.0,271.74,66.8,1.0


In [11]:
# Pairwise correlations between the numeric columns only
# (non-numeric columns are skipped via numeric_only=True).
df_balanced.corr(numeric_only=True)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
age,1.0,0.303512,0.294454,0.316009,0.321822,0.540588
hypertension,0.303512,1.0,0.12358,0.25751,0.131474,0.294552
heart_disease,0.294454,0.12358,1.0,0.2239,0.072813,0.228139
avg_glucose_level,0.316009,0.25751,0.2239,1.0,0.288076,0.263868
bmi,0.321822,0.131474,0.072813,0.288076,1.0,0.113405
stroke,0.540588,0.294552,0.228139,0.263868,0.113405,1.0


In [12]:
# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs("datasets", exist_ok=True)
df_balanced.to_csv(
    "datasets/balanced.csv"
)  # Other files will now use only this version of the dataset.

## Binarizing the data

### Helper functions

In [13]:
def ordinal(df, features, boundaries, style=">="):
    """Binarize numeric features by comparing them against thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list
        Names of the numeric columns to binarize. These columns are removed
        from the result.
    boundaries : list of lists
        ``boundaries[i]`` holds the thresholds applied to ``features[i]``.
    style : str, default ">="
        ``">="`` adds one ``<feature>_ge_<threshold>`` column per threshold,
        ``"<="`` adds one ``<feature>_le_<threshold>`` column per threshold,
        and any other value (e.g. ``">=<="``) adds both (inter-ordinal).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original features replaced by boolean columns.
    """
    # Anything that is not exactly ">=" or "<=" falls through to inter-ordinal.
    want_ge = style != "<="
    want_le = style != ">="

    binarized = df.drop(columns=features)
    for position, feature in enumerate(features):
        values = df[feature]
        for threshold in boundaries[position]:
            # XGBoost requires feature names without symbols from ' [],<',
            # hence the textual "_ge_" / "_le_" markers instead of ">=" / "<=".
            if want_ge:
                binarized[f"{feature}_ge_{threshold}"] = values >= threshold
            if want_le:
                binarized[f"{feature}_le_{threshold}"] = values <= threshold

    return binarized

In [14]:
def nominal(df, features):
    """One-hot encode categorical features using the bare category as column name.

    Each column listed in ``features`` is replaced by one indicator column per
    category; the new columns are named after the category value alone (no
    feature prefix). The input frame is not modified.

    NOTE(review): because no prefix is used, two features that share a category
    label would produce duplicate column names.
    """
    indicator_blocks = [
        pd.get_dummies(df[feature], prefix="", prefix_sep="") for feature in features
    ]
    widened = pd.concat([df.copy(), *indicator_blocks], axis=1)
    return widened.drop(columns=features)

In [15]:
def dichotomic(df, features):
    """One-hot encode features into ``<feature>_<value>`` indicator columns.

    Each column listed in ``features`` is replaced by one indicator column per
    distinct value, named with the feature as prefix (e.g. ``hypertension_0``
    and ``hypertension_1``). The input frame is not modified.
    """
    pieces = [df.copy()]
    pieces.extend(pd.get_dummies(df[name], prefix=str(name)) for name in features)
    return pd.concat(pieces, axis=1).drop(columns=features)

### First strategy: seemingly logical

Let's think about what kind of relationship is assumed based on logical reasoning:

- The older a person is, the more likely they are to have a stroke
- The higher the blood glucose level, the higher the probability of diabetes mellitus, the greater the probability of stroke
- Sources differ, but a commonly cited figure is that each 5-point increase in BMI raises the risk of stroke by about 21%

Since all three features are assumed to be positively related to the target variable, it seems logical and sufficient to use ordinal scaling with `>=` thresholds.

In [16]:
# Strategy 1: ">=" thresholds for the numeric features, then one-hot encoding
# of the categorical ones (bare category names) and of the yes/no-style ones
# (feature-prefixed names).
df_bin_strategy1 = (
    ordinal(
        df_balanced,
        ["age", "avg_glucose_level", "bmi"],
        [
            [25, 35, 45, 55, 60, 65, 70, 75, 80, 85],
            [70, 90, 95, 100, 110, 115, 120, 125, 130, 135],
            [18, 20, 25, 30, 35, 40],
        ],
        style=">=",
    )
    .pipe(nominal, ["gender", "work_type", "Residence_type", "smoking_status"])
    .pipe(dichotomic, ["hypertension", "heart_disease", "ever_married"])
)

In [17]:
# Correlation matrix of all binarized columns; only the "stroke" column is inspected below.
corr = df_bin_strategy1.corr()

In [18]:
# Features whose absolute correlation with the target is at least 0.5.
corr.loc[corr["stroke"].abs() >= 0.5, "stroke"].sort_values()

age_ge_55    0.509404
age_ge_45    0.510136
stroke       1.000000
Name: stroke, dtype: float64

The most important features: age >= 45 and age >= 55.

In [19]:
# Features whose absolute correlation with the target lies in [0.3, 0.5).
corr.loc[
    corr["stroke"].abs().between(0.3, 0.5, inclusive="left"), "stroke"
].sort_values()

age_ge_25    0.325901
age_ge_75    0.389185
age_ge_35    0.414506
age_ge_70    0.436932
age_ge_60    0.468620
age_ge_65    0.479041
Name: stroke, dtype: float64

These can also be considered important; again, they are all about age.

In [20]:
# Features whose absolute correlation with the target lies in [0.2, 0.3).
corr.loc[
    corr["stroke"].abs().between(0.2, 0.3, inclusive="left"), "stroke"
].sort_values()

hypertension_0             -0.294552
ever_married_No            -0.273364
heart_disease_0            -0.228139
children                   -0.215728
avg_glucose_level_ge_110    0.210654
avg_glucose_level_ge_115    0.226720
avg_glucose_level_ge_120    0.227988
heart_disease_1             0.228139
avg_glucose_level_ge_125    0.246328
avg_glucose_level_ge_135    0.247657
avg_glucose_level_ge_130    0.250138
ever_married_Yes            0.273364
age_ge_80                   0.286938
hypertension_1              0.294552
Name: stroke, dtype: float64

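Apart from age, this band contains ever_married, hypertension, heart_disease, the higher avg_glucose_level thresholds, and the `children` work type.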

In [21]:
# Persist the strategy 1 frame; the "datasets" folder must already exist.
df_bin_strategy1.to_csv("datasets/strategy1.csv")

### Second strategy: inter-ordinal for each numeric

In [22]:
# Strategy 2: same thresholds as strategy 1, but inter-ordinal. Any style other
# than ">=" or "<=" makes ordinal() emit both the _ge_ and the _le_ column per
# threshold.
df_bin_strategy2 = (
    ordinal(
        df_balanced,
        ["age", "avg_glucose_level", "bmi"],
        [
            [25, 35, 45, 55, 60, 65, 70, 75, 80, 85],
            [70, 90, 95, 100, 110, 115, 120, 125, 130, 135],
            [18, 20, 25, 30, 35, 40],
        ],
        style=">=<=",
    )
    .pipe(nominal, ["gender", "work_type", "Residence_type", "smoking_status"])
    .pipe(dichotomic, ["hypertension", "heart_disease", "ever_married"])
)

The correlations of the features shared with the first strategy do not change, so let's go straight to saving the result.

In [23]:
# Persist the strategy 2 frame; the "datasets" folder must already exist.
df_bin_strategy2.to_csv("datasets/strategy2.csv")

### Third strategy: larger intervals for numeric features

I understand that we may lose valuable information, but that is acceptable as part of a learning experiment.

In [24]:
# Strategy 3: inter-ordinal again, but with fewer thresholds per numeric
# feature, i.e. coarser intervals than in strategies 1 and 2.
df_bin_strategy3 = (
    ordinal(
        df_balanced,
        ["age", "avg_glucose_level", "bmi"],
        [
            [25, 45, 55, 60, 65, 70, 75],
            [70, 90, 95, 120, 125, 130],
            [25, 30, 35, 40],
        ],
        style=">=<=",
    )
    .pipe(nominal, ["gender", "work_type", "Residence_type", "smoking_status"])
    .pipe(dichotomic, ["hypertension", "heart_disease", "ever_married"])
)

In [25]:
# Persist the strategy 3 frame; the "datasets" folder must already exist.
df_bin_strategy3.to_csv("datasets/strategy3.csv")

### Fourth strategy: selecting features

In [26]:
# Use the inter-ordinal (strategy 2) frame as the candidate feature pool.
y = df_bin_strategy2["stroke"]
X = df_bin_strategy2.drop(columns=["stroke"])
# Hold out 20% of the rows; feature selection below is fitted on the training part only.
# NOTE(review): the split is not stratified; consider stratify=y given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [27]:
# Score every feature against the target with chi2 and keep the 15 best-scoring ones.
chi2_features = SelectKBest(chi2, k=15)  # best 15 features
# Fit on the training split only; the next cell reads the selected columns
# from the fitted selector.
best_features = chi2_features.fit_transform(X_train, y_train)

In [28]:
# Map the selector's chosen positions back to column names and list them.
feature_indices = chi2_features.get_support(indices=True)
column_names = X_train.columns[feature_indices]
for column_index, column_name in zip(feature_indices, column_names):
    print(f"feature with column_index {column_index} = {column_name}")

feature with column_index 1 = age_le_25
feature with column_index 3 = age_le_35
feature with column_index 4 = age_ge_45
feature with column_index 5 = age_le_45
feature with column_index 6 = age_ge_55
feature with column_index 7 = age_le_55
feature with column_index 8 = age_ge_60
feature with column_index 9 = age_le_60
feature with column_index 10 = age_ge_65
feature with column_index 11 = age_le_65
feature with column_index 12 = age_ge_70
feature with column_index 14 = age_ge_75
feature with column_index 16 = age_ge_80
feature with column_index 38 = avg_glucose_level_ge_135
feature with column_index 66 = hypertension_1


In [29]:
# Selected feature columns followed by the target column.
lst = [*column_names, "stroke"]

In [30]:
# Strategy 4: the strategy 2 frame restricted to the selected features plus the target.
df_bin_strategy4 = df_bin_strategy2[lst]

In [31]:
# Persist the strategy 4 frame; the "datasets" folder must already exist.
df_bin_strategy4.to_csv("datasets/strategy4.csv")

## Preparing index for ConceptLattice

It makes more sense to run this block and the next one each time right before fitting the model.

In [32]:
def prepare_index(df):
    """Return a copy of ``df`` indexed by its row positions as strings.

    The new index is ``"0", "1", ..., str(len(df) - 1)``; the previous index is
    discarded and the input frame is left untouched.
    """
    # Pass an array rather than a list: set_index would read a list of strings
    # as column names to look up.
    positional_labels = np.arange(len(df)).astype(str)
    return df.set_index(positional_labels)

In [33]:
# Replace each strategy frame's index with string row positions ("0", "1", ...).
df_bin_strategy1 = prepare_index(df_bin_strategy1)
df_bin_strategy2 = prepare_index(df_bin_strategy2)
df_bin_strategy3 = prepare_index(df_bin_strategy3)
df_bin_strategy4 = prepare_index(df_bin_strategy4)

In [34]:
# Preview the strategy 1 frame with its new index.
df_bin_strategy1

Unnamed: 0,stroke,age_ge_25,age_ge_35,age_ge_45,age_ge_55,age_ge_60,age_ge_65,age_ge_70,age_ge_75,age_ge_80,...,Unknown,formerly smoked,never smoked,smokes,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_No,ever_married_Yes
0,1,True,True,True,True,True,True,False,False,False,...,False,True,False,False,True,False,False,True,False,True
1,1,True,True,True,True,True,True,True,True,True,...,False,False,True,False,True,False,False,True,False,True
2,1,True,True,True,False,False,False,False,False,False,...,False,False,False,True,True,False,True,False,False,True
3,1,True,True,True,True,True,True,True,True,False,...,False,False,True,False,False,True,True,False,False,True
4,1,True,True,True,True,True,True,True,True,True,...,False,True,False,False,True,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704,0,True,True,True,True,True,True,True,True,False,...,False,True,False,False,False,True,False,True,False,True
705,0,True,True,True,True,True,True,True,True,False,...,False,False,True,False,True,False,False,True,False,True
706,0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,True,False
707,0,True,True,True,True,True,True,True,False,False,...,False,True,False,False,True,False,True,False,False,True


## Splitting the data into train and test sets

In [35]:
# Separate the target column from the binary features of strategy 1.
y = df_bin_strategy1["stroke"]
X = df_bin_strategy1.drop(columns=["stroke"])

In [36]:
# 80/20 train/test split, reproducible through SEED.
# NOTE(review): the split is not stratified; consider stratify=y given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)