In [None]:
# import data
import pandas as pd
import yaml
# Load the project configuration from ../config.yaml.
# Only a missing file is reported as "not found"; any other failure (for
# example malformed YAML) propagates with its own traceback instead of being
# mislabelled. The exception is re-raised so the notebook stops in this cell
# rather than failing later with a NameError on `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

In [None]:
# Display the parsed configuration to confirm it was loaded
config

In [None]:
# Read the raw dataset; its location comes from the loaded configuration
raw_file_path = config["data"]["raw"]["file_raw"]
df = pd.read_csv(raw_file_path)
df

In [None]:
# Data Cleaning

In [None]:
# Inspect the dtype of every column in the loaded frame
df.dtypes

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Impute missing BMI values with the column mean
average_bmi = df["bmi"].mean()
df["bmi"] = df["bmi"].fillna(value=average_bmi)
df

In [None]:
# Re-count missing values per column after the BMI imputation
df.isna().sum()

In [None]:
# Drop "id" because it would disturb model training.
# errors="ignore" keeps this cell idempotent: re-running it after "id" has
# already been removed no longer raises a KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
# EDA

In [None]:
# check numerical values for outliers
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np

# Age distribution for each stroke class
sns.boxplot(data=df, x="stroke", y="age")
plt.show()

In [None]:
# Average glucose level distribution for each stroke class
sns.boxplot(data=df, x="stroke", y="avg_glucose_level")
plt.show()

In [None]:
# BMI distribution for each stroke class
sns.boxplot(data=df, x="stroke", y="bmi")
plt.show()

In [None]:
# Remove age outliers, but only among rows with stroke == 1.
# Rows of every other class are carried over untouched.
category_to_clean = 1
is_target_class = df["stroke"] == category_to_clean
subset = df[is_target_class]

# Tukey fences (1.5 * IQR) computed on the stroke == 1 rows only
Q1, Q3 = subset["age"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

age_in_range = (subset["age"] >= lower_bound) & (subset["age"] <= upper_bound)
cleaned_subset = subset[age_in_range]

# Stack the untouched rows on top of the cleaned ones; the index is rebuilt
other_rows = df[~is_target_class]
df_cleaned = pd.concat([other_rows, cleaned_subset], ignore_index=True)

df_cleaned.info()

In [None]:
# Remove BMI outliers from df_cleaned.
# The fence is derived from the BMI quartiles of `df` and is asymmetric:
# 2 * IQR below Q1 and 6 * IQR above Q3.
Q1 = df['bmi'].quantile(0.25)
Q3 = df['bmi'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 2 * IQR
upper_bound = Q3 + 6 * IQR

# Both conditions must be evaluated on df_cleaned itself. Its index was
# rebuilt by the ignore_index=True concat in the age-outlier step, so a mask
# taken from `df` would be matched by label against unrelated rows.
bmi_in_range = (df_cleaned['bmi'] >= lower_bound) & (df_cleaned['bmi'] <= upper_bound)
df_cleaned = df_cleaned[bmi_in_range]

df_cleaned.info()

In [None]:
# check numerical values for distribution

In [None]:
import statsmodels.api as sm
# Q-Q plot of the cleaned age values (distribution parameters fitted, 45-degree reference line)
age_values = df_cleaned["age"].values
sm.qqplot(age_values, line="45", fit=True)
plt.show()

In [None]:
# Q-Q plot of the cleaned average glucose level values
glucose_values = df_cleaned["avg_glucose_level"].values
sm.qqplot(glucose_values, line="45", fit=True)
plt.show()

In [None]:
# Q-Q plot of the cleaned BMI values
bmi_values = df_cleaned["bmi"].values
sm.qqplot(bmi_values, line="45", fit=True)
plt.show()

In [None]:
# Split dataset

In [None]:
# Separate the "stroke" label from the remaining predictor columns
target = df_cleaned["stroke"]
features = df_cleaned.drop("stroke", axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Keep only the numeric feature columns of each split
X_train_num = X_train.select_dtypes(include="number")
X_test_num = X_test.select_dtypes(include="number")

In [None]:
# Apply a power transform feature-wise to make avg_glucose_level and bmi more Gaussian-like.
# NOTE(review): the transformer is later fitted on X_train_num, i.e. on every
# numeric column, not only on avg_glucose_level and bmi — confirm this is intended.
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()

In [None]:
# Learn the transform parameters from the training split only, then apply
# the same fitted transformer to both splits.
pt.fit(X_train_num)

X_train_trans, X_test_trans = (
    pt.transform(split) for split in (X_train_num, X_test_num)
)

In [None]:
# Re-wrap the transformed training data as a DataFrame, restoring the
# column labels and row index of the untransformed numeric frame
X_train_trans = pd.DataFrame(
    data=X_train_trans,
    index=X_train_num.index,
    columns=X_train_num.columns,
)
X_train_trans.head()

In [None]:
# Re-wrap the transformed test data as a DataFrame, restoring the
# column labels and row index of the untransformed numeric frame
X_test_trans = pd.DataFrame(
    data=X_test_trans,
    index=X_test_num.index,
    columns=X_test_num.columns,
)
X_test_trans.head()

In [None]:
# Q-Q plot of the power-transformed training age values
age_trans_values = X_train_trans["age"].values
sm.qqplot(age_trans_values, line="45", fit=True)
plt.show()

In [None]:
# Q-Q plot of the power-transformed training average glucose level values
glucose_trans_values = X_train_trans["avg_glucose_level"].values
sm.qqplot(glucose_trans_values, line="45", fit=True)
plt.show()

In [None]:
# Q-Q plot of the power-transformed training BMI values
bmi_trans_values = X_train_trans["bmi"].values
sm.qqplot(bmi_trans_values, line="45", fit=True)
plt.show()

In [None]:
# Identify correlations

In [None]:
# TODO: create datasets with targets
# (unfinished note: "X_train_trans and ...")

In [None]:
import scipy.stats as st

# One-way ANOVA on the numeric measurements, grouped by stroke outcome.
# Note: the groups are taken from `df`, not from the outlier-cleaned frame.
anova_columns = ["age", "avg_glucose_level", "bmi"]
has_stroke = df["stroke"] == 1
has_no_stroke = df["stroke"] == 0

df_stroke = df.loc[has_stroke, anova_columns]
df_no_stroke = df.loc[has_no_stroke, anova_columns]

# Hypotheses
# H0: df_stroke = df_no_stroke
# H1: df_stroke != df_no_stroke

# Significance level (defined here, but not used by the call below)
alpha = 0.05

st.f_oneway(df_stroke, df_no_stroke)

In [None]:
# Chi test for target and categorical

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Heatmap of the absolute pairwise correlations in df_cleaned.
# Only numeric columns are correlated: DataFrame.corr() cannot handle string
# columns (recent pandas versions raise instead of silently dropping them),
# so they are excluded with the same select_dtypes("number") filter that is
# used for the model features.
corr = np.abs(df_cleaned.select_dtypes("number").corr())

# Hide the upper triangle and the diagonal (the matrix is symmetric)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(10, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with adjusted annotation font size and mask
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"size": 8},  # Adjust the font size for annotation values
    fmt=".2f"  # Control the format of the annotation values
)

# Customize axis labels for better readability
plt.xticks(fontsize=10, rotation=45, ha="right")  # Adjust x-axis labels
plt.yticks(fontsize=10)  # Adjust y-axis labels

plt.show()