# Healthcare-Dataset-Preprocessing

*`Imports`*

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC, SVC
from sklearn.decomposition import PCA

*`load data`*

In [None]:
# Build the CSV path from the current working directory: the data folder sits one level up
data_dir = os.path.join(os.getcwd(), '..', 'data')
FILE_PATH = os.path.join(data_dir, 'healthcare-dataset-stroke-data.csv')
# Read the raw dataset into a DataFrame
df = pd.read_csv(FILE_PATH)

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

### Dataset Description

- The Stroke Prediction Dataset contains information about patients and various health-related attributes.
- The goal is to predict whether a patient is likely to experience a stroke based on these attributes.

### Problem Definition

- Stroke is one of the leading causes of death and disability. Early prediction of stroke risk can help in taking preventive measures. 
- The aim of this analysis is to build a predictive model that accurately classifies whether a person will experience a stroke based on their medical and demographic data.
- We will start with EDA and preprocessing of the dataset in this notebook.

### EDA

In [None]:
# List the column names as they come from the CSV
df.columns

In [None]:
# modify the column names to be more user-friendly (lower-case snake_case)
# Strip surrounding whitespace FIRST: if the space -> underscore replacement ran first,
# leading/trailing spaces would already be underscores and strip() could no longer remove them.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [None]:
# drop id
# `pop` removes the column from df and hands it back as a Series in one step.
# The Series is kept -> may use later in mapping the predictions back to the original data
id_series = df.pop('id')

In [None]:
# Re-inspect the column names
df.columns

In [None]:
# Quick structural check: dtype and non-null count of every column
df.info()

In [None]:
# Count the nulls per column, then show only the columns that actually have some
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]         # will handle later


In [None]:
# Number of rows that exactly repeat an earlier row
duplicates = df.duplicated(keep='first').sum()
duplicates

In [None]:
# Summary statistics of the numeric columns, one row per column
# observation: max far from 75%
df.describe(include='number').transpose()

In [None]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column
df.describe(include='object').transpose()

In [None]:
# Distinct values for object-typed columns, min/max range for every other column
for col, values in df.items():
    if values.dtype == 'object':
        print(f"{col}: {values.unique()}")
    else:
        print(f"{col}: min={values.min()}, max={values.max()}")

In [None]:
# Collect every row that repeats an earlier one
duplicates = df.loc[df.duplicated()]
n_duplicates = len(duplicates)
print("Number of duplicate rows:", n_duplicates)
# Only print the rows themselves when there are any
if n_duplicates:
    print("Duplicate rows:")
    print(duplicates)

In [None]:
# Report mode, median and mean of each numeric column
for col, values in df.select_dtypes(include='number').items():
    modes = values.mode()                      # computed once and reused
    mode = 'No mode' if modes.empty else modes[0]
    median = values.median()
    mean = values.mean()
    print(f"Column: {col}\nMode: {mode}\nMedian: {median}\nMean: {mean}\n")


In [None]:
# Share of each target class, drawn as a bar chart
# stroke=0: No stroke, stroke=1: Stroke
# stroke=1 is the positive class (required for prediction)
class_share = df['stroke'].value_counts(normalize=True)
class_share.plot(kind='bar', color=['#1f77b4', '#ff7f0e'])
plt.title('Distribution of Stroke Cases')

In [None]:
# Distribution of every feature on its own (the target column is skipped)
feature_names = [name for name in df.columns if name != 'stroke']

for col in feature_names:
    plt.figure(figsize=(10, 5))
    sns.histplot(df[col], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


*`Bivariate Analysis`*

In [None]:
# scatter plots (continuous features vs target)
# Iterate over the column names directly: there is no need to wrap a sub-frame in a
# one-element list, and 'stroke' is never in this list so no guard is required.
for col in ['age', 'avg_glucose_level', 'bmi']:
    plt.figure(figsize=(10, 5))
    sns.scatterplot(data=df, x=col, y='stroke', alpha=0.5)
    plt.title(f'Scatter plot of {col} vs Stroke')
    plt.xlabel(col)
    plt.ylabel('Stroke')
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap
numeric_part = df.select_dtypes(include=['number'])
correlation_matrix = numeric_part.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

`Outliers`

In [None]:
numerical_df = df[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']]

# Boolean frame shaped like numerical_df: True marks a value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outliers_iqr = pd.DataFrame(False, index=numerical_df.index, columns=numerical_df.columns)

# NOTE(review): hypertension and heart_disease look like 0/1 flags; if so their IQR can be 0
# and every 1 is reported as an "outlier" here. Confirm whether they belong in this check.
for col, values in numerical_df.items():
    # quartiles are taken over the non-missing values only
    Q1, Q3 = np.percentile(values.dropna(), [25, 75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers_iqr[col] = values.lt(lower) | values.gt(upper)

# count per column, share per column, and share over all cells of the frame
iqr_report = (
    ("Outliers per column (IQR):", outliers_iqr.sum()),
    ("Outlier ratio per column (IQR):", outliers_iqr.mean()),
    ("Overall outlier ratio (IQR):", outliers_iqr.values.mean()),
)
for heading, value in iqr_report:
    print(heading)
    print(value)

In [None]:
# use boxplot to visualize outliers
# Pass one NaN-free array per column. boxplot computes its quartiles on the raw values,
# so a column that still holds NaN at this point would otherwise end up without a box.
box_data = [numerical_df[col].dropna().values for col in numerical_df.columns]
plt.boxplot(box_data, labels=list(numerical_df.columns))
plt.title("Boxplot of Data")
plt.xlabel("Features")
plt.ylabel("Values")
plt.xticks(rotation=45)
plt.show()

In [None]:
# z-score
threshold = 3  # confidence is 99.7% (±3 standard deviations)

# nan_policy='omit' leaves missing entries out of the mean/std; with the default
# ('propagate') a single NaN turns the whole column of z-scores into NaN.
z_scores = zscore(numerical_df, nan_policy='omit')

# Boolean mask of outliers, so that .sum() COUNTS them per column
# (summing the selected values themselves would not give a count).
outliers_z = pd.DataFrame(
    np.abs(np.asarray(z_scores)) > threshold,
    index=numerical_df.index,
    columns=numerical_df.columns,
)
print("Z-score Outliers:")
print(outliers_z.sum())

### Preprocessing

*`Drop id`* done

In [None]:
# Look at the frame once more before the preprocessing steps start
df.head()

*`Encoding`*

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes
columns = ['gender', 'ever_married', 'work_type', 'smoking_status', 'residence_type']

# Fitted encoders (kept so the codes can be decoded later) and one original -> code lookup per column
label_encoders = {}
mappings = {}

for col in columns:
    encoder = LabelEncoder().fit(df[col])
    df[col] = encoder.transform(df[col])
    label_encoders[col] = encoder

    # Transforming classes_ gives the integer code assigned to each original label
    codes = encoder.transform(encoder.classes_)
    mapping = {label: code for label, code in zip(encoder.classes_, codes)}
    mappings[col] = mapping

In [None]:
# check the frame after encoding
df.head()

In [None]:
# Print the original -> encoded lookup of every encoded column
print("\nOriginal to Encoded Mappings:")
for col in mappings:
    print(f"{col}: {mappings[col]}")

*`handle missing values`*

In [None]:
# Percentage of missing values per column; only columns with at least one gap are shown
missing_percentage = df.isna().mean().mul(100)
missing_percentage.loc[missing_percentage > 0]  # Display columns with missing value

Plan: handle outliers first and impute missing values at the end. (Note: in the cells below, imputation currently runs before the outlier-removal step.)

In [None]:
# impute missing values using k-nearest neighbors
from sklearn.impute import KNNImputer

# Keep the target out of the imputer: if `stroke` took part in the neighbour search,
# label information would leak into the imputed feature values.
feature_cols = [col for col in df.columns if col != 'stroke']

imputer = KNNImputer(n_neighbors=5)  # You can adjust n_neighbors based on your data
# Only the feature columns are overwritten; the index and the target column stay as they are.
df[feature_cols] = imputer.fit_transform(df[feature_cols])

In [None]:
# Any column listed here still has gaps after imputation (an empty result means none are left)
missing_values_after_imputation = df.isna().sum()
missing_values_after_imputation.loc[missing_values_after_imputation.gt(0)]

In [None]:
# Re-check dtypes and non-null counts after the imputation step
df.info()

*`Scaling`*

In [None]:
# check if scaling is needed: compare the value ranges of the columns
for col, values in df.items():
    if values.dtype == 'object':
        print(f"{col}: {values.unique()}")
    else:
        print(f"{col}: min={values.min()}, max={values.max()}")

In [None]:
# standardization for logistic regression later
# Only these three columns are rescaled; the scaler is kept in `scaler` after fitting.
cols_to_scale = ['age', 'avg_glucose_level', 'bmi']

scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [None]:
# check the scaled values (age, avg_glucose_level and bmi were standardized above)
df.head()

*`check on gender`*

In [None]:
# Rows whose encoded gender equals 2
filtered_df = df.loc[df['gender'].eq(2)]

# How many of those rows have a stroke and how many do not
stroke_1_count = filtered_df['stroke'].eq(1).sum()
stroke_0_count = filtered_df['stroke'].eq(0).sum()

print(f"Number of rows where gender is 2 and stroke = 1: {stroke_1_count}")
print(f"Number of rows where gender is 2 and stroke = 0: {stroke_0_count}")



In [None]:
# only 1 row with gender=2 (other), so drop that ROW; the gender column itself is kept
df = df.drop(index=filtered_df.index)


In [None]:
# With the gender == 2 rows removed above, gender should now be a binary feature.
# Print its distinct values to confirm.

print(df['gender'].unique())


In [None]:
# Re-check row count, dtypes and non-null counts after dropping the gender == 2 rows
df.info()

In [None]:
# Histogram (with KDE overlay) of each of the three scaled continuous features
for col in ('age', 'avg_glucose_level', 'bmi'):
    values = df[col]
    plt.figure(figsize=(10, 5))
    sns.histplot(values, kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# power transformation for skewed features
from sklearn.preprocessing import PowerTransformer

# Only these two columns are transformed; the fitted transformer stays available in `pt`
skewed_cols = ['avg_glucose_level', 'bmi']

pt = PowerTransformer(method='yeo-johnson')
df[skewed_cols] = pt.fit_transform(df[skewed_cols])

In [None]:
# check the distribution of the two transformed columns after power transformation
for col in ('avg_glucose_level', 'bmi'):
    values = df[col]
    plt.figure(figsize=(10, 5))
    sns.histplot(values, kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {col} after Power Transformation')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

`Outliers after Transformations`

In [None]:
numerical_df = df[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']]

# Boolean frame shaped like numerical_df: True marks a value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outliers_iqr = pd.DataFrame(False, index=numerical_df.index, columns=numerical_df.columns)

# NOTE(review): hypertension and heart_disease look like 0/1 flags; if so their IQR can be 0
# and every 1 is reported as an "outlier" here. Confirm whether they belong in this check.
for col, values in numerical_df.items():
    # quartiles are taken over the non-missing values only
    Q1, Q3 = np.percentile(values.dropna(), [25, 75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers_iqr[col] = values.lt(lower) | values.gt(upper)

# count per column, share per column, and share over all cells of the frame
iqr_report = (
    ("Outliers per column (IQR):", outliers_iqr.sum()),
    ("Outlier ratio per column (IQR):", outliers_iqr.mean()),
    ("Overall outlier ratio (IQR):", outliers_iqr.values.mean()),
)
for heading, value in iqr_report:
    print(heading)
    print(value)

In [None]:
# z-score
threshold = 3  # confidence is 99.7% (±3 standard deviations)

# nan_policy='omit' keeps any missing entry from turning a whole column of z-scores into NaN
z_scores = zscore(numerical_df, nan_policy='omit')

# Boolean mask of outliers, so that .sum() COUNTS them per column
# (summing the selected values themselves would not give a count).
outliers_z = pd.DataFrame(
    np.abs(np.asarray(z_scores)) > threshold,
    index=numerical_df.index,
    columns=numerical_df.columns,
)
print("Z-score Outliers:")
print(outliers_z.sum())

- Normalization using the power transformation reduced the number of outliers significantly.
- The remaining extreme outliers can now be removed, since they make up only a small percentage of the data.

`handle outliers`

In [None]:
# remove outliers with z-score
threshold = 3  # confidence is 99.7% (±3 standard deviations)

# Only the continuous features take part in the filter. A 0/1 flag must not be z-scored:
# with a share p of ones, every 1 has z = sqrt((1 - p) / p), which exceeds 3 whenever p < 0.1,
# so including hypertension / heart_disease can delete EVERY positive row of a rare flag.
continuous_cols = ['age', 'avg_glucose_level', 'bmi']

# absolute z-scores as a plain array (rows in the same order as df)
z_scores = np.abs(np.asarray(zscore(df[continuous_cols])))

# mask of rows to keep: every continuous feature within the threshold
mask = (z_scores <= threshold).all(axis=1)

# size before
size_before = df.shape[0]

# apply mask to the full dataframe (all columns are kept, only rows are filtered)
df = df[mask].reset_index(drop=True)

# size after
size_after = df.shape[0]

print(f"Removed {size_before - size_after} outliers (Z-score)")
print(f"New size: {size_after}")


In [None]:
# save to CSV, next to the raw file (data folder one level above the working directory)
processed_dir = os.path.join(os.getcwd(), '..', 'data')
output_file_path = os.path.join(processed_dir, 'processed_stroke_data.csv')
# index=False: the row index is not written as a column
df.to_csv(output_file_path, index=False)

### `Final Df`
- Unnecessary columns dropped
- Encoded using label encoding
- Missing values imputed with KNN
- Scaled using StandardScaler
- Outliers handled using power transformation and z-score filtering
- Class imbalance -> handled later, after splitting, in the modeling notebook
#### Ready for Logistic Regression or any other classification model

In [None]:
# Read the processed file back from disk and preview its first ten rows
processed_df = pd.read_csv(output_file_path)
processed_df.head(n=10)

In [None]:
# Dtypes and non-null counts of the data read back from the processed CSV
processed_df.info()

`prepare data`

In [None]:
# Separate the target column from the predictors
y = processed_df['stroke']
X = processed_df.drop(columns='stroke')

`data splitting`

In [None]:
# Hold out 15% of the rows for testing; stratify=y splits each class in the same proportion
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
# check the shape of the data
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

`handle class imbalance`

In [None]:
# oversample the minority class using SMOTE (training split only; the test split is left untouched)
smote_params = {
    "sampling_strategy": 0.5,   # how much to oversample
    "random_state": 42,
    "k_neighbors": 4,           # number of neighbors to generate new samples
}
sm = SMOTE(**smote_params)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# check shape after oversampling
print("Training data shape after oversampling:", X_train_res.shape)

In [None]:
# visualize the class distribution: one bar per class, counted on the resampled training target
sns.countplot(x=y_train_res)
plt.title("Training class distribution after SMOTE")
plt.show()

`train`

In [None]:
# 1. Logistic Regression (default)
# C is the inverse regularization strength; the solver can be changed to "liblinear", "saga" etc.
log_reg = LogisticRegression(C=1.0, solver="lbfgs", max_iter=1000, random_state=42)

# train on SMOTE-resampled data
log_reg.fit(X_train_res, y_train_res)


In [None]:
# 2. Logistic Regression (with class_weight balanced)
# Same settings as the default model, plus class_weight="balanced"
log_reg_balanced = LogisticRegression(
    C=1.0, solver="lbfgs", max_iter=1000, class_weight="balanced", random_state=42
)

# train on SMOTE-resampled data
log_reg_balanced.fit(X_train_res, y_train_res)

In [None]:
# 3. Hard-Margin SVM (linear) -> C is very large (slack is penalised heavily)
# LinearSVC is the model that is actually trained. The SVC(kernel="linear", C=1e6) instance
# that used to be created first was overwritten before it was ever fitted, so it is removed.
svm_hard = LinearSVC(C=1e4, random_state=42, max_iter=10000)

svm_hard.fit(X_train_res, y_train_res)  # train on SMOTE-resampled data

In [None]:
# 4. Soft-Margin SVM (linear)
# C=1.0 is the regularization parameter
svm_soft = SVC(kernel="linear", C=1.0, random_state=42)

# train on SMOTE-resampled data
svm_soft.fit(X_train_res, y_train_res)

In [None]:
# 5. RBF Kernel SVM
# C: regularization; gamma: kernel coefficient ("scale", "auto" or float)
svm_rbf = SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42)

# train on SMOTE-resampled data
svm_rbf.fit(X_train_res, y_train_res)

In [None]:
# 6. Polynomial Kernel SVM
poly_params = {
    "kernel": "poly",
    "C": 1.0,
    "degree": 5,         # degree of polynomial
    "gamma": "scale",    # kernel coefficient
    "coef0": 0.0,        # independent term
    "random_state": 42,
}
svm_poly = SVC(**poly_params)

# train on SMOTE-resampled data
svm_poly.fit(X_train_res, y_train_res)

`evaluate`

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fitted models to compare, keyed by the label used in the results table
models = {
    "Logistic Regression": log_reg,
    "Logistic Regression (balanced)": log_reg_balanced,
    "SVM Hard-Margin (Linear)": svm_hard,
    "SVM Soft-Margin (Linear)": svm_soft,
    "SVM RBF Kernel": svm_rbf,
    "SVM Polynomial Kernel": svm_poly,
}

# Metric name -> scorer(y_true, y_pred). zero_division=0 makes a metric 0 (instead of warning)
# when a model predicts no positive samples.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": lambda y_true, y_hat: precision_score(y_true, y_hat, zero_division=0),
    "Recall": lambda y_true, y_hat: recall_score(y_true, y_hat, zero_division=0),
    "F1 Score": lambda y_true, y_hat: f1_score(y_true, y_hat, zero_division=0),
}

# Score every model on the held-out test set
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)       # predict on test set
    results[name] = {metric: score(y_test, y_pred) for metric, score in metric_fns.items()}

# Show results: one row per model, one column per metric
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# Reduce features to 2D using PCA
pca = PCA(n_components=2)
X_train_2D = pca.fit_transform(X_train_res)  # use resampled training data
X_test_2D = pca.transform(X_test)

# Train two dedicated models on the reduced features.
# They get their own objects so the full-feature models evaluated earlier are not touched:
# refitting `svm_poly` here would leave it expecting 2 features, and rebinding `svm_rbf`
# to a fresh, unfitted SVC would discard the trained one.
svm_linear = SVC(kernel="linear", C=1.0, random_state=42)
svm_poly_2D = SVC(kernel="poly", C=1.0, degree=5, gamma="scale", coef0=0.0, random_state=42)

svm_linear.fit(X_train_2D, y_train_res)
svm_poly_2D.fit(X_train_2D, y_train_res)

models_2D = {"Linear SVM": svm_linear, "Poly SVM": svm_poly_2D}

# Function to plot decision boundaries
def plot_decision_boundary(model, X, y, title):
    """Draw the predicted regions of a fitted 2-feature model with the samples on top.

    Parameters
    ----------
    model : object with a ``predict`` method accepting an (n, 2) array
    X : 2-D array whose first two columns are the plotted coordinates
    y : per-sample values used to colour the scatter points
    title : str, title of the figure
    """
    margin = 1  # padding around the data so points do not sit on the plot border
    x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin

    # 300 x 300 evaluation grid covering the padded data range
    grid_x = np.linspace(x_min, x_max, 300)
    grid_y = np.linspace(y_min, y_max, 300)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict a class for every grid point, then fold the result back into the grid shape
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = model.predict(grid_points).reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k", s=20, cmap=plt.cm.coolwarm)
    plt.title(title)
    plt.show()

# Plot the decision boundary of each 2-D model over the PCA-reduced training data
for name in models_2D:
    plot_decision_boundary(models_2D[name], X_train_2D, y_train_res, f"Decision Boundary - {name}")
