In [3]:
from warnings import filterwarnings
filterwarnings(action="ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
from io import StringIO
from tabulate import tabulate

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [4]:
df = pd.read_excel("Compiled Maternal Data.xlsx",sheet_name = 'Sheet2')

print("Shape: ", df.shape)
df.head()

Shape:  (1582, 10)


Unnamed: 0,MATERNAL AGE,GESTATIONAL AGE,PARITY,WEIGHT,HEIGHT,BMI,SYSTOLIC BP,DIASTOLIC BP,URINE ANALYSIS,RISK LEVEL
0,25,41.0,0,67.0,1.71,22.913033,180,90,NEGATIVE,High
1,31,21.0,0,76.0,1.61,29.319856,130,80,NEGATIVE,Mid
2,40,16.0,0,90.0,1.65,33.057851,140,90,NEGATIVE,High
3,30,19.0,0,57.0,1.33,32.223416,130,80,NEGATIVE,Mid
4,33,25.0,3,78.0,1.61,30.091432,120,60,NEGATIVE,Mid


In [5]:
bad_rows = df[df['HEIGHT'] == '166/158'].index
df.drop(bad_rows, inplace=True)
bad_rows

Index([392], dtype='int64')

In [6]:
df.loc[df['SYSTOLIC BP'] == '1o5', 'SYSTOLIC BP'] = '105'

In [7]:
df['HEIGHT'] = pd.to_numeric(df['HEIGHT'], errors='coerce')
df['SYSTOLIC BP'] = pd.to_numeric(df['SYSTOLIC BP'], errors='coerce')

In [8]:
df['RISK LEVEL'] = df['RISK LEVEL'].map(lambda x: x if x in ['Low', 'High'] else 'Mid')

In [9]:
df['NON NEGATIVE URINE ANALYSIS'] = df['URINE ANALYSIS'].apply(lambda x: 0 if x == 'NEGATIVE' else 1)

In [10]:
urine_map = {
    'NEGATIVE': 0,
    'POSITIVE': 1,
    'GLUCOSE TRACE': 2
}

df['URINE ANALYSIS'] = df['URINE ANALYSIS'].map(urine_map).fillna(3).astype(int)

In [11]:
risk_level_map = {
    'High': 0,
    'Mid': 1,
    'Low': 2
}

df['RISK LEVEL'] = df['RISK LEVEL'].map(risk_level_map)

In [12]:
df.isnull().sum()

MATERNAL AGE                   0
GESTATIONAL AGE                3
PARITY                         0
WEIGHT                         2
HEIGHT                         0
BMI                            0
SYSTOLIC BP                    0
DIASTOLIC BP                   0
URINE ANALYSIS                 0
RISK LEVEL                     0
NON NEGATIVE URINE ANALYSIS    0
dtype: int64

In [13]:
df = df.fillna(df.mean())

In [14]:
def detect_outliers_iqr(df, threshold=1.5):
    outliers = {}
    for column in df.select_dtypes(include='number').columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        outlier_values = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
        if not outlier_values.empty:
            outliers[column] = outlier_values

    return outliers

outliers = detect_outliers_iqr(df)

In [15]:
y = df.pop("RISK LEVEL")
X = df

In [16]:
def Train_Algorithms(X, y, algorithms):
    skf = StratifiedKFold(n_splits=10)
    
    test_metrics = {"F1_SCORE": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}
    train_metrics = {"F1_SCORE": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}
    
    model_names, predictions, probabilities = [], [], []
    trained_models = {}

    for algo in algorithms:
        # Smart initialization based on model name
        if algo.__name__ == "CatBoostClassifier":
            model = algo(silent=True)
        elif algo.__name__ == "XGBClassifier":
            model = algo(verbose=False)
        elif algo.__name__ == "LGBMClassifier":
            model = algo(verbosity=-1)
        elif algo.__name__ == "SVC":
            model = algo(probability=True)
        else:
            model = algo()

        fold_metrics_test = {"F1": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}
        fold_metrics_train = {"F1": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}

        for train_idx, test_idx in tqdm(skf.split(X, y), desc=f"{algo.__name__}", total=skf.get_n_splits()):
            x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model.fit(x_train, y_train)

            y_pred_test = model.predict(x_test)
            y_pred_train = model.predict(x_train)

            # Test metrics
            fold_metrics_test["F1"].append(metrics.f1_score(y_test, y_pred_test, average="macro"))
            fold_metrics_test["PRECISION"].append(metrics.precision_score(y_test, y_pred_test, average="macro"))
            fold_metrics_test["RECALL"].append(metrics.recall_score(y_test, y_pred_test, average="macro"))
            fold_metrics_test["ACCURACY"].append(metrics.accuracy_score(y_test, y_pred_test))

            # Train metrics
            fold_metrics_train["F1"].append(metrics.f1_score(y_train, y_pred_train, average="macro"))
            fold_metrics_train["PRECISION"].append(metrics.precision_score(y_train, y_pred_train, average="macro"))
            fold_metrics_train["RECALL"].append(metrics.recall_score(y_train, y_pred_train, average="macro"))
            fold_metrics_train["ACCURACY"].append(metrics.accuracy_score(y_train, y_pred_train))

        # Store final metrics
        model_name = algo.__name__
        model_names.append(model_name)
        test_metrics["F1_SCORE"].append(np.mean(fold_metrics_test["F1"]))
        test_metrics["PRECISION"].append(np.mean(fold_metrics_test["PRECISION"]))
        test_metrics["RECALL"].append(np.mean(fold_metrics_test["RECALL"]))
        test_metrics["ACCURACY"].append(np.mean(fold_metrics_test["ACCURACY"]))

        train_metrics["F1_SCORE"].append(np.mean(fold_metrics_train["F1"]))
        train_metrics["PRECISION"].append(np.mean(fold_metrics_train["PRECISION"]))
        train_metrics["RECALL"].append(np.mean(fold_metrics_train["RECALL"]))
        train_metrics["ACCURACY"].append(np.mean(fold_metrics_train["ACCURACY"]))

        # Store predictions and trained model
        predictions.append(model.predict(X))
        probabilities.append(model.predict_proba(X)[:, 0])  # first class probability
        trained_models[model_name] = model

    # Tabulated metric summaries
    test_table = tabulate(pd.DataFrame(test_metrics).T, headers=[name.replace("Classifier", "") for name in model_names], tablefmt="double_grid")
    train_table = tabulate(pd.DataFrame(train_metrics).T, headers=[name.replace("Classifier", "") for name in model_names], tablefmt="double_grid")

    # Helper to format predictions/probabilities
    def format_output(output_list):
        df_output = pd.DataFrame(output_list).T
        df_output.columns = model_names
        return df_output

    return test_table, train_table, format_output(predictions), format_output(probabilities), trained_models

In [17]:
data_test, data_train, pred, pred_proba, trained_models = Train_Algorithms(X, y, 
                                                                           [DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier, 
                                                                            XGBClassifier, LGBMClassifier,  CatBoostClassifier])

DecisionTreeClassifier: 100%|██████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.26it/s]
RandomForestClassifier: 100%|██████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.02it/s]
GradientBoostingClassifier: 100%|██████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.10s/it]
XGBClassifier: 100%|███████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.53it/s]
LGBMClassifier: 100%|██████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.16it/s]
CatBoostClassifier: 100%|██████████████████████████████████████████████████████████████| 10/10 [01:06<00:00,  6.66s/it]


In [18]:
print(data_train)

╔═══════════╦════════════════╦════════════════╦════════════════════╦═══════╦════════╦════════════╗
║           ║   DecisionTree ║   RandomForest ║   GradientBoosting ║   XGB ║   LGBM ║   CatBoost ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ F1_SCORE  ║              1 ║              1 ║           0.945734 ║     1 ║      1 ║   0.999217 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ PRECISION ║              1 ║              1 ║           0.9573   ║     1 ║      1 ║   0.999363 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ RECALL    ║              1 ║              1 ║           0.935501 ║     1 ║      1 ║   0.999073 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ ACCURACY  ║              1 ║              1 ║           0.954249 ║     1 ║      1 ║   0.999016 ║
╚═════════

In [19]:
print(data_test)

╔═══════════╦════════════════╦════════════════╦════════════════════╦══════════╦══════════╦════════════╗
║           ║   DecisionTree ║   RandomForest ║   GradientBoosting ║      XGB ║     LGBM ║   CatBoost ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬══════════╬══════════╬════════════╣
║ F1_SCORE  ║       0.788408 ║       0.847685 ║           0.854265 ║ 0.836265 ║ 0.836431 ║   0.839017 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬══════════╬══════════╬════════════╣
║ PRECISION ║       0.79694  ║       0.872401 ║           0.873261 ║ 0.856956 ║ 0.862245 ║   0.861606 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬══════════╬══════════╬════════════╣
║ RECALL    ║       0.790064 ║       0.83948  ║           0.847898 ║ 0.830764 ║ 0.828435 ║   0.83263  ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬══════════╬══════════╬════════════╣
║ ACCURACY  ║       0.830523 ║       0.889977 ║           0.8956

In [22]:
from imblearn.over_sampling import RandomOverSampler

def Train_Algorithms_and_Upsample_Minority_class(X, y, algo):
    # Print value counts of the original dataset
    print("Original Target Distribution:")
    print(y.value_counts().to_string())
    print()

    # Perform upsampling
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)

    # Print value counts after upsampling
    print("After Upsampling Target Distribution:")
    print(y_resampled.value_counts().to_string())
    print()

    # Initialize Stratified K-Fold
    stf = StratifiedKFold(n_splits=10)

    # Initialize dictionaries and lists
    model_performance = {"F1_SCORE": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}
    model_performances = {"F1_SCORE": [], "PRECISION": [], "RECALL": [], "ACCURACY": []}
    model_names, pred, pred_proba = [], [], []
    trained_models = {}

    for algorithms in algo:
        init_algo = algorithms(silent=True) if algorithms.__name__ == "CatBoostClassifier"\
                    else algorithms(verbose=False) if algorithms.__name__ == "XGBClassifier"\
                    else algorithms(verbosity=-1) if algorithms.__name__ == "LGBMClassifier"\
                    else algorithms(probability=True) if algorithms.__name__ == "SVC"\
                    else algorithms()

        F1_SCORE, F1_SCORE_TRAIN = [], []
        PRECISION, PRECISION_TRAIN = [], []
        RECALL, RECALL_TRAIN = [], []
        ACCURACY, ACCURACY_TRAIN = [], []

        # Split the upsampled dataset
        for train_idx, test_idx in tqdm(stf.split(X_resampled, y_resampled), desc=f"{algorithms.__name__}", total=len(algo)):
            xtrain, xtest = X_resampled.iloc[train_idx], X_resampled.iloc[test_idx]
            ytrain, ytest = y_resampled.iloc[train_idx], y_resampled.iloc[test_idx]

            # Fit the model
            init_algo.fit(xtrain, ytrain)
            test_pred = init_algo.predict(xtest)
            train_pred = init_algo.predict(xtrain)

            # Test Metrics
            F1_SCORE.append(metrics.f1_score(y_true=ytest, y_pred=test_pred, average="macro"))
            PRECISION.append(metrics.precision_score(y_true=ytest, y_pred=test_pred, average="macro"))
            RECALL.append(metrics.recall_score(y_true=ytest, y_pred=test_pred, average="macro"))
            ACCURACY.append(metrics.accuracy_score(y_true=ytest, y_pred=test_pred))

            # Train Metrics
            F1_SCORE_TRAIN.append(metrics.f1_score(y_true=ytrain, y_pred=train_pred, average="macro"))
            PRECISION_TRAIN.append(metrics.precision_score(y_true=ytrain, y_pred=train_pred, average="macro"))
            RECALL_TRAIN.append(metrics.recall_score(y_true=ytrain, y_pred=train_pred, average="macro"))
            ACCURACY_TRAIN.append(metrics.accuracy_score(y_true=ytrain, y_pred=train_pred))

        # Store predictions
        pred_proba.append(init_algo.predict_proba(X_resampled)[:, 0])  # take the first data probability
        pred.append(init_algo.predict(X_resampled))  # take predicted score

        # Append model names and performance metrics
        model_names.append(algorithms.__name__)
        model_performance["F1_SCORE"].append(np.mean(F1_SCORE))
        model_performance["PRECISION"].append(np.mean(PRECISION))
        model_performance["RECALL"].append(np.mean(RECALL))
        model_performance["ACCURACY"].append(np.mean(ACCURACY))

        # Train Model Performance
        model_performances["F1_SCORE"].append(np.mean(F1_SCORE_TRAIN))
        model_performances["PRECISION"].append(np.mean(PRECISION_TRAIN))
        model_performances["RECALL"].append(np.mean(RECALL_TRAIN))
        model_performances["ACCURACY"].append(np.mean(ACCURACY_TRAIN))

        trained_models[algorithms.__name__] = init_algo

    # Format performance results into tables
    data_train = tabulate(pd.DataFrame(model_performances).T, headers=[i.split("Classifier")[0] for i in model_names], tablefmt="double_grid")
    data = tabulate(pd.DataFrame(model_performance).T, headers=[i.split("Classifier")[0] for i in model_names], tablefmt="double_grid")

    # Helper function to clean predictions for CSV output
    def clean_csv(df):
        df_fix = pd.DataFrame(pd.DataFrame(df).T)
        column_names = df_fix.columns.tolist()
        for old_col, new_col in zip(column_names, model_names):
            df_fix.rename(columns={old_col: new_col}, inplace=True)
        return df_fix

    return data, data_train, clean_csv(pred), clean_csv(pred_proba), y_resampled, trained_models

In [24]:
data_upsample, data_train_upsample, pred_upsample, pred_proba_upsample, y_upsample, trained_models = Train_Algorithms_and_Upsample_Minority_class(X, y,
                                                                                                                                                  [DecisionTreeClassifier])

Original Target Distribution:
RISK LEVEL
2    844
1    545
0    192

After Upsampling Target Distribution:
RISK LEVEL
0    844
1    844
2    844



DecisionTreeClassifier: 10it [00:00, 32.69it/s]                                                                        


In [25]:
# @title UPSAMPLING TRAINING PERFORMANCE RESULT
print(data_train_upsample)

╔═══════════╦════════════════╗
║           ║   DecisionTree ║
╠═══════════╬════════════════╣
║ F1_SCORE  ║              1 ║
╠═══════════╬════════════════╣
║ PRECISION ║              1 ║
╠═══════════╬════════════════╣
║ RECALL    ║              1 ║
╠═══════════╬════════════════╣
║ ACCURACY  ║              1 ║
╚═══════════╩════════════════╝


In [26]:
# @title UPSAMPLING PERFORMANCE RESULT
print(data_train)

╔═══════════╦════════════════╦════════════════╦════════════════════╦═══════╦════════╦════════════╗
║           ║   DecisionTree ║   RandomForest ║   GradientBoosting ║   XGB ║   LGBM ║   CatBoost ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ F1_SCORE  ║              1 ║              1 ║           0.945734 ║     1 ║      1 ║   0.999217 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ PRECISION ║              1 ║              1 ║           0.9573   ║     1 ║      1 ║   0.999363 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ RECALL    ║              1 ║              1 ║           0.935501 ║     1 ║      1 ║   0.999073 ║
╠═══════════╬════════════════╬════════════════╬════════════════════╬═══════╬════════╬════════════╣
║ ACCURACY  ║              1 ║              1 ║           0.954249 ║     1 ║      1 ║   0.999016 ║
╚═════════

In [27]:
# @title UPSAMPLING TEST PERFORMANCE RESULT
print(data_upsample)

╔═══════════╦════════════════╗
║           ║   DecisionTree ║
╠═══════════╬════════════════╣
║ F1_SCORE  ║       0.937144 ║
╠═══════════╬════════════════╣
║ PRECISION ║       0.938628 ║
╠═══════════╬════════════════╣
║ RECALL    ║       0.937652 ║
╠═══════════╬════════════════╣
║ ACCURACY  ║       0.937643 ║
╚═══════════╩════════════════╝
