In [5]:
from warnings import filterwarnings
filterwarnings(action="ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
from io import StringIO
from tabulate import tabulate

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [6]:
# Read the 'Sheet2' worksheet of the workbook into the working DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_excel("Compiled Maternal Data.xlsx",sheet_name = 'Sheet2')

# Sanity check: report (rows, columns), then preview the first records.
print("Shape: ", df.shape)
df.head()

Shape:  (1582, 10)


Unnamed: 0,MATERNAL AGE,GESTATIONAL AGE,PARITY,WEIGHT,HEIGHT,BMI,SYSTOLIC BP,DIASTOLIC BP,URINE ANALYSIS,RISK LEVEL
0,25,41.0,0,67.0,1.71,22.913033,180,90,NEGATIVE,High
1,31,21.0,0,76.0,1.61,29.319856,130,80,NEGATIVE,Mid
2,40,16.0,0,90.0,1.65,33.057851,140,90,NEGATIVE,High
3,30,19.0,0,57.0,1.33,32.223416,130,80,NEGATIVE,Mid
4,33,25.0,3,78.0,1.61,30.091432,120,60,NEGATIVE,Mid


In [7]:
# Index labels of the record(s) whose HEIGHT cell holds the unparseable
# entry '166/158'; those records are removed from the working frame in place.
bad_rows = df.index[df['HEIGHT'] == '166/158']
df.drop(index=bad_rows, inplace=True)
bad_rows

Index([392], dtype='int64')

In [8]:
# Repair a data-entry typo: the string '1o5' (letter 'o' in place of a zero)
# becomes '105'. The replacement is still a string at this point.
df.loc[df['SYSTOLIC BP'] == '1o5', 'SYSTOLIC BP'] = '105'

In [9]:
# Convert both columns to a numeric dtype. errors='coerce' replaces any entry
# that cannot be parsed with NaN instead of raising.
# NOTE(review): nothing here reports how many values were coerced to NaN.
df['HEIGHT'] = pd.to_numeric(df['HEIGHT'], errors='coerce')
df['SYSTOLIC BP'] = pd.to_numeric(df['SYSTOLIC BP'], errors='coerce')

In [10]:
# Reduce the target to three labels: 'Low' and 'High' are kept unchanged and
# every other value (missing entries included) is recorded as 'Mid'.
df['RISK LEVEL'] = df['RISK LEVEL'].where(df['RISK LEVEL'].isin(['Low', 'High']), 'Mid')

In [11]:
# Binary flag: 0 when the urine result is exactly 'NEGATIVE', 1 for anything else.
# This reads the raw string labels, so it has to run before URINE ANALYSIS is
# replaced by integer codes.
df['NON NEGATIVE URINE ANALYSIS'] = df['URINE ANALYSIS'].ne('NEGATIVE').astype('int64')

In [12]:
# Integer codes for the urine-analysis categories that are named explicitly.
urine_map = dict(zip(('NEGATIVE', 'POSITIVE', 'GLUCOSE TRACE'), range(3)))

# Values absent from the mapping (missing entries included) come back from
# .map() as NaN and are sent to the catch-all code 3 before the integer cast.
df['URINE ANALYSIS'] = (
    df['URINE ANALYSIS']
    .map(urine_map)
    .fillna(3)
    .astype(int)
)

In [13]:
# Target encoding in listed order: 'High' -> 0, 'Mid' -> 1, 'Low' -> 2.
risk_level_map = {label: code for code, label in enumerate(('High', 'Mid', 'Low'))}

# Labels that are not keys of the mapping become NaN.
df['RISK LEVEL'] = df['RISK LEVEL'].map(risk_level_map)

In [15]:
def detect_outliers_iqr(df, threshold=1.5):
    """Collect, per numeric column, the values lying outside the IQR fences.

    For each numeric column the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``. Values strictly below the lower fence or
    strictly above the upper fence are reported.

    Args:
        df: DataFrame to scan; only the columns selected by
            ``select_dtypes(include='number')`` are examined.
        threshold: Multiplier applied to the IQR when placing the fences.

    Returns:
        dict mapping column name to a Series of the outlying values, with
        their original index labels. Columns without such values are omitted.
    """
    flagged = {}
    numeric_names = df.select_dtypes(include='number').columns

    for name in numeric_names:
        values = df[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        low_fence = q1 - threshold * iqr
        high_fence = q3 + threshold * iqr

        beyond = values[(values < low_fence) | (values > high_fence)]
        if len(beyond) > 0:
            flagged[name] = beyond

    return flagged

# Run the IQR scan over every numeric column currently in df. If the cells
# above ran in order, that includes the integer-encoded URINE ANALYSIS and
# RISK LEVEL columns.
# NOTE(review): `outliers` is not referenced again in the visible cells.
outliers = detect_outliers_iqr(df)

In [16]:
# Split the working frame into target (y) and features (X).
# pop() removes the column from df, so an unguarded second run of this cell
# would raise KeyError. With the guard, a re-run leaves the existing y as is.
if "RISK LEVEL" in df.columns:
    y = df.pop("RISK LEVEL")
# X is the same object as df (not a copy): later changes to df also appear in X.
X = df

In [17]:
def _macro_scores(y_true, y_pred):
    """Score one set of predictions.

    Returns a dict holding macro-averaged F1, precision and recall plus plain
    accuracy, keyed by the row labels used in the summary tables.
    """
    return {
        "F1_SCORE": metrics.f1_score(y_true, y_pred, average="macro"),
        "PRECISION": metrics.precision_score(y_true, y_pred, average="macro"),
        "RECALL": metrics.recall_score(y_true, y_pred, average="macro"),
        "ACCURACY": metrics.accuracy_score(y_true, y_pred),
    }


def Train_Algorithms(X, y, algorithms, n_splits=10):
    """Cross-validate each classifier class and summarise train/test scores.

    Every class in ``algorithms`` is instantiated once and refitted on each
    stratified fold. Per-fold scores are averaged into one column per model.

    Args:
        X: Feature DataFrame (rows are selected positionally with ``.iloc``).
        y: Target Series, positionally aligned with ``X``.
        algorithms: Iterable of classifier classes (not instances). Each must
            provide ``fit``, ``predict`` and ``predict_proba``.
        n_splits: Number of stratified folds. Defaults to 10, the value that
            used to be hard-coded.

    Returns:
        Tuple ``(test_table, train_table, predictions, probabilities,
        trained_models)``:
        - ``test_table`` / ``train_table``: tabulate-rendered strings of the
          fold-averaged metrics (rows: metrics, columns: models).
        - ``predictions``: DataFrame of ``predict(X)``, one column per model.
        - ``probabilities``: DataFrame of the first ``predict_proba`` column,
          one column per model.
        - ``trained_models``: dict of class name -> fitted estimator.

    Note:
        The estimator kept in ``trained_models`` (and used for the returned
        predictions and probabilities) is the one left over from the final
        fold, i.e. it was fitted on that fold's training split, not on all
        of ``X``.
    """
    metric_names = ("F1_SCORE", "PRECISION", "RECALL", "ACCURACY")
    skf = StratifiedKFold(n_splits=n_splits)

    test_metrics = {name: [] for name in metric_names}
    train_metrics = {name: [] for name in metric_names}

    model_names, predictions, probabilities = [], [], []
    trained_models = {}

    # Constructor keywords needed per library, looked up by class name.
    # XGBoost's constructor keyword is `verbosity`; `verbose` is an argument
    # of its fit() method and is not a recognised constructor parameter.
    init_kwargs = {
        "CatBoostClassifier": {"silent": True},
        "XGBClassifier": {"verbosity": 0},
        "LGBMClassifier": {"verbosity": -1},
        "SVC": {"probability": True},  # required for predict_proba
    }

    for algo in algorithms:
        model_name = algo.__name__
        model = algo(**init_kwargs.get(model_name, {}))

        fold_test = {name: [] for name in metric_names}
        fold_train = {name: [] for name in metric_names}

        for train_idx, test_idx in tqdm(skf.split(X, y), desc=model_name, total=skf.get_n_splits()):
            x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model.fit(x_train, y_train)

            test_scores = _macro_scores(y_test, model.predict(x_test))
            train_scores = _macro_scores(y_train, model.predict(x_train))
            for name in metric_names:
                fold_test[name].append(test_scores[name])
                fold_train[name].append(train_scores[name])

        # One averaged value per metric for this model.
        model_names.append(model_name)
        for name in metric_names:
            test_metrics[name].append(np.mean(fold_test[name]))
            train_metrics[name].append(np.mean(fold_train[name]))

        # `model` still holds the fit from the last fold (see docstring note).
        predictions.append(model.predict(X))
        probabilities.append(model.predict_proba(X)[:, 0])  # first class probability
        trained_models[model_name] = model

    # Tabulated metric summaries; "Classifier" is stripped from column headers.
    headers = [name.replace("Classifier", "") for name in model_names]
    test_table = tabulate(pd.DataFrame(test_metrics).T, headers=headers, tablefmt="double_grid")
    train_table = tabulate(pd.DataFrame(train_metrics).T, headers=headers, tablefmt="double_grid")

    def format_output(output_list):
        """Turn a list of per-model arrays into a frame with one column per model."""
        df_output = pd.DataFrame(output_list).T
        df_output.columns = model_names
        return df_output

    return test_table, train_table, format_output(predictions), format_output(probabilities), trained_models

In [21]:
# Cross-validate a single decision tree. The call returns, in order: the test
# metric table, the train metric table, predictions, probabilities and the
# dict of fitted models.
# The classifier is built with default arguments (no random_state is passed),
# so scores may differ slightly between runs.
data_test, data_train, pred, pred_proba, trained_models = Train_Algorithms(X, y, [DecisionTreeClassifier])

DecisionTreeClassifier: 100%|██████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 15.61it/s]


In [23]:
# data_train is the tabulate-rendered string of fold-averaged training scores;
# print() is what lays the grid out line by line.
print(data_train)

╔═══════════╦════════════════╗
║           ║   DecisionTree ║
╠═══════════╬════════════════╣
║ F1_SCORE  ║       0.99977  ║
╠═══════════╬════════════════╣
║ PRECISION ║       0.999697 ║
╠═══════════╬════════════════╣
║ RECALL    ║       0.999844 ║
╠═══════════╬════════════════╣
║ ACCURACY  ║       0.999789 ║
╚═══════════╩════════════════╝


In [22]:
# data_test is the tabulate-rendered string of fold-averaged held-out scores;
# print() is what lays the grid out line by line.
print(data_test)

╔═══════════╦════════════════╗
║           ║   DecisionTree ║
╠═══════════╬════════════════╣
║ F1_SCORE  ║       0.802262 ║
╠═══════════╬════════════════╣
║ PRECISION ║       0.813398 ║
╠═══════════╬════════════════╣
║ RECALL    ║       0.80126  ║
╠═══════════╬════════════════╣
║ ACCURACY  ║       0.845072 ║
╚═══════════╩════════════════╝
