<a href="https://colab.research.google.com/github/Anirudhan007/Fetal-Health-Prediction/blob/main/fetal_health_solution_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
np.random.seed(0)


In [18]:
# Load the two input tables. The CSVs are looked up under data/ first; if that
# path does not exist, fall back to the current working directory.
# The medical table is bound to `medical_data_file` because every later cell
# (preview, feature engineering) reads that name.
try:
    medical_data_file = pd.read_csv("data/medical_data_file.csv")
    histogram_data = pd.read_csv("data/histogram_data.csv")
    print("Loaded files from 'data/' directory.")
except FileNotFoundError:
    medical_data_file = pd.read_csv("medical_data_file.csv")
    histogram_data = pd.read_csv("histogram_data.csv")
    # Report the location that was actually used.
    print("Loaded files from current working directory.")

# Alias kept so code that refers to the previous name `medical_data` still works.
medical_data = medical_data_file




Loaded files from 'data/' directory.


In [19]:
# Preview
# Show the first rows of each input table under its own heading.
# sep="\n" places the frame on the line after the heading, matching two
# consecutive print calls.
print("medical_data_file preview:", medical_data_file.head(), sep="\n")

print("\nhistogram_data preview:", histogram_data.head(), sep="\n")

medical_data_file preview:
   patient_id  baseline value  accelerations  fetal_movement  \
0           1             120          0.000             0.0   
1           2             132          0.006             0.0   
2           3             133          0.003             0.0   
3           4             134          0.003             0.0   
4           5             132          0.007             0.0   

   uterine_contractions  light_decelerations  severe_decelerations  \
0                 0.000                0.000                   0.0   
1                 0.006                0.003                   0.0   
2                 0.008                0.003                   0.0   
3                 0.008                0.003                   0.0   
4                 0.008                0.000                   0.0   

   prolongued_decelerations  abnormal_short_term_variability  \
0                       0.0                               73   
1                       0.0            

In [20]:
# -----------------------------------
# 1. Helper functions (Fetal Health Prediction)

def add_engineered_features(df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:
    """
    Return a copy of ``df`` with four engineered columns appended.

    Required input columns (a missing one raises ``KeyError``):
      - severe_decelerations
      - prolongued_decelerations   (spelled this way in the source data)
      - abnormal_short_term_variability
      - percentage_of_time_with_abnormal_long_term_variability
      - light_decelerations
      - accelerations

    Columns added, in this order:
      - MajorDecelBurden: severe + prolongued decelerations
      - VariabilityAbnormalityIndex: abnormal short-term variability plus
        percentage of time with abnormal long-term variability
      - TotalDecelerations: light + severe + prolongued decelerations
      - ReassuranceRatio: accelerations / (TotalDecelerations + eps)

    Parameters
    ----------
    df : input frame; it is not modified.
    eps : constant added to the ReassuranceRatio denominator so that rows
        with zero total decelerations do not divide by zero.
    """
    df = df.copy()

    # Major Deceleration Burden.
    # The column name must use the dataset's spelling "prolongued"; the
    # dictionary spelling "prolonged" is not a column and raises KeyError.
    df["MajorDecelBurden"] = df["severe_decelerations"] + df["prolongued_decelerations"]

    # Variability Abnormality Index
    df["VariabilityAbnormalityIndex"] = (
        df["abnormal_short_term_variability"]
        + df["percentage_of_time_with_abnormal_long_term_variability"]
    )

    # Total Decelerations + Reassurance Ratio
    df["TotalDecelerations"] = (
        df["light_decelerations"]
        + df["severe_decelerations"]
        + df["prolongued_decelerations"]
    )

    df["ReassuranceRatio"] = df["accelerations"] / (df["TotalDecelerations"] + eps)

    return df


def filter_insured_patients(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` restricted to rows whose ``health_insurance``
    value parses as true.

    Parsing rules for each value:
      - missing values stay missing (and are therefore dropped);
      - Python bools are used as-is;
      - otherwise the value is stringified, stripped and lower-cased:
        "false"/"0"/"no"/"n" are false, "true"/"1"/"yes"/"y" are true;
      - anything else is converted via float -> int -> bool, and values that
        cannot be converted are treated as missing.

    The ``health_insurance`` column itself is left unchanged; the temporary
    ``health_insurance_bool`` column used for filtering is removed before
    returning.
    """
    # String token -> parsed flag.
    token_to_flag = {
        "false": False, "0": False, "no": False, "n": False,
        "true": True, "1": True, "yes": True, "y": True,
    }

    def _parse_flag(value):
        if pd.isna(value):
            return np.nan
        if isinstance(value, bool):
            return value
        token = str(value).strip().lower()
        if token in token_to_flag:
            return token_to_flag[token]
        # Last resort: numeric text such as "1.0" or "2".
        try:
            return bool(int(float(token)))
        except Exception:
            return np.nan

    helper_col = "health_insurance_bool"
    parsed_flags = df["health_insurance"].apply(_parse_flag)

    return (
        df.assign(**{helper_col: parsed_flags})
        # Comparison against True (not truthiness) so NaN rows are excluded.
        .loc[lambda frame: frame[helper_col] == True]
        .drop(columns=[helper_col])
    )


def reconstruct_dataframe_from_split_dict(data: dict) -> pd.DataFrame:
    """
    Rebuild a DataFrame from a ``to_dict(orient='split')`` serialization.

    Behaviour by input:
      - a DataFrame is returned unchanged (same object);
      - a dict with "columns" and "data" keys is rebuilt from those, and the
        "index" key is restored as the row index when it is present (a dict
        without "index" gets the default integer index);
      - anything else is passed straight to the ``pd.DataFrame`` constructor.
    """
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, dict) and "columns" in data and "data" in data:
        # orient='split' also stores the row labels; pass them back so a
        # non-default index survives the round trip. index=None falls back
        # to the default integer index.
        return pd.DataFrame(
            data["data"],
            index=data.get("index"),
            columns=data["columns"],
        )
    return pd.DataFrame(data)


In [21]:
# -----------------------------------
# 2. Data Preparation + Dataset Integration + NA Handling

# Passed as `eps` to add_engineered_features below, where it is added to the
# ReassuranceRatio denominator (TotalDecelerations) to avoid divide-by-zero.
EPS = 1e-6

# Corrected add_engineered_features function
def add_engineered_features(df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:
    """
    Build the engineered feature columns on a copy of ``df``.

    Input columns read:
      - severe_decelerations
      - prolongued_decelerations
      - abnormal_short_term_variability
      - percentage_of_time_with_abnormal_long_term_variability
      - light_decelerations
      - accelerations

    Output: a new frame with MajorDecelBurden, VariabilityAbnormalityIndex,
    TotalDecelerations and ReassuranceRatio appended in that order. ``eps`` is
    added to TotalDecelerations before dividing so the ratio stays finite when
    there are no decelerations. The caller's frame is left untouched.
    """
    # assign() works on a copy and evaluates the keyword arguments in order,
    # so ReassuranceRatio can read the TotalDecelerations column created just
    # before it.
    return df.assign(
        MajorDecelBurden=lambda t: (
            t["severe_decelerations"] + t["prolongued_decelerations"]
        ),
        VariabilityAbnormalityIndex=lambda t: (
            t["abnormal_short_term_variability"]
            + t["percentage_of_time_with_abnormal_long_term_variability"]
        ),
        TotalDecelerations=lambda t: (
            t["light_decelerations"]
            + t["severe_decelerations"]
            + t["prolongued_decelerations"]
        ),
        ReassuranceRatio=lambda t: (
            t["accelerations"] / (t["TotalDecelerations"] + eps)
        ),
    )

# Step 1: derive the engineered columns from the medical table.
medical_processed = add_engineered_features(medical_data_file, eps=EPS)

# Steps 2-4 as one chain, so final_df is bound exactly once:
#   - inner-join the histogram table on patient_id,
#   - drop patient_id (an identifier, not a feature),
#   - keep only insured patients,
#   - drop every row that still contains a missing value in any column
#     (all columns are used as features or target).
final_df = (
    medical_processed
    .merge(histogram_data, on="patient_id", how="inner")
    .drop(columns=["patient_id"])
    .pipe(filter_insured_patients)
    .dropna(axis=0)
    .copy()
)

# Sanity check: resulting shape and first rows.
print("final_df shape:", final_df.shape)
final_df.head()

final_df shape: (2105, 27)


Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,health_insurance
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,62,126,2,0,120,137,121,73,1,1
1,132,0.006,0.0,0.006,0.003,0.0,0.0,17,2.1,0,...,68,198,6,1,141,136,140,12,0,1
2,133,0.003,0.0,0.008,0.003,0.0,0.0,16,2.1,0,...,68,198,5,1,141,135,138,13,0,1
3,134,0.003,0.0,0.008,0.003,0.0,0.0,16,2.4,0,...,53,170,11,0,137,134,137,13,1,1
4,132,0.007,0.0,0.008,0.0,0.0,0.0,16,2.4,0,...,53,170,9,0,137,136,138,11,1,1


In [22]:
# -----------------------------------
# 3. Prepare Features and Target

TARGET = "fetal_health"

# Every column other than the label is a model input, in final_df's column order.
FEATURES = final_df.columns[final_df.columns != TARGET].tolist()

# Fail loudly if the label column is missing before it is selected below.
assert TARGET in final_df.columns, "CRITICAL: fetal_health target column not found in final_df."

# Independent copies so later edits to X / y cannot touch final_df.
X = final_df.loc[:, FEATURES].copy()
y = final_df.loc[:, TARGET].copy()

print("Number of features:", len(FEATURES))
print("X shape:", X.shape)
print("y shape:", y.shape)

# Distinct label values present in the target, in ascending order.
print("Target classes:", sorted(y.unique()))


Number of features: 26
X shape: (2105, 26)
y shape: (2105,)
Target classes: [np.int64(1), np.int64(2), np.int64(3)]


In [23]:
# -----------------------------------
# 4. Train/Test Split (70/30)

RANDOM_SEED = 42

# Shuffle the rows and hold out 30% of them for evaluation; the fixed seed
# makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=RANDOM_SEED,
)

print("Train/Test split (rows):", len(X_train), "/", len(X_test))


Train/Test split (rows): 1473 / 632


In [24]:
# -----------------------------------
# 5. Train RandomForestClassifier + Evaluation (F1 + AUC)

# Fit a 300-tree random forest on the training split. fit() returns the
# estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities on the held-out rows.
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)

# Macro-averaged F1 over the target classes.
f1 = f1_score(y_test, y_pred, average="macro")

# Multiclass AUC computed one-vs-rest from the class probabilities.
# NOTE(review): binding the result to `auc` rebinds the `auc` function imported
# from sklearn.metrics in the import cell. The name is kept because the
# deliverables cell reads it; do not call sklearn's auc() after this cell.
auc = roc_auc_score(y_test, y_proba, multi_class="ovr")

print("F1 (macro):", f1)
print("AUC (ovr):", auc)


F1 (macro): 0.8940986472286047
AUC (ovr): 0.9878347180006205


In [25]:
# -----------------------------------
# 6. Deliverables

# Deliverable 1: feature name -> importance from the fitted forest, rounded to
# 5 decimals. FEATURES is the column list the model was trained on, so it
# lines up positionally with feature_importances_.
importances = rf_model.feature_importances_

feature_importance_dict = dict(
    zip(FEATURES, (round(float(value), 5) for value in importances))
)

# Deliverable 2: evaluation scores as plain Python floats, rounded to 5 decimals.
model_quality = dict(
    f1=round(float(f1), 5),
    auc=round(float(auc), 5),
)

# Deliverable 3: number of rows per fetal_health class, ordered by class label,
# as a two-column frame.
fetal_status = (
    final_df["fetal_health"]
    .value_counts()
    .sort_index()
    .reset_index()
    .set_axis(["fetal_health", "count"], axis=1)
)

print("=== feature_importance_dict (sample) ===")
print(feature_importance_dict)

print("\n=== model_quality ===")
print(model_quality)

print("\n=== fetal_status ===")
print(fetal_status)


=== feature_importance_dict (sample) ===
{'baseline value': 0.03411, 'accelerations': 0.03331, 'fetal_movement': 0.0155, 'uterine_contractions': 0.0247, 'light_decelerations': 0.00546, 'severe_decelerations': 0.00029, 'prolongued_decelerations': 0.0385, 'abnormal_short_term_variability': 0.1031, 'mean_value_of_short_term_variability': 0.0988, 'percentage_of_time_with_abnormal_long_term_variability': 0.09199, 'mean_value_of_long_term_variability': 0.04015, 'MajorDecelBurden': 0.03871, 'VariabilityAbnormalityIndex': 0.12912, 'TotalDecelerations': 0.00874, 'ReassuranceRatio': 0.03089, 'histogram_width': 0.02905, 'histogram_min': 0.02733, 'histogram_max': 0.02505, 'histogram_number_of_peaks': 0.0184, 'histogram_number_of_zeroes': 0.00562, 'histogram_mode': 0.04659, 'histogram_mean': 0.07398, 'histogram_median': 0.04725, 'histogram_variance': 0.02622, 'histogram_tendency': 0.00713, 'health_insurance': 0.0}

=== model_quality ===
{'f1': 0.8941, 'auc': 0.98783}

=== fetal_status ===
   fetal_

In [22]:
# -----------------------------------
# 7. Variable Serialization (required)

# Serialize fetal_status to the split-dict form ({"index", "columns", "data"}).
# The isinstance guard makes the cell safe to re-run: after the first run the
# name already holds a dict, and calling .to_dict on it would raise
# AttributeError.
if isinstance(fetal_status, pd.DataFrame):
    fetal_status = fetal_status.to_dict(orient="split")
