In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fetal Health Prediction\n",
    "\n",
    "This notebook implements the data processing, modeling, and evaluation pipeline for the Fetal Health Prediction task."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import json\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score, roc_auc_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look under 'data/' first; if a CSV is missing there, read BOTH files from the working directory instead\n",
    "try:\n",
    "    # File names follow the standard naming ('medical_data.csv', 'histogram_data.csv')\n",
    "    medical_data_file = pd.read_csv(\"data/medical_data.csv\")\n",
    "    histogram_data = pd.read_csv(\"data/histogram_data.csv\")\n",
    "except FileNotFoundError:\n",
    "    # Reload the pair together so both frames always come from the same location\n",
    "    medical_data_file = pd.read_csv(\"medical_data.csv\")\n",
    "    histogram_data = pd.read_csv(\"histogram_data.csv\")\n",
    "    print(\"Loaded files from root directory.\")\n",
    "else:\n",
    "    print(\"Loaded files from data/ directory.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Helper Functions & Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_engineered_features(df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Return a copy of `df` extended with four engineered columns.\n",
    "\n",
    "    Columns are added in this order:\n",
    "      - MajorDecelBurden: severe_decelerations + prolongued_decelerations\n",
    "      - VariabilityAbnormalityIndex: abnormal_short_term_variability\n",
    "        + percentage_of_time_with_abnormal_long_term_variability\n",
    "      - TotalDecelerations: light_decelerations + severe_decelerations\n",
    "        + prolongued_decelerations\n",
    "      - ReassuranceRatio: accelerations / (TotalDecelerations + eps)\n",
    "\n",
    "    `eps` is added to the ratio's denominator so rows with zero total\n",
    "    decelerations do not divide by zero. The input frame is not modified.\n",
    "    \"\"\"\n",
    "    severe = df[\"severe_decelerations\"]\n",
    "    prolonged = df[\"prolongued_decelerations\"]  # spelling follows the column name\n",
    "    total_decels = df[\"light_decelerations\"] + severe + prolonged\n",
    "\n",
    "    # assign() works on a copy and keeps the keyword order as the column order\n",
    "    return df.assign(\n",
    "        MajorDecelBurden=severe + prolonged,\n",
    "        VariabilityAbnormalityIndex=(\n",
    "            df[\"abnormal_short_term_variability\"]\n",
    "            + df[\"percentage_of_time_with_abnormal_long_term_variability\"]\n",
    "        ),\n",
    "        TotalDecelerations=total_decels,\n",
    "        ReassuranceRatio=df[\"accelerations\"] / (total_decels + eps),\n",
    "    )\n",
    "\n",
    "def filter_insured_patients(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Keep only the rows whose `health_insurance` value reads as true.\n",
    "\n",
    "    Each value is interpreted as follows (strings are stripped and lower-cased):\n",
    "      - Python bools are used as-is\n",
    "      - 'false' / '0' / 'no' / 'n'  -> uninsured (row removed)\n",
    "      - 'true' / '1' / 'yes' / 'y'  -> insured (row kept)\n",
    "      - anything else is parsed as a number and truncated to int;\n",
    "        non-zero -> insured, zero -> uninsured\n",
    "      - missing or unparseable values count as unknown and the row is removed\n",
    "\n",
    "    Returns a new DataFrame with the surviving rows, keeping their index labels\n",
    "    and all original columns. The input frame is not modified.\n",
    "\n",
    "    Raises KeyError if `df` has no `health_insurance` column.\n",
    "    \"\"\"\n",
    "    falsy_tokens = {\"false\", \"0\", \"no\", \"n\"}\n",
    "    truthy_tokens = {\"true\", \"1\", \"yes\", \"y\"}\n",
    "\n",
    "    def _to_boolish(x):\n",
    "        if pd.isna(x):\n",
    "            return np.nan\n",
    "        if isinstance(x, bool):\n",
    "            return x\n",
    "        s = str(x).strip().lower()\n",
    "        if s in falsy_tokens:\n",
    "            return False\n",
    "        if s in truthy_tokens:\n",
    "            return True\n",
    "        try:\n",
    "            # Numeric fallback, e.g. '1.0' -> True, '0.0' -> False (fractions truncate)\n",
    "            return bool(int(float(s)))\n",
    "        except (ValueError, OverflowError):\n",
    "            # Not a number (ValueError) or infinite (OverflowError): treat as unknown\n",
    "            return np.nan\n",
    "\n",
    "    # Build the mask as a standalone Series instead of a temporary column, so an\n",
    "    # existing 'health_insurance_bool' column is neither overwritten nor dropped.\n",
    "    insured_mask = df[\"health_insurance\"].apply(_to_boolish).eq(True)\n",
    "\n",
    "    return df.loc[insured_mask].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Data Processing Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EPS = 1e-6\n",
    "\n",
    "# Step 1: derive the engineered columns on the medical table\n",
    "medical_processed = add_engineered_features(medical_data_file, eps=EPS)\n",
    "\n",
    "# Steps 2-4 as one chain; every call returns a new frame:\n",
    "#   inner-join the histogram table on patient_id\n",
    "#   -> drop the join key once the merge is done\n",
    "#   -> keep insured patients only\n",
    "#   -> discard rows that contain any missing value\n",
    "final_df = (\n",
    "    medical_processed\n",
    "    .merge(histogram_data, on=\"patient_id\", how=\"inner\")\n",
    "    .drop(columns=[\"patient_id\"])\n",
    "    .pipe(filter_insured_patients)\n",
    "    .dropna(axis=0)\n",
    "    .copy()\n",
    ")\n",
    "\n",
    "print(\"Final dataset shape:\", final_df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Model Training & Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TARGET = \"fetal_health\"\n",
    "FEATURES = [col for col in final_df.columns if col != TARGET]\n",
    "\n",
    "# Design matrix and label vector, copied so later steps cannot touch final_df\n",
    "X = final_df.loc[:, FEATURES].copy()\n",
    "y = final_df.loc[:, TARGET].copy()\n",
    "\n",
    "# Shuffled 70/30 hold-out split with a fixed seed (42)\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.30,\n",
    "    shuffle=True,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "# Random forest: 300 trees, fixed seed, all available cores\n",
    "rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)\n",
    "rf_model.fit(X_train, y_train)\n",
    "\n",
    "# Hard labels feed the F1 score; per-class probabilities feed the AUC\n",
    "y_pred = rf_model.predict(X_test)\n",
    "y_proba = rf_model.predict_proba(X_test)\n",
    "\n",
    "# Macro-averaged F1 and one-vs-rest ROC AUC on the held-out rows\n",
    "f1 = f1_score(y_test, y_pred, average=\"macro\")\n",
    "auc = roc_auc_score(y_test, y_proba, multi_class=\"ovr\")\n",
    "\n",
    "print(f\"F1 (macro): {f1}\")\n",
    "print(f\"AUC (ovr): {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Deliverables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Feature importance: feature name -> importance as a plain float, 5 decimals\n",
    "importances = rf_model.feature_importances_\n",
    "feature_importance_dict = dict(\n",
    "    zip(FEATURES, (round(float(score), 5) for score in importances))\n",
    ")\n",
    "\n",
    "# 2. Model quality: both metrics as plain floats, 5 decimals\n",
    "model_quality = {\n",
    "    metric: round(float(value), 5)\n",
    "    for metric, value in ((\"f1\", f1), (\"auc\", auc))\n",
    "}\n",
    "\n",
    "# 3. Row count per fetal_health class, ordered by class label\n",
    "class_counts = final_df[\"fetal_health\"].value_counts().sort_index()\n",
    "fetal_status_df = class_counts.reset_index()\n",
    "# Name the columns explicitly rather than relying on reset_index defaults\n",
    "fetal_status_df.columns = [\"fetal_health\", \"count\"]\n",
    "\n",
    "# Convert the DataFrame to a dict ('split' orientation) for output\n",
    "fetal_status = fetal_status_df.to_dict(orient=\"split\")\n",
    "\n",
    "print(\"Model Quality:\", model_quality)\n",
    "print(\"Fetal Status:\", fetal_status)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# === CRITICAL: SAVE VARIABLES FOR TEST HARNESS ===\n",
    "\n",
    "# Serialize final_df for verifier ('split' orientation, same as fetal_status)\n",
    "final_df_serialized = final_df.to_dict(orient=\"split\")\n",
    "\n",
    "# Define the variables to save\n",
    "# Note: fetal_status is already serialized to dict above\n",
    "notebook_vars = {\n",
    "    \"feature_importance_dict\": feature_importance_dict,\n",
    "    \"model_quality\": model_quality,\n",
    "    \"fetal_status\": fetal_status,\n",
    "    \"final_df\": final_df_serialized\n",
    "}\n",
    "\n",
    "# Preferred output directory. Fall back to the working directory when it cannot\n",
    "# be created OR exists but is not writable.\n",
    "#   - exist_ok=True removes the race between an existence check and the creation\n",
    "#   - OSError covers PermissionError plus read-only filesystems and the case\n",
    "#     where the path exists but is not a directory\n",
    "verifier_dir = \"/logs/verifier\"\n",
    "try:\n",
    "    os.makedirs(verifier_dir, exist_ok=True)\n",
    "except OSError:\n",
    "    verifier_dir = \".\"\n",
    "else:\n",
    "    if not os.access(verifier_dir, os.W_OK):\n",
    "        verifier_dir = \".\"\n",
    "\n",
    "# Save the JSON\n",
    "with open(f\"{verifier_dir}/notebook_variables.json\", \"w\") as f:\n",
    "    json.dump(notebook_vars, f, indent=2)\n",
    "\n",
    "print(f\"Variables saved to {verifier_dir}/notebook_variables.json\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}