In [1]:
# Standard library
import os
import json
import sys

# Third-party
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Add the directory two levels up to the module search path.
# NOTE(review): presumably this is what makes the project imports below
# resolvable, so it has to stay ahead of them — confirm.
sys.path.append("../..")

# Root of the data files read by the following cells (relative path, so it is
# resolved against the kernel's current working directory).
DATA_DIR = "../data"

# Project-local modules
from early_diagnosis.data_loader.loader import load_data
from early_diagnosis.data_loader.source import EarlyDiagnosisCPRDSource
from abstract_models.imputation import median_imputer_missing

In [2]:
# Load the processed, balanced dataset through the project loader.
balanced_df = load_data(os.path.join(DATA_DIR, "processed", "balanced_ED_NT.csv"))

# Read the attribute selections inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(os.path.join(DATA_DIR, "expert_attr_selection.json")) as attr_file:
    attr_selections = json.load(attr_file)

# Name of the target passed to the data source in a later cell.
target = "Dia_HFD_12M"

In [3]:
# Fixed seed so that Restart & Run All rebuilds the same forest; without it the
# fitted (and later registered) model differs on every execution.
RANDOM_STATE = 42

# Preprocessing step supplied by the project's imputation module.
imputer = median_imputer_missing
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Two-step pipeline: preprocessing first, then the random-forest classifier.
pipeline = Pipeline(steps=[('preprocessor', imputer), ('classifier', model)])


In [4]:
# Wrap the loaded frame in the project's data source, passing `target` as the
# target name; a later cell calls `.xy()` on it to obtain X and y.
data_source = EarlyDiagnosisCPRDSource(balanced_df, target=target)

In [5]:
# Feature matrix and target as provided by the data source.
X, y = data_source.xy()

# Remove the columns that are not used as model inputs.
X = X.drop(
    columns=[
        "ID", "date", 'days_to_HFD', 'days_in_observation', "days_in_db",
        'Dia_HFD_patient', 'Dia_HFD_event',
    ]
)

# Keep the expert-selected attributes that exist in X, iterating over X's own
# column order so the feature order is the same on every run (building the list
# from a set made it depend on string hashing, which varies between sessions).
# "Blo_NT" is excluded by filtering, so a missing column no longer raises, and
# "Med_LD_permanent" is filtered out here and appended exactly once below so it
# cannot end up duplicated if the expert list already contains it.
expert_attrs = set(attr_selections["expert"])
excluded_attrs = {"Blo_NT", "Med_LD_permanent"}
attrs = [col for col in X.columns if col in expert_attrs and col not in excluded_attrs]
attrs.append('Med_LD_permanent')

# Select the features and expose "Med_LD_permanent" under the name "Med_LD".
X = X.loc[:, attrs].rename(columns={"Med_LD_permanent": "Med_LD"})


In [6]:
# Fit both pipeline steps (preprocessor, then classifier) on the selected
# features X and target y.
pipeline.fit(X,y)

In [7]:
import mlflow
import mlflow.sklearn

# Point the client at the local tracking server and select the experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("early_diagnosis_NT")

# Start an MLflow run
with mlflow.start_run() as run:
    # Log the fitted pipeline and register it in the model registry
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="EarlyDiagnosis_M0"  # Optional: Name in the model registry
    )

    # Descriptive settings of this run are parameters.
    mlflow.log_params({
        "model_type": "RandomForest",
        "preprocessing": "median_imputer_missing",
        "dataset": "balanced no NT",
    })

    # Evaluation results are metrics, not parameters: logging them with
    # log_metrics stores them as numeric metrics instead of parameter values.
    # NOTE(review): these values are hard-coded and are not computed anywhere in
    # the visible cells — confirm they belong to this fitted model.
    mlflow.log_metrics({
        "Accuracy": 0.78,
        "AUC": 0.86,
        "Sensitivity": 0.75,
        "Specificity": 0.82,
    })

Successfully registered model 'EarlyDiagnosis_M0'.
2025/12/08 09:23:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EarlyDiagnosis_M0, version 1
Created version '1' of model 'EarlyDiagnosis_M0'.


🏃 View run indecisive-cub-341 at: http://127.0.0.1:5000/#/experiments/474062631850540483/runs/dfcf410e73a54c21b78745cf2c66367b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474062631850540483
