In [17]:
import json
import os
import sys

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Must run before the project imports below so they resolve from the repo root.
sys.path.append("../..")

DATA_DIR = "../data"

from early_diagnosis.data_loader.loader import load_data
from early_diagnosis.data_loader.source import EarlyDiagnosisCPRDSource
from abstract_models.imputation import median_imputer_missing

In [18]:
# Load the processed cohort table through the project loader.
df = load_data(os.path.join(DATA_DIR, "processed", "early_diagnosis_NT.csv"))

# Read the attribute selections inside a context manager so the file handle is
# closed deterministically; JSON is decoded as UTF-8 regardless of the
# platform's default encoding.
with open(os.path.join(DATA_DIR, "expert_attr_selection.json"), encoding="utf-8") as attr_file:
    attr_selections = json.load(attr_file)

# Name of the label column used as the prediction target in the cells below.
target = "Dia_HFD_12M"

In [19]:
# Fixed seed so that refitting the forest on the same data reproduces the same
# model after a kernel restart.
RANDOM_STATE = 42

# Preprocessing step imported from abstract_models.imputation; it is handed to
# the pipeline unchanged.
imputer = median_imputer_missing
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Two-stage estimator: the preprocessor runs first, then the classifier.
pipeline = Pipeline(steps=[('preprocessor', imputer), ('classifier', model)])


In [20]:
# Keep only rows whose target label is present.
df_step = df.dropna(subset=[target])

# Remove all rows that are target-negative but flagged in "Dia_HFD_patient"
# (patients that are eventually diagnosed). Filtering with a boolean mask
# instead of dropping by index label ensures that rows which merely share an
# index label with a matching row are not removed as well.
eventually_diagnosed = df_step[target].eq(0) & df_step["Dia_HFD_patient"].eq(1)
df_step = df_step.loc[~eventually_diagnosed]

# Count positives directly rather than via value_counts()[1], which raises an
# opaque KeyError when the label 1 never occurs.
n_positive = int(df_step[target].eq(1).sum())
if n_positive == 0:
    raise ValueError(f"No positive rows found for target {target!r}; cannot build a balanced set.")

# Negatives: rows with Dia_HFD_patient == 0, keeping the n_positive rows with
# the largest "days_in_db" so both classes have (at most) the same size.
negative_df = (
    df_step.loc[df_step["Dia_HFD_patient"].eq(0)]
    .sort_values(by="days_in_db", ascending=False)
    .iloc[:n_positive]
)

# Stack positives on top of the selected negatives.
balanced_df = pd.concat(
    [
        df_step.loc[df_step[target].eq(1)],
        negative_df,
    ],
    axis=0,
)

data_source = EarlyDiagnosisCPRDSource(balanced_df, target=target)

In [21]:
X, y = data_source.xy()

# Drop these columns up front so they cannot be selected as features even if
# they appear in the expert attribute list.
X = X.drop(
    columns=[
        "ID", "date", 'days_to_HFD', 'days_in_observation', "days_in_db",
        'Dia_HFD_patient', 'Dia_HFD_event',
    ]
)

# Keep the expert-selected attributes that exist in X, in the order given by
# the selection file. A set intersection would yield a column order that
# changes between kernel sessions (string hash randomisation), which makes the
# fitted model and its expected input layout non-reproducible.
expert_attrs = [col for col in dict.fromkeys(attr_selections["expert"]) if col in X.columns]

# Always include 'Med_LD_permanent'; dict.fromkeys de-duplicates so the column
# is not selected twice when the expert list already contains it.
attrs = list(dict.fromkeys(expert_attrs + ['Med_LD_permanent']))
X = X.loc[:, attrs].rename(columns={"Med_LD_permanent": "Med_LD"})


In [23]:
# Fit the preprocessing + classifier pipeline on the selected features.
pipeline.fit(X, y=y)

In [27]:
# Tracking server location. The MLFLOW_TRACKING_URI environment variable takes
# precedence so the notebook is not tied to a server on this machine; the
# local address remains the default.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("early_diagnosis_NT")

# Start an MLflow run
with mlflow.start_run() as run:
    # Log the fitted pipeline and register it in the model registry under
    # the name "EarlyDiagnosis" (registration is optional in MLflow).
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="EarlyDiagnosis",
    )

    # Record which estimator and preprocessing step produced this model.
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("preprocessing", "median_imputer_missing")

Successfully registered model 'EarlyDiagnosis'.
2025/10/30 14:01:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: EarlyDiagnosis, version 1
Created version '1' of model 'EarlyDiagnosis'.
