# ML Base Models


## 
---

In [1]:
import mlflow
import os
TRACKING_DB = "sqlite:///D:/Project/Predictive_maintaince/Data/mlflow.db"
ARTIFACT_ROOT = "file:///D:/Project/Predictive_maintaince/mlartifacts"
mlflow.set_tracking_uri(TRACKING_DB)
experiment_name = "compressor_fault_model_v1"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        name=experiment_name,
        artifact_location=ARTIFACT_ROOT
    )
else:
    experiment_id = experiment.experiment_id
mlflow.set_experiment(experiment_name)


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix,f1_score,accuracy_score
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import plotly.subplots as sp

2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/24 00:31:33 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/24 00:31:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/24 00:31:33 INFO mlflow.store.db.utils: Updating database tables
2026/01/24 00:31:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/24 00:31:33 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/24 00:31:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/24 00:31:33 INFO alembic.runtime

In [2]:
val_df = pd.read_csv('../Data/processed/val_data.csv')
train_df = pd.read_csv('../Data/processed/train_data.csv')

In [3]:
FEATURES=["T_amb",
"T_evap_sat",
"P_dis_bar",
"P_suc_bar",
"P_comp_W",
"N_comp_Hz",
"delta_cond_evap",
"cooling_efficiency",
"P_dis_bar_rolling_mean",
"P_suc_bar_rolling_std",
"COP_diff",
"N_comp_Hz_diff",
"door_open",
"frost_level"]


In [4]:

def create_windows(df, window_size=60, stride=15):
    X, y = [], []

    for run_id in df['run_id'].unique():
        run_data = df[df['run_id'] == run_id]

        values = run_data[FEATURES].values
        label = run_data['fault_id'].iloc[0]

        for start in range(0, len(values) - window_size, stride):
            end = start + window_size

            X.append(values[start:end])
            y.append(label)

    return np.array(X), np.array(y)


In [5]:
X_train, y_train = create_windows(train_df)
X_val, y_val     = create_windows(val_df)

print(X_train.shape)
print(y_train.shape)


(95679, 60, 14)
(95679,)


In [6]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat   = X_val.reshape(X_val.shape[0], -1)

print(X_train_flat.shape)


(95679, 840)


In [7]:
scaler = StandardScaler()

X_train_flat = scaler.fit_transform(X_train_flat)
X_val_flat   = scaler.transform(X_val_flat)

## Random-forest

In [8]:
# mlflow.sklearn.autolog()

with mlflow.start_run(run_name="random_forest_model_v1"):
    mlflow.log_param("model","RandomForest")
    mlflow.log_param("n_estimators",200)
    mlflow.log_param("max_depth",20)
    mlflow.log_param("random_state",42)
    mlflow.log_param("window_size",60)
    mlflow.log_param("stride",15)
    mlflow.log_param("RUN_TYPE","CPU")
    
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        n_jobs=-1,
        random_state=42
)


    rf.fit(X_train_flat, y_train)
    y_pred = rf.predict(X_val_flat)
    
    acc_rf = accuracy_score(y_val, y_pred)
    f1_rf = f1_score(y_val, y_pred, average='macro')
    cls_rf = classification_report(y_val, y_pred)
    cm_rf = confusion_matrix(y_val, y_pred)
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.log_metric("f1_macro", f1_rf)
    mlflow.log_text(cls_rf, "classification_report.txt")
    mlflow.log_text(str(cm_rf), "confusion_matrix.txt")
    fig = px.imshow(
    cm_rf,
    text_auto=True,
    title="Confusion Matrix — Random Forest Baseline",
    color_continuous_scale="Blues"
)
    fig.write_image(r"../Plots/Base-Models-Benchmarking/Random_Forest_Confusion_Matrix.png")
    mlflow.log_artifact(r"../Plots/Base-Models-Benchmarking/Random_Forest_Confusion_Matrix.png")
    fig.show()
    mlflow.sklearn.log_model(rf, "random_forest_model")
    mlflow.end_run()



##
---

## CatBoost

In [9]:
from catboost import CatBoostClassifier


In [10]:
with mlflow.start_run(run_name="catboost_model_v1"):
    mlflow.log_param("model","CatBoost")
    mlflow.log_param("iterations",1000)
    mlflow.log_param("depth",8)
    mlflow.log_param("learning_rate",0.05)
    mlflow.log_param("loss_function",'multi-class')
    mlflow.log_param("window_size",60)
    mlflow.log_param("stride",15)
    mlflow.log_param("RUN_TYPE","GPU")
    
    model = CatBoostClassifier(
        iterations=1000,
        depth=8,
        learning_rate=0.05,
        loss_function='MultiClass',
        verbose=1,
        task_type="GPU"
    )
    
    model.fit(X_train_flat, y_train)
    y_pred = model.predict(X_val_flat)
    
    acc_cat = accuracy_score(y_val, y_pred)
    f1_cat = f1_score(y_val, y_pred, average='macro')
    cls_cat = classification_report(y_val, y_pred)
    cm_cat = confusion_matrix(y_val, y_pred)
    mlflow.log_metric("accuracy", acc_cat)
    mlflow.log_metric("f1_macro", f1_cat)
    mlflow.log_text(cls_cat, "classification_report.txt")
    mlflow.log_text(str(cm_cat), "confusion_matrix.txt")
    fig = px.imshow(
    cm_cat,
    text_auto=True,
    title="Confusion Matrix — CatBoost Baseline",
    color_continuous_scale="Blues"
)
    fig.write_image(r"../Plots/Base-Models-Benchmarking/CatBoost_Confusion_Matrix.png")
    mlflow.log_artifact(r"../Plots/Base-Models-Benchmarking/CatBoost_Confusion_Matrix.png")
    fig.show()
    
    mlflow.catboost.log_model(model, "catboost_model")
    mlflow.end_run()


0:	learn: 2.3640069	total: 278ms	remaining: 4m 38s
1:	learn: 2.2247897	total: 505ms	remaining: 4m 12s
2:	learn: 2.1187290	total: 755ms	remaining: 4m 10s
3:	learn: 2.0299460	total: 991ms	remaining: 4m 6s
4:	learn: 1.9543477	total: 1.23s	remaining: 4m 4s
5:	learn: 1.8865017	total: 1.45s	remaining: 4m
6:	learn: 1.8300592	total: 1.69s	remaining: 4m
7:	learn: 1.7768989	total: 1.92s	remaining: 3m 58s
8:	learn: 1.7270406	total: 2.17s	remaining: 3m 58s
9:	learn: 1.6780096	total: 2.39s	remaining: 3m 57s
10:	learn: 1.6353138	total: 2.62s	remaining: 3m 55s
11:	learn: 1.5959891	total: 2.87s	remaining: 3m 56s
12:	learn: 1.5587093	total: 3.11s	remaining: 3m 55s
13:	learn: 1.5250607	total: 3.34s	remaining: 3m 55s
14:	learn: 1.4952109	total: 3.58s	remaining: 3m 54s
15:	learn: 1.4654176	total: 3.83s	remaining: 3m 55s
16:	learn: 1.4394262	total: 4.05s	remaining: 3m 54s
17:	learn: 1.4133720	total: 4.29s	remaining: 3m 53s
18:	learn: 1.3884720	total: 4.53s	remaining: 3m 53s
19:	learn: 1.3666483	total: 4.77



##
---

## XGBoost

In [11]:
import xgboost as xgb

In [12]:
with mlflow.start_run(run_name="xgboost_model_v1"):
    mlflow.log_param("model","XGBoost")
    mlflow.log_param("n_estimators",500)
    mlflow.log_param("RUN_TYPE","GPU")
    
    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax",
        eval_metric="mlogloss",
        tree_method="hist",
        device="cuda",
        verbosity=1
    )

    
    model.fit(X_train_flat, y_train)
    y_pred = model.predict(X_val_flat)
    
    acc_xgb = accuracy_score(y_val, y_pred)
    f1_xgb = f1_score(y_val, y_pred, average='macro')
    cls_xgb = classification_report(y_val, y_pred)
    cm_xgb = confusion_matrix(y_val, y_pred)
    mlflow.log_metric("accuracy", acc_xgb)
    mlflow.log_metric("f1_macro", f1_xgb)
    mlflow.log_text(cls_xgb, "classification_report.txt")
    mlflow.log_text(str(cm_xgb), "confusion_matrix.txt")
    
    fig = px.imshow(
    cm_xgb,
    text_auto=True,
    title="Confusion Matrix — XGBoost Baseline",
    color_continuous_scale="Blues"
)
    fig.write_image(r"../Plots/Base-Models-Benchmarking/XGBoost_Confusion_Matrix.png")
    mlflow.log_artifact(r"../Plots/Base-Models-Benchmarking/XGBoost_Confusion_Matrix.png")
    fig.show()
    mlflow.xgboost.log_model(model, "xgboost_model")
    mlflow.end_run()

    


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.







##
-------

## Score table
