In [3]:
import pip
import pickle
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from xgboost import XGBClassifier

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from mlflow.models.signature import infer_signature

  import pkg_resources


In [4]:
# Point the MLflow client at the tracking server at http://localhost:5000.
mlflow.set_tracking_uri('http://localhost:5000')
# Select the experiment under which runs started later in this notebook are recorded.
mlflow.set_experiment('Spine-disease-exp')
# Enable MLflow's scikit-learn autologging before any model is fitted.
mlflow.sklearn.autolog()

In [5]:
# Relative path to the spine dataset CSV (a local file path, despite the name `url`).
url = './data/Dataset_spine.csv'

def read_dataframe(url):
    """Load a CSV into a pandas DataFrame.

    Parameters
    ----------
    url : str, path-like or file-like
        Location of the CSV; passed straight through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV, with no further processing applied.
    """
    return pd.read_csv(url)

In [6]:
def preprocessing(df, test_size=0.2, random_state=42):
    """Normalise column names, encode the target and split into train/test sets.

    The input frame is not modified: all work happens on a copy, so the
    function can be called repeatedly on the same DataFrame (e.g. when a
    notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. After lower-casing column names and replacing spaces with
        underscores it must contain a ``class_att`` column whose values are
        ``'Abnormal'`` or ``'Normal'``. Every other column is used as a feature.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed passed to ``train_test_split`` for a reproducible split.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature and label arrays; labels are 0 for ``'Abnormal'`` and 1 for
        ``'Normal'``. The split is stratified on the label.

    Raises
    ------
    KeyError
        If there is no ``class_att`` column after name normalisation.
    ValueError
        If ``class_att`` contains a value other than ``'Abnormal'``/``'Normal'``
        (including missing values).
    """
    # Copy first: mutating the caller's frame would make a second call fail,
    # because the labels would already have been replaced by integers.
    data = df.copy()
    data.columns = data.columns.str.lower().str.replace(' ', '_')

    label_map = {'Abnormal': 0, 'Normal': 1}
    encoded = data['class_att'].map(label_map)

    # `map` turns anything outside label_map into NaN; report that explicitly
    # instead of letting the integer cast fail with an opaque message.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(data.loc[unmapped, 'class_att'].astype(str).unique().tolist())
        raise ValueError(
            f"Unexpected values in 'class_att': {unexpected}; "
            f"expected one of {list(label_map)}"
        )

    # Select the target by name rather than by position so that a reordered
    # CSV cannot silently leak the label into the feature matrix.
    y = encoded.astype(int).values
    X = data.drop(columns=['class_att']).values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    return X_train, X_test, y_train, y_test

In [8]:
def train_model(X_train, X_test, y_train, y_test, model_path='models/xgbclassifier.pkl'):
    """Train the final XGBoost classifier and record it in MLflow.

    Inside a single MLflow run named ``xgb_Final_Model_Training`` this fits an
    ``XGBClassifier`` with fixed hyperparameters, logs accuracy / ROC-AUC / F1
    on the test split, pickles the fitted model to ``model_path`` and uploads
    that file as a run artifact under ``local_model``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels.
    X_test, y_test : array-like
        Held-out features and labels. They are passed as ``eval_set`` during
        fitting and used for the logged test metrics.
    model_path : str or path-like, default 'models/xgbclassifier.pkl'
        Where the pickled model is written. Missing parent directories are
        created.

    Returns
    -------
    XGBClassifier
        The fitted model.
    """
    # Local import keeps this cell self-contained; pathlib is standard library.
    from pathlib import Path

    # Fixed hyperparameters for the final model.
    # NOTE(review): presumably the result of an earlier hyperopt search -- confirm provenance.
    best_params = {
        'n_estimators': 150,
        'max_depth': 10,
        'learning_rate': 0.1086,
        'subsample': 0.7775,
        'colsample_bytree': 0.6674,
        'gamma': 3.2692,
    }

    with mlflow.start_run(run_name="xgb_Final_Model_Training"):
        final_model = XGBClassifier(
            **best_params,
            random_state=42,
            eval_metric='logloss',
            use_label_encoder=False
        )

        # The test split is supplied as eval_set so per-round logloss is
        # reported; no early stopping is configured here.
        final_model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=True
        )

        # Log test metrics explicitly rather than relying on autologging alone.
        y_pred = final_model.predict(X_test)
        y_proba = final_model.predict_proba(X_test)[:, 1]

        mlflow.log_metrics({
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_auc": roc_auc_score(y_test, y_proba),
            "test_f1": f1_score(y_test, y_pred)
        })

        # Create the target directory so a fresh checkout does not fail on open().
        Path(model_path).parent.mkdir(parents=True, exist_ok=True)
        with open(model_path, 'wb') as f_out:
            # Persist the fitted estimator (previously referenced an undefined name).
            pickle.dump(final_model, f_out)

        mlflow.log_artifact(local_path=str(model_path), artifact_path='local_model')

    print("Final model trained and logged successfully!")
    return final_model

In [None]:
# Run the pipeline end to end: load the CSV, build the splits, train and log.
df = read_dataframe(url)
X_train, X_test, y_train, y_test = preprocessing(df)
# Keep the result under its own name: assigning it back to `train_model`
# would replace the function and break this cell on a second run.
final_model = train_model(X_train, X_test, y_train, y_test)