In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv("train.csv")
X_train, y_train = train['text'], train['spam']

test = pd.read_csv("test.csv")
X_test, y_test = test['text'], test['spam']

In [3]:
# Function to train and log models with MLflow
def train_and_log_model(model, model_name):
    with mlflow.start_run():

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=2000)),
            ('clf', model),
        ])
        pipeline.fit(X_train, y_train)
        
        # Log the model
        mlflow.sklearn.log_model(pipeline, model_name)
        
        # Evaluate and log metrics
        y_pred = pipeline.predict_proba(X_test)[:, 1]
        aucpr = roc_auc_score(y_test, y_pred)
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.log_params(pipeline.get_params())
        mlflow.register_model(mlflow.get_artifact_uri("model"),model_name)
        
        return aucpr

In [4]:
mlflow.set_experiment("experiment1_spamdetection")

2024/02/23 16:42:10 INFO mlflow.tracking.fluent: Experiment with name 'experiment1_spamdetection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///e:/VSCode/Applied%20ML/Assignment-2-DVC/mlruns/551097864478859455', creation_time=1708686730890, experiment_id='551097864478859455', last_update_time=1708686730890, lifecycle_stage='active', name='experiment1_spamdetection', tags={}>

In [5]:
# Train and log the Random Forest model
rf_model = RandomForestClassifier()
rf_aucpr = train_and_log_model(rf_model, "RandomForest")

# Train and log the XGBoost model
xgb_model = XGBClassifier()
xgb_aucpr = train_and_log_model(xgb_model, "XGBoost")

# Train and log the Logistic Regression model
lr_model = LogisticRegression()
lr_aucpr = train_and_log_model(lr_model, "LogisticRegression")

# Print AUCPR for each model
print(f"AUCPR for RandomForest: {rf_aucpr}")
print(f"AUCPR for XGBoost: {xgb_aucpr}")
print(f"AUCPR for LogisticRegression: {lr_aucpr}")


Successfully registered model 'RandomForest'.
Created version '1' of model 'RandomForest'.
Successfully registered model 'XGBoost'.
Created version '1' of model 'XGBoost'.


AUCPR for RandomForest: 0.9957066189624328
AUCPR for XGBoost: 0.9970197959464506
AUCPR for LogisticRegression: 0.9993827281841593


Successfully registered model 'LogisticRegression'.
Created version '1' of model 'LogisticRegression'.
