### Here we are using different models than the models used to train for assignment 1

In [1]:
 ! pip install sentence-transformers
 ! pip install mlflow

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl (2.6 MB)
Installing collected packages: tokenizers, transformers, sentence-transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizer

In [2]:
import pandas as pd  
import numpy as np 
from sklearn.utils import resample 
from sklearn.model_selection import GridSearchCV

# Parsing URLs
from urllib.parse import urlparse  

# SentenceTransformer for text embeddings
from sentence_transformers import SentenceTransformer 

# Scikit-learn for various ML models 
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import SVC    
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier 

# Evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve, auc   

# MLflow for experiment tracking
import mlflow 

# Joblib for saving and loading models
import joblib  

# Python logging m

In [3]:
def convert_text_to_vectors(data, filename):
    """
    Convert text data to sentence vectors using SentenceTransformer model and store in a file.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    data = data.replace(np.nan, '', regex=True)
    vectors = model.encode(data['text'])

    # Store vectors in a file
    with open(filename, 'wb') as file:
        pickle.dump(vectors, file)

    return vectors

In [4]:
def train_model(clf, param_grid, train_emb, val_emb, test_emb, y_train, y_val, y_test):
    """
    Train a model with hyperparameter tuning on embedded training data and evaluate performance on validation and test sets.
    """
    # Parameter tuning with validation set
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(val_emb, y_val)
    
    # Get best parameters
    best_params = grid_search.best_params_
    
    # Train the model on combined training and validation data with best parameters
    clf.set_params(**best_params)
    clf.fit(train_emb, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(test_emb)

    # Calculate accuracy
    acc = accuracy_score(y_test, y_pred)

    # Evaluate precision-recall curve AUC for binary classification predictions
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    aucpr = auc(recall, precision)

    return clf, acc, aucpr

In [5]:
def log_model_metrics(model, acc, aucpr, model_name):
    """
    Log model metrics and artifacts using MLflow.
    """
    filename = model_name + ".joblib"
    joblib.dump(model, filename)

    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")

    mlflow.sklearn.autolog()

    with mlflow.start_run():
        mlflow.log_param("model", filename)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("AUCPR", aucpr)

        # Log artifact
        mlflow.log_artifact(filename)

        mlflow.sklearn.log_model(model, model_name)

        # Get the MLflow tracking URI scheme
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            registered_model_name=model_name + "_Model"
        )

In [6]:
def get_model_version(model_name):
    """
    Retrieve the latest version of a model from MLflow by its name.
    """
    client = mlflow.tracking.MlflowClient()
    
    model_version = client.get_latest_versions(model_name, stages=["None"])[0].version
    return model_version

### Main

In [10]:
import pickle

#Load data
train = pd.read_csv("Data/Training Data.csv")
val = pd.read_csv("Data/Validation Data.csv")
test = pd.read_csv("Data/Test Data.csv")


# # Convert text to vectors and store them
convert_text_to_vectors(train, 'train_emb.pkl')
convert_text_to_vectors(val, 'val_emb.pkl')
convert_text_to_vectors(test, 'test_emb.pkl')

array([[-0.06437393, -0.05329217,  0.05308124, ..., -0.0481124 ,
        -0.11354938,  0.01842844],
       [-0.00070231,  0.04682585,  0.04150278, ..., -0.06567338,
         0.07070542,  0.07212626],
       [-0.02298434, -0.05006471,  0.02371489, ..., -0.02528759,
        -0.10597737, -0.00363031],
       ...,
       [ 0.04190622,  0.0892358 ,  0.01323775, ...,  0.01013731,
        -0.01525157, -0.10524866],
       [-0.04490845, -0.04449276,  0.03015462, ...,  0.01425023,
        -0.04494065, -0.03324023],
       [-0.11006838, -0.05572332, -0.00285661, ..., -0.05245342,
        -0.11218999, -0.07318109]], dtype=float32)

In [11]:
# Load vectors from files
with open('train_emb.pkl', 'rb') as file:
    train_emb = pickle.load(file)

with open('val_emb.pkl', 'rb') as file:
    val_emb = pickle.load(file)

with open('test_emb.pkl', 'rb') as file:
    test_emb = pickle.load(file)

## Train Models

### 1. Logistic Regression

In [13]:
model_name = "Logistic_Regression"
clf = LogisticRegression()

param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear'], 
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 500]
}

# Convert labels to binary (0 for ham, 1 for spam)
y_train = train['spam'].map({'ham': 0, 'spam': 1})
y_val = val['spam'].map({'ham': 0, 'spam': 1})
y_test = test['spam'].map({'ham': 0, 'spam': 1})

# Train the model
lr_model, lr_acc, lr_aucpr = train_model(
    clf, param_grid_lr, train_emb, val_emb, test_emb, 
    y_train, y_val, y_test
)

log_model_metrics(lr_model, lr_acc, lr_aucpr, model_name)

version = get_model_version(model_name + "_Model")
print(f"Model Version: {version}")

Accuracy: 0.9820574162679426
AUCPR: 0.9392165610586662


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Model Version: 1


Successfully registered model 'Logistic_Regression_Model'.
Created version '1' of model 'Logistic_Regression_Model'.
  model_version = client.get_latest_versions(model_name, stages=["None"])[0].version


### 2 . Support Vector Classifier

In [15]:
model_name = "Support_Vector_Machine"
clf = SVC()

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Convert labels to binary (0 for ham, 1 for spam)
y_train = train['spam'].map({'ham': 0, 'spam': 1})
y_val = val['spam'].map({'ham': 0, 'spam': 1})
y_test = test['spam'].map({'ham': 0, 'spam': 1})

# Train the model
svm_model, svm_acc, svm_aucpr = train_model(
    clf, param_grid_svc, train_emb, val_emb, test_emb, 
    y_train, y_val, y_test  # Use numerical labels here
)

log_model_metrics(svm_model, svm_acc, svm_aucpr, model_name)

version = get_model_version(model_name + "_Model")
print(f"Model Version: {version}")

2025/03/05 03:45:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1f8d3f5e4da547e9a093a69804115b19', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025/03/05 03:45:51 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.
2025/03/05 03:45:51 INFO mlflow.utils.autologging_utils: Created MLflow auto

Accuracy: 0.9760765550239234
AUCPR: 0.9158907711539291


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model Version: 1


Successfully registered model 'Support_Vector_Machine_Model'.
Created version '1' of model 'Support_Vector_Machine_Model'.
  model_version = client.get_latest_versions(model_name, stages=["None"])[0].version


### 3. Random Forest

In [16]:
model_name = "Random_Forest"
clf = RandomForestClassifier(n_estimators=300, max_depth=6, random_state=101)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Convert labels to binary (0 for ham, 1 for spam)
y_train = train['spam'].map({'ham': 0, 'spam': 1})
y_val = val['spam'].map({'ham': 0, 'spam': 1})
y_test = test['spam'].map({'ham': 0, 'spam': 1})

# Train the model
rf_model, rf_acc, rf_aucpr = train_model(
    clf, param_grid_rf, train_emb, val_emb, test_emb, 
    y_train, y_val, y_test  # Use numerical labels here
)

log_model_metrics(rf_model, rf_acc, rf_aucpr, model_name)

version = get_model_version(model_name + "_Model")
print(f"Model Version: {version}")

2025/03/05 03:50:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a53e566349624fcaa1143cc97d946306', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025/03/05 04:13:41 INFO mlflow.sklearn.utils: Logging the 5 best runs, 211 runs will be omitted.
2025/03/05 04:13:41 INFO mlflow.utils.autologging_utils: Created MLflow aut

Accuracy: 0.9593301435406698
AUCPR: 0.867181775076512


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model Version: 1


Successfully registered model 'Random_Forest_Model'.
Created version '1' of model 'Random_Forest_Model'.
  model_version = client.get_latest_versions(model_name, stages=["None"])[0].version


### 4. Ensemble Modeling: Stacking Classifier

In [18]:
model_name = "Stacking_Classifier"

# Convert labels to binary and ensure numpy arrays
y_train = train['spam'].map({'ham': 0, 'spam': 1}).values
y_val = val['spam'].map({'ham': 0, 'spam': 1}).values
y_test = test['spam'].map({'ham': 0, 'spam': 1}).values

# Define base estimators (enable probability for SVM)
estimators = [
    ('lr', LogisticRegression()),
    ('svm', SVC(probability=True)),  # Required for `predict_proba`
    ('rf', RandomForestClassifier(n_estimators=100, random_state=101))
]

# Define the stacking classifier
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    stack_method='auto',
    cv=3,
    passthrough=False
)

# Define the hyperparameter grid
param_grid_stack = {
    'stack_method': ['auto', 'predict_proba'],
    'final_estimator': [LogisticRegression(), RandomForestClassifier()],
    'cv': [2, 3, 5],  # Remove invalid parameters like `final_estimator__max_iter`
    'passthrough': [False, True]
}

# Train the model
stacking_model, stacking_acc, stacking_aucpr = train_model(
    clf, param_grid_stack, train_emb, val_emb, test_emb, 
    y_train, y_val, y_test
)

log_model_metrics(stacking_model, stacking_acc, stacking_aucpr, model_name)

version = get_model_version(model_name + "_Model")
print(f"Model Version: {version}")

2025/03/05 04:20:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5d546c95e7bd4418a01657edb8dd0151', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025/03/05 04:26:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.
2025/03/05 04:26:00 INFO mlflow.utils.autologging_utils: Created MLflow auto

Accuracy: 0.9868421052631579
AUCPR: 0.9560338554146605


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model Version: 1


Successfully registered model 'Stacking_Classifier_Model'.
Created version '1' of model 'Stacking_Classifier_Model'.
  model_version = client.get_latest_versions(model_name, stages=["None"])[0].version
