## Loading libraries

In [15]:
import numpy as np
import pandas as pd
import sklearn
import mlflow

## Hosting mlflow on localhost

In [16]:
mlflow.set_tracking_uri('http://localhost:5000')

In [20]:
import pandas as pd
import re
import nltk

# Download NLTK data (only needed once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Some routine pre-processing

In [21]:
def preprocess_text(text):
    """
    1. Lowercase
    2. Remove punctuation, numbers, and other non-alphabet characters
    3. Tokenize
    4. Remove stopwords
    5. Lemmatize
    """
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove punctuation, numbers, etc. (keep only letters)
    text = re.sub(r'[^a-z]+', ' ', text)

    # 3. Tokenize
    tokens = nltk.word_tokenize(text)

    # 4. Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join back into a single string
    return ' '.join(tokens)


In [None]:
# 1. Load the datasets
train_dataset = pd.read_csv(r'D:\aml\AppliedMachineLearning\Assignment2\data\train.csv')
test_dataset = pd.read_csv(r'D:\aml\AppliedMachineLearning\Assignment2\data\test.csv')
validation_dataset = pd.read_csv(r'D:\aml\AppliedMachineLearning\Assignment2\data\validation.csv')

# 2. Convert "spam" and "ham" to numeric labels (1 = spam, 0 = ham)
# Adjust these mappings if your dataset uses different labels
train_dataset['Spam'] = train_dataset['class'].map({'spam': 1, 'ham': 0})
test_dataset['Spam'] = test_dataset['class'].map({'spam': 1, 'ham': 0})
validation_dataset['Spam'] = validation_dataset['class'].map({'spam': 1, 'ham': 0})

# 3. Preprocess the text in each dataset
train_dataset['Preprocessed_text'] = train_dataset['sms'].apply(preprocess_text)
test_dataset['Preprocessed_text'] = test_dataset['sms'].apply(preprocess_text)
validation_dataset['Preprocessed_text'] = validation_dataset['sms'].apply(preprocess_text)


train_dataset = train_dataset[['Preprocessed_text', 'Spam']]
test_dataset = test_dataset[['Preprocessed_text', 'Spam']]
validation_dataset = validation_dataset[['Preprocessed_text', 'Spam']]

In [23]:
train_dataset

Unnamed: 0,Preprocessed_text,Spam
0,spoke uncle john today strongly feel need sacr...,0
1,left yet probably gon na til dinner,0
2,ryder unsold gibbs,0
3,love set soul fire spark flame big rawring fla...,0
4,turning phone mom telling everyone cancer sist...,0
...,...,...
3896,unni thank dear recharge rakhesh,0
3897,cheer message zogtorius staring phone age deci...,0
3898,lol grin babe thanks thinking,0
3899,sorry sent blank msg yup trying serious studying,0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [25]:
X_train = tfidf.fit_transform(train_dataset['Preprocessed_text']).toarray()
y_train = train_dataset['Spam']

In [26]:
X_test = tfidf.transform(test_dataset['Preprocessed_text']).toarray()
y_test = test_dataset['Spam']

In [27]:
X_validation = tfidf.transform(validation_dataset['Preprocessed_text']).toarray()
y_validation = validation_dataset['Spam']

### Decisiont tree

In [28]:
from sklearn.metrics import auc, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from random import seed

import warnings
warnings.filterwarnings("ignore")

param_grid = {
    'max_depth': [2,8,13,20, None],
    'min_samples_split': [2, 7, 15, 20],
    'min_samples_leaf': [1, 2, 4, 7, 10]
}

with mlflow.start_run(run_name= 'Decision Tree'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(564)
        clf = DecisionTreeClassifier(**params)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, f"[ {params['max_depth']}, {params['min_samples_split']}, {params['min_samples_leaf']} ]")
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params
            
        
    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(564)

    best_model_DT = DecisionTreeClassifier(**best_params)
    best_model_DT.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_DT, "Best Decision Tree model")



🏃 View run Decision Tree at: http://localhost:5000/#/experiments/0/runs/9a491120d57c4d598efcdf41f6ab21cb
🧪 View experiment at: http://localhost:5000/#/experiments/0


### Logistic regression

In [31]:
from sklearn.linear_model import LogisticRegression
import warnings
import numpy as np
from sklearn.model_selection import ParameterGrid
import mlflow
import mlflow.sklearn
from random import seed
from sklearn.metrics import precision_recall_curve, auc

warnings.filterwarnings("ignore")

param_grid = {
    'l1_ratio': list(np.arange(0.0, 1.01, 0.1)),
    'penalty': ['elasticnet'],
    'solver': ['saga']
}

with mlflow.start_run(run_name='Logistic Regression'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(789)
        clf = LogisticRegression(**params)
        clf.fit(X_train, y_train)

        # Use a valid directory name (replace ":" with "_" in the artifact path)
        artifact_path = 'lamda_' + str(params['l1_ratio'])
        mlflow.sklearn.log_model(clf, artifact_path)

        precision, recall, _ = precision_recall_curve(
            y_validation, clf.predict_proba(X_validation)[:, 1]
        )
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params

    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(789)
    best_model_logistic = LogisticRegression(**best_params)
    best_model_logistic.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_logistic, "Best_Logistic_model")




🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/0/runs/848dd172be1347c992bf38cc218f0d66
🧪 View experiment at: http://localhost:5000/#/experiments/0


## Multinomila Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB
import warnings
import numpy as np
from sklearn.model_selection import ParameterGrid
import mlflow
import mlflow.sklearn
from random import seed
from sklearn.metrics import precision_recall_curve, auc

warnings.filterwarnings("ignore")

param_grid = {
    'alpha': list(np.arange(0.0, 10.01, 0.5))
}

# If X_train and X_validation are numpy arrays, replace NaN values using np.nan_to_num
X_train = np.nan_to_num(X_train, nan=0)
X_validation = np.nan_to_num(X_validation, nan=0)

with mlflow.start_run(run_name='Naive Bayes'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(561)
        clf = MultinomialNB(**params)
        clf.fit(X_train, y_train)
        
        mlflow.sklearn.log_model(clf, 'alpha_' + str(params['alpha']))
        y_probs = clf.predict_proba(X_validation)[:, 1]
        
        # Check if there are any NaN values in predictions
        if np.isnan(y_probs).any():
            print(f"NaN values detected for parameters {params}. Skipping evaluation.")
            continue

        precision, recall, _ = precision_recall_curve(y_validation, y_probs)
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params
            
    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(561)
    best_model_NB = MultinomialNB(**best_params)
    best_model_NB.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_NB, "Best_Naive_Bayes_model")




NaN values detected for parameters {'alpha': 0.0}. Skipping evaluation.




🏃 View run Naive Bayes at: http://localhost:5000/#/experiments/0/runs/941c578daaa84cf1b0b2c7ad550fb706
🧪 View experiment at: http://localhost:5000/#/experiments/0


In [39]:
NB_model = mlflow.sklearn.load_model(r'D:\aml\AppliedMachineLearning\mlruns\0\941c578daaa84cf1b0b2c7ad550fb706\artifacts\Best_Naive_Bayes_model')
Logistic_model = mlflow.sklearn.load_model(r'D:\aml\AppliedMachineLearning\mlruns\0\848dd172be1347c992bf38cc218f0d66\artifacts\Best_Logistic_model')
DT_model = mlflow.sklearn.load_model(r'D:\aml\AppliedMachineLearning\mlruns\0\9a491120d57c4d598efcdf41f6ab21cb\artifacts\Best Decision Tree model')

In [40]:

models = [DT_model, Logistic_model, NB_model]
aucpr = []
for i in range(3):
    precision, recall, _ = precision_recall_curve(y_test, models[i].predict_proba(X_test)[:,1])
    sorted_indices = np.argsort(precision)
    precision = precision[sorted_indices]
    recall = recall[sorted_indices]
    aucpr.append(auc(precision, recall))


In [41]:
result = {"Model":["Decision Tree", "Logistic Regression", "Naive Bayes"]}
result['AUCPR'] = aucpr

In [42]:
pd.DataFrame(result)

Unnamed: 0,Model,AUCPR
0,Decision Tree,0.667602
1,Logistic Regression,0.797862
2,Naive Bayes,0.82538


## AUCPR is max for Multinomial NB thus we choose that model