# Assignment 1 - Applied Machine Learning
---
## Arghadeep Ghosh

This notebook contains the code for loading the training, validation and test datasets, fitting Naive Bayes, Logistic Regression and Random Forest models on the training data, and evaluating the models on the validation and test datasets.

In [1]:
import pandas as pd
import csv
import numpy as np

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, precision_recall_curve, auc
from sklearn.pipeline import Pipeline

from sklearn.metrics import auc, plot_precision_recall_curve

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the three pre-split CSV files (training, validation, test) into DataFrames.
split_paths = ('data/train.csv', 'data/validation.csv', 'data/test.csv')
train, valid, test = map(pd.read_csv, split_paths)

# Row count of each split, in (train, valid, test) order.
tuple(len(split) for split in (train, valid, test))

(3344, 1115, 1115)

## Preprocessing Functions
---
The data is converted from raw sentences to a bag-of-words representation, with each word assigned a tf-idf weight.

In [3]:
def split_into_lemmas(message):
    """Tokenise a message and return the lemma of every token.

    The message is lower-cased first, then split into words with TextBlob;
    each word is replaced by its ``lemma``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        One lemma per token, in the order the tokens appear.
    """
    tokens = TextBlob(message.lower()).words
    lemmas = []
    for token in tokens:
        lemmas.append(token.lemma)
    return lemmas

# Preview the lemmatised tokens for every training message.
train['message'].apply(split_into_lemmas)

0                    [going, for, dinner.msg, you, after]
1       [please, call, 08712402578, immediately, a, th...
2       [am, only, searching, for, good, dual, sim, mo...
3                       [ya, that, one, is, slow, a, poo]
4                      [talk, to, g, and, x, about, that]
                              ...                        
3339    [im, cool, ta, luv, but, v.tired, 2, cause, i...
3340                   [4, taco, 1, raja, burrito, right]
3341    [ma, head, dey, swell, oh, thanks, for, making...
3342    [yes, the, only, place, in, town, to, meet, ex...
3343       [i, 'm, good, have, you, registered, to, vote]
Name: message, Length: 3344, dtype: object

In [4]:
# Build the vocabulary from the training messages only, using the lemma
# tokeniser as the analyzer.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(train['message'])

# Encode every split as a term-count matrix over that training vocabulary.
train_bow, valid_bow, test_bow = (
    bow_transformer.transform(split['message']) for split in (train, valid, test)
)

# Spot check: count vector of the training message labelled 5 ...
bow = bow_transformer.transform([train['message'][5]])

# ... and the vocabulary term stored at feature index 2096.
vocabulary = bow_transformer.get_feature_names_out()
print(vocabulary[2096])

e


In [5]:
# Fit the tf-idf weighting on the training counts only.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(train_bow)

# Re-weight the count matrix of every split with the fitted transformer.
train_tfidf, valid_tfidf, test_tfidf = (
    tfidf_transformer.transform(counts) for counts in (train_bow, valid_bow, test_bow)
)

# Spot check: tf-idf weights of the single sample message encoded in `bow`.
tfidf = tfidf_transformer.transform(bow)
print(tfidf)

  (0, 6454)	0.22971346003428875
  (0, 6400)	0.11771610046219375
  (0, 6046)	0.1579552939608168
  (0, 5923)	0.1945482280959502
  (0, 5836)	0.10090471982516318
  (0, 5797)	0.19832567147753144
  (0, 5697)	0.07860951998278148
  (0, 5637)	0.13830934410443585
  (0, 4477)	0.18313084909544983
  (0, 4151)	0.16752358524628597
  (0, 4091)	0.12427382764296355
  (0, 3966)	0.1377534794481989
  (0, 3763)	0.17014811410617742
  (0, 3345)	0.2193993235307073
  (0, 3116)	0.1080116708513791
  (0, 2522)	0.1412257928846768
  (0, 2378)	0.24802786234704557
  (0, 1687)	0.19279897619093267
  (0, 1403)	0.24425041896546437
  (0, 1389)	0.11835255301240706
  (0, 1268)	0.3019528756441539
  (0, 1233)	0.2348439904928211
  (0, 1020)	0.21139905772153192
  (0, 486)	0.3019528756441539
  (0, 281)	0.21286263040870704
  (0, 185)	0.3019528756441539


In [6]:
def evaluate_model(model, X, Y):
    """Print classification metrics for ``model`` and plot its confusion matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Feature matrix handed to ``model.predict``.
    Y
        True labels for ``X``. The label ``'spam'`` is treated as the
        positive class for precision, recall and F1.

    Returns
    -------
    None
        Accuracy, precision, recall and F1 are printed as percentages and the
        confusion matrix is drawn as a heatmap on a newly created figure.
    """
    Y_pred = model.predict(X)

    print('Accuracy:', accuracy_score(Y, Y_pred)*100, '%')
    print('Precision:', precision_score(Y, Y_pred, pos_label = 'spam')*100, '%')
    print('Recall:', recall_score(Y, Y_pred, pos_label = 'spam')*100, '%')
    print('F1 Score:', f1_score(Y, Y_pred, pos_label = 'spam')*100, '%')

    cm = confusion_matrix(Y, Y_pred)

    # Create a fresh figure for every call. plt.subplot() would reuse the
    # current axes, so evaluating two models in one cell would draw both
    # heatmaps (and both colorbars) on top of each other.
    _, ax = plt.subplots()

    # annot=True writes the count in each cell; fmt='g' disables scientific notation.
    sns.heatmap(cm, annot=True, fmt='g', ax=ax)

    # Labels, title and ticks.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    # NOTE(review): assumes exactly two classes whose sorted order is
    # (not spam, spam), e.g. 'ham' < 'spam' - confirm against the label column.
    ax.xaxis.set_ticklabels(['Not Spam', 'Spam'])
    ax.yaxis.set_ticklabels(['Not Spam', 'Spam'])

## Naive-Bayes Model
---
We train Multinomial Naive Bayes models on the training data for different values of alpha and evaluate them on the validation and test data. In each case we obtain the accuracy, precision, recall, F1 score and AUCPR.

All the metrics and parameters are tracked using MLflow, and the logged runs can be viewed by running
```> mlflow ui```


In [7]:
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

# Sweep the smoothing parameter of Multinomial Naive Bayes; each value is
# tracked as its own MLflow run.
for alpha in np.arange(0.2, 2.0, 0.3):
    # np.arange accumulates floating-point error in the step (values such as
    # 0.8000000000000002); round to the step's precision and cast to a plain
    # float so the logged parameter is clean.
    alpha = round(float(alpha), 1)

    with mlflow.start_run():
        spam_detectorNB = MultinomialNB(alpha = alpha).fit(train_tfidf, train['label'])

        # Binary encoding: 1 = spam, 0 = everything else.
        # NOTE(review): metrics are computed on the training split; the
        # notebook text speaks of validation/test evaluation - confirm which
        # split should drive model selection.
        Y_pred = (spam_detectorNB.predict(train_tfidf) == 'spam').astype('int64')
        Y = (train['label'] == 'spam').astype('int64')

        acc = accuracy_score(Y, Y_pred)
        pres = precision_score(Y, Y_pred)
        rec = recall_score(Y, Y_pred)
        f1 = f1_score(Y, Y_pred)

        # The precision-recall curve needs continuous scores, not hard 0/1
        # predictions, otherwise it collapses to a single operating point.
        spam_idx = list(spam_detectorNB.classes_).index('spam')
        Y_score = spam_detectorNB.predict_proba(train_tfidf)[:, spam_idx]

        # auc(x, y): recall goes on the x axis and precision on the y axis.
        p, r, threshold = precision_recall_curve(Y, Y_score)
        aucpr = auc(r, p)

        mlflow.log_param("alpha", alpha)
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("pres", pres)
        mlflow.log_metric("rec", rec)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("AUCPR", aucpr)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Only register the model when the tracking URI is not a plain file
        # store; otherwise just log it as a run artifact.
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(spam_detectorNB, "model", registered_model_name="NaiveBayes")
        else:
            mlflow.sklearn.log_model(spam_detectorNB, "model")



## Logistic Regression Model
---
We train Logistic Regression models on the training data for different values of C and evaluate them on the validation and test data. In each case we obtain the accuracy, precision, recall, F1 score and AUCPR.

All the metrics and parameters are tracked using MLflow, and the logged runs can be viewed by running
```> mlflow ui```


In [None]:
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

# Inverse regularisation strengths to sweep; each value is tracked as its own
# MLflow run.
Cs = [0.1, 0.5, 1, 10, 20, 50, 100]
for C in Cs:
    with mlflow.start_run():
        spam_detectorLR = LogisticRegression(C = C).fit(train_tfidf, train['label'])

        # Binary encoding: 1 = spam, 0 = everything else.
        # NOTE(review): metrics are computed on the training split; the
        # notebook text speaks of validation/test evaluation - confirm which
        # split should drive model selection.
        Y_pred = (spam_detectorLR.predict(train_tfidf) == 'spam').astype('int64')
        Y = (train['label'] == 'spam').astype('int64')

        acc = accuracy_score(Y, Y_pred)
        pres = precision_score(Y, Y_pred)
        rec = recall_score(Y, Y_pred)
        f1 = f1_score(Y, Y_pred)

        # The precision-recall curve needs continuous scores, not hard 0/1
        # predictions, otherwise it collapses to a single operating point.
        spam_idx = list(spam_detectorLR.classes_).index('spam')
        Y_score = spam_detectorLR.predict_proba(train_tfidf)[:, spam_idx]

        # auc(x, y): recall goes on the x axis and precision on the y axis.
        p, r, threshold = precision_recall_curve(Y, Y_score)
        aucpr = auc(r, p)

        mlflow.log_param("C", C)
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("pres", pres)
        mlflow.log_metric("rec", rec)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("AUCPR", aucpr)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Log the Logistic Regression model fitted in THIS run. The previous
        # code logged the Naive Bayes model (under the name "NaiveBayes") for
        # every C, so the stored artifact never matched the logged metrics.
        # Only register the model when the tracking URI is not a plain file store.
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(spam_detectorLR, "model", registered_model_name="LogisticRegression")
        else:
            mlflow.sklearn.log_model(spam_detectorLR, "model")

## Random Forest Classifier
---

We train Random Forest models on the training data for different values of `n_estimators` and evaluate them on the validation and test data. In each case we obtain the accuracy, precision, recall, F1 score and AUCPR.

All the metrics and parameters are tracked using MLflow, and the logged runs can be viewed by running
```> mlflow ui```


In [None]:
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse


# Fixed seed so that re-running the notebook reproduces the same forests and
# therefore the same logged metrics.
RF_SEED = 42

# Sweep the number of trees; each value is tracked as its own MLflow run.
for n in range(10, 100, 10):
    with mlflow.start_run():
        spam_detectorRF = RandomForestClassifier(n_estimators = n, random_state = RF_SEED).fit(train_tfidf, train['label'])

        # Binary encoding: 1 = spam, 0 = everything else.
        # NOTE(review): metrics are computed on the training split; the
        # notebook text speaks of validation/test evaluation - confirm which
        # split should drive model selection.
        Y_pred = (spam_detectorRF.predict(train_tfidf) == 'spam').astype('int64')
        Y = (train['label'] == 'spam').astype('int64')

        acc = accuracy_score(Y, Y_pred)
        pres = precision_score(Y, Y_pred)
        rec = recall_score(Y, Y_pred)
        f1 = f1_score(Y, Y_pred)

        # The precision-recall curve needs continuous scores, not hard 0/1
        # predictions, otherwise it collapses to a single operating point.
        spam_idx = list(spam_detectorRF.classes_).index('spam')
        Y_score = spam_detectorRF.predict_proba(train_tfidf)[:, spam_idx]

        # auc(x, y): recall goes on the x axis and precision on the y axis.
        p, r, threshold = precision_recall_curve(Y, Y_score)
        aucpr = auc(r, p)

        mlflow.log_param("n trees", n)
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("pres", pres)
        mlflow.log_metric("rec", rec)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("AUCPR", aucpr)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Log the Random Forest fitted in THIS run. The previous code logged
        # the Naive Bayes model (under the name "NaiveBayes") for every n, so
        # the stored artifact never matched the logged metrics.
        # Only register the model when the tracking URI is not a plain file store.
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(spam_detectorRF, "model", registered_model_name="RandomForest")
        else:
            mlflow.sklearn.log_model(spam_detectorRF, "model")