# Assignment 1 : SMS Spam Classification

## Importing Libraries

In [None]:
# ! pip install mlflow
# ! pip install jinja2==3.0.3

In [8]:
import mlflow
import logging
from urllib.parse import urlparse
from markupsafe import escape

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2, so this
# import only works on older releases.
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import  classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve, auc
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

## Loading training, validation, test data

In [11]:
# Read the three pre-split CSV files (training / validation / test) into DataFrames.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Training Data.csv",
        "Data/Validation Data.csv",
        "Data/Test Data.csv",
    )
)

In [12]:
# Preview the first rows of the training split (columns: Text, Label).
train.head()

Unnamed: 0,Text,Label
0,need coffee run tomocant believe time week alr...,0
1,need say anything know outsider,0
2,hit move,0
3,yup thk e shop close lor,0
4,ran younger man make pretty baby together,0


In [13]:
# Preview the first rows of the validation split (columns: Text, Label).
val.head()

Unnamed: 0,Text,Label
0,oh god ive found number im glad text back xaft...,1
1,please leave topicsorry telling,0
2,hello hello hi lou sorry took long 2 reply lef...,0
3,k actually guy meet sunoco howard right way,0
4,dun b sad dun thk abt already concentrate ur p...,0


In [14]:
# Preview the first rows of the test split (columns: Text, Label).
test.head()

Unnamed: 0,Text,Label
0,oh k kbut big hitteranyway good,0
1,world suffers lot violence bad people silence ...,0
2,wan2 win meetgreet westlife 4 u m8 currently t...,1
3,dip cell dead coming u better respond else sha...,0
4,tot u reach liao said tshirt,0


In [15]:
# Separate the message text (model input) from the label (target) for each split.
X_train, y_train = train["Text"], train["Label"]
X_val, y_val = val["Text"], val["Label"]
X_test, y_test = test["Text"], test["Label"]

## Converting string to vectors, to give as input to the models

In [16]:
# Missing messages (NaN) become empty strings so that every entry handed to
# the vectorizer is a str.
X_train = X_train.fillna('')
X_val = X_val.fillna('')
X_test = X_test.fillna('')

In [17]:
# Learn the bag-of-words vocabulary from the training messages only, then
# encode every split as a count matrix over that same vocabulary.
vector = CountVectorizer()
vector.fit(X_train)
X_train_vec, X_val_vec, X_test_vec = (
    vector.transform(messages) for messages in (X_train, X_val, X_test)
)

In [18]:
# Sanity check: (n_messages, n_features) per split; the feature count must be
# identical across splits because they share one fitted vectorizer.
X_train_vec.shape,X_val_vec.shape,X_test_vec.shape

((4026, 7432), (711, 7432), (837, 7432))

In [19]:
# Learn the IDF weights from the training counts only, then apply those same
# weights to the validation and test counts. Calling fit_transform on each
# split would re-fit the transformer, giving every split its own IDF vector:
# the features seen at evaluation time would no longer match the ones the
# models were trained on, and evaluation-set statistics would leak into them.
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(X_train_vec)
tfidf_val = tfidf_transformer.transform(X_val_vec)
tfidf_test = tfidf_transformer.transform(X_test_vec)

In [20]:
# Sanity check: the TF-IDF matrices should keep the (n_messages, n_features)
# shapes of the count matrices they were derived from.
tfidf_train.shape,tfidf_val.shape,tfidf_test.shape

((4026, 7432), (711, 7432), (837, 7432))

## Training Models

In [None]:
def eval_metrics(actual, pred):
    """Return the area under the precision-recall curve (AUCPR).

    Relies on ``precision_recall_curve`` and ``auc`` from ``sklearn.metrics``
    being imported in the notebook's import cell.

    Parameters
    ----------
    actual : array-like
        Ground-truth binary labels.
    pred : array-like
        Scores for the positive class. ``precision_recall_curve`` sweeps a
        decision threshold over these values, so continuous scores (e.g.
        predicted probabilities) give a meaningful curve; hard 0/1
        predictions collapse it to a single operating point.

    Returns
    -------
    float
        Area under the curve obtained by integrating precision over recall.
    """
    # The thresholds are not needed to compute the area.
    precision, recall, _ = precision_recall_curve(actual, pred)
    return auc(recall, precision)

# Multinomial Naive Bayes

In [None]:
mlflow.sklearn.autolog()

# Train inside the run so that what autologging captures during fit() and the
# metrics logged by hand below end up in the same MLflow run.
with mlflow.start_run(run_name="Multinomial Naive Bayes"):

    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)

    # NOTE(review): metrics are computed on the test split; the validation
    # split (tfidf_val) is not used anywhere for model selection.
    # Hard 0/1 predictions feed accuracy and the confusion matrix ...
    y_pred = clf.predict(tfidf_test)
    # ... while the precision-recall curve needs a continuous score: use the
    # predicted probability of the positive class (second column; assumes the
    # labels are 0/1 with 1 as the positive class).
    y_score = clf.predict_proba(tfidf_test)[:, 1]

    aucpr = eval_metrics(y_test, y_score)
    acc = accuracy_score(y_test, y_pred)
    conf_2 = confusion_matrix(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR", aucpr)
    mlflow.log_dict(conf_2.tolist(), "confusion_matrix.json")

    print("\nMultinomial Naive Bayes")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix: {conf_2} \n\n")

    # Log and register the fitted model exactly once. The previous version
    # logged the same model three times and, on non-file tracking stores,
    # registered it twice, creating a redundant second model version.
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multinomial-nb-model"
    )


In [None]:
# Fetch version 1 of the registered Naive Bayes model and print its metadata.
nb_model_version = mlflow.tracking.MlflowClient().get_model_version(
    name="multinomial-nb-model", version='1'
)
print(nb_model_version)

# Logistic Regression

In [None]:
mlflow.sklearn.autolog()

# Inverse regularisation strength, passed to LogisticRegression as ``C``.
# It used to be named and logged as "n_estimators", which is an ensemble-size
# hyperparameter that LogisticRegression does not have.
reg_C = 100

# Train inside the run so that what autologging captures during fit() and the
# metrics logged by hand below end up in the same MLflow run.
with mlflow.start_run(run_name=f"Logistic Regression (C={reg_C})"):

    clf = LogisticRegression(C=reg_C)
    clf.fit(tfidf_train, y_train)

    # NOTE(review): metrics are computed on the test split; the validation
    # split (tfidf_val) is not used anywhere for model selection.
    # Hard 0/1 predictions feed accuracy and the confusion matrix ...
    y_pred = clf.predict(tfidf_test)
    # ... while the precision-recall curve needs a continuous score: use the
    # predicted probability of the positive class (second column; assumes the
    # labels are 0/1 with 1 as the positive class).
    y_score = clf.predict_proba(tfidf_test)[:, 1]

    aucpr = eval_metrics(y_test, y_score)
    acc = accuracy_score(y_test, y_pred)
    conf_1 = confusion_matrix(y_test, y_pred)

    mlflow.log_param("C", reg_C)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR", aucpr)
    mlflow.log_dict(conf_1.tolist(), "confusion_matrix.json")

    print(f"\nLogistic Regression Model (C={reg_C}):")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr} ")
    print(f"Confusion Matrix: {conf_1} \n \n")

    # Log and register the fitted model exactly once, under the name that the
    # registry lookup in the next cell expects. The previous version logged
    # the model three times and, on non-file tracking stores, additionally
    # registered it under a second, inconsistent name ("LogisticRegression").
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="logistic-regression-model"
    )


In [None]:
# Fetch version 1 of the registered logistic-regression model; as the last
# expression of the cell, its repr is displayed as the cell output.
registry_client = mlflow.tracking.MlflowClient()
registry_client.get_model_version(name="logistic-regression-model", version="1")

# MLP Classifier

In [None]:
mlflow.sklearn.autolog()

# Train inside the run so that what autologging captures during fit() and the
# metrics logged by hand below end up in the same MLflow run.
with mlflow.start_run(run_name="Multilayer Perceptron"):

    # NOTE(review): per the scikit-learn docs, learning_rate is only used when
    # solver='sgd'; with the default solver it has no effect.
    clf = MLPClassifier(random_state=101,learning_rate='adaptive')
    clf.fit(tfidf_train, y_train)

    # NOTE(review): metrics are computed on the test split; the validation
    # split (tfidf_val) is not used anywhere for model selection.
    # Hard 0/1 predictions feed accuracy and the confusion matrix ...
    y_pred = clf.predict(tfidf_test)
    # ... while the precision-recall curve needs a continuous score: use the
    # predicted probability of the positive class (second column; assumes the
    # labels are 0/1 with 1 as the positive class).
    y_score = clf.predict_proba(tfidf_test)[:, 1]

    aucpr = eval_metrics(y_test, y_score)
    acc = accuracy_score(y_test, y_pred)
    conf_3 = confusion_matrix(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR", aucpr)
    # Store the confusion matrix as an artifact, as the other model cells do.
    mlflow.log_dict(conf_3.tolist(), "confusion_matrix.json")

    print("\nMultilayer Perceptron")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix {conf_3} \n\n")

    # Log and register the fitted model exactly once. The previous version
    # logged the same model three times and, on non-file tracking stores,
    # registered it twice, creating a redundant second model version.
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multilayer-perceptron-model"
    )


In [None]:
# Fetch version 1 of the registered MLP model and print its metadata.
mlp_model_version = mlflow.tracking.MlflowClient().get_model_version(
    name="multilayer-perceptron-model", version='1'
)
print(mlp_model_version)

# Conclusion

Our aim is to reduce false negatives, i.e. to increase recall.

The MLP classifier is the best match according to both the accuracy and the recall criteria.