In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('mail_data.csv')

In [3]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5608,spam,You've been selected for a survey. Participate...
5609,spam,Get a free sample of our product. Order now: w...
5610,spam,Special offer for subscribers only! Redeem now...
5611,spam,Your insurance policy needs renewal. Click her...


In [4]:
df.shape

(5613, 2)

In [5]:
data = df.where(pd.notnull(df), '')

In [6]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5608,spam,You've been selected for a survey. Participate...
5609,spam,Get a free sample of our product. Order now: w...
5610,spam,Special offer for subscribers only! Redeem now...
5611,spam,Your insurance policy needs renewal. Click her...


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5613 entries, 0 to 5612
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5613 non-null   object
 1   Message   5613 non-null   object
dtypes: object(2)
memory usage: 87.8+ KB


In [8]:
data.loc[data["Category"] == "spam", "Category"] = 0
data.loc[data["Category"] == "ham", "Category"] = 1

In [9]:
X = data['Message']
y = data["Category"]

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.2, random_state=101)

In [15]:
X_train

3484    Hello, my love! How goes that day ? I wish you...
3057                             Webpage s not available!
314     Hi the way I was with u 2day, is the normal wa...
4677                              Ü ready then call me...
2616           2marrow only. Wed at  &lt;#&gt;  to 2 aha.
                              ...                        
4171    Mmmmmm ... I love you,so much, Ahmad ... I can...
599     Will do. Was exhausted on train this morning. ...
1361    Yo dude guess who just got arrested the other day
1547                Shant disturb u anymore... Jia you...
4959                     Why didn't u call on your lunch?
Name: Message, Length: 4490, dtype: object

In [12]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic_regression', LogisticRegression())
])


In [14]:
pipeline.fit(X_train, Y_train)


ValueError: Unknown label type: 'unknown'

In [314]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words = "english", lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [315]:
X_train_features.shape

(4490, 7434)

In [316]:
X_test_features.shape

(1123, 7434)

In [317]:
lg_model = LogisticRegression()
lg_model.fit(X_train_features, Y_train)

In [318]:
prediction_on_training_data = lg_model.predict(X_train_features)


conf_matrix = confusion_matrix(Y_train, prediction_on_training_data)

precision = precision_score(Y_train, prediction_on_training_data)

accuracy = accuracy_score(Y_train, prediction_on_training_data)

recall = recall_score(Y_train, prediction_on_training_data)

print("Confusion Matrix:")
print(conf_matrix)
print("Precision:", precision)
print("Accuracy:", accuracy)
print("Recall:", recall)

Confusion Matrix:
[[ 489  127]
 [   5 3869]]
Precision: 0.9682182182182182
Accuracy: 0.9706013363028954
Recall: 0.9987093443469283


In [319]:
prediction_on_test_data = lg_model.predict(X_test_features)


conf_matrix = confusion_matrix(Y_test, prediction_on_test_data)

precision = precision_score(Y_test, prediction_on_test_data)

accuracy = accuracy_score(Y_test, prediction_on_test_data)

recall = recall_score(Y_test, prediction_on_test_data)

print("Confusion Matrix:")
print(conf_matrix)
print("Precision:", precision)
print("Accuracy:", accuracy)
print("Recall:", recall)

Confusion Matrix:
[[100  52]
 [  1 970]]
Precision: 0.949119373776908
Accuracy: 0.9528049866429208
Recall: 0.9989701338825953


In [320]:
def custom_predict(X, threshold):
    probs = lg_model.predict_proba(X) 
    return (probs[:, 1] > threshold).astype(int)
    
    


In [346]:
userInput = ["You won 5000 USD of 2000 USD"]

mail = feature_extraction.transform(userInput)

prediction = lg_model.predict_proba(mail)
new_preds = custom_predict(X=mail, threshold=0.6) 
new_preds

array([0])

In [347]:
prediction[:, 1]

array([0.53784081])

In [352]:
import joblib

joblib.dump(lg_model, './models/email_classifier_model.pkl')
joblib.dump(feature_extraction, './models/feature_extraction.pkl')

['./models/feature_extraction.pkl']