In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('mail_data.csv')

In [3]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5608,spam,You've been selected for a survey. Participate...
5609,spam,Get a free sample of our product. Order now: w...
5610,spam,Special offer for subscribers only! Redeem now...
5611,spam,Your insurance policy needs renewal. Click her...


In [4]:
df.shape

(5613, 2)

In [5]:
data = df.where(pd.notnull(df), '')

In [6]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5608,spam,You've been selected for a survey. Participate...
5609,spam,Get a free sample of our product. Order now: w...
5610,spam,Special offer for subscribers only! Redeem now...
5611,spam,Your insurance policy needs renewal. Click her...


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5613 entries, 0 to 5612
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5613 non-null   object
 1   Message   5613 non-null   object
dtypes: object(2)
memory usage: 87.8+ KB


In [8]:
data.loc[data["Category"] == "spam", "Category"] = 0
data.loc[data["Category"] == "ham", "Category"] = 1

In [9]:
X = data['Message']
y = data["Category"]

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.2, random_state=101)

In [11]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [12]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1, stop_words = "english", lowercase=True)),
    ('logistic_regression', LogisticRegression())
])

pipeline.fit(X_train, Y_train)


In [13]:
pipeline.predict_proba(["Sale Sale Sale. Free Free free"])

array([[0.31976543, 0.68023457]])

In [14]:
prediction_on_training_data = pipeline.predict(X_train)


conf_matrix = confusion_matrix(Y_train, prediction_on_training_data)

precision = precision_score(Y_train, prediction_on_training_data)

accuracy = accuracy_score(Y_train, prediction_on_training_data)

recall = recall_score(Y_train, prediction_on_training_data)

print("Confusion Matrix:")
print(conf_matrix)
print("Precision:", precision)
print("Accuracy:", accuracy)
print("Recall:", recall)

Confusion Matrix:
[[ 489  127]
 [   5 3869]]
Precision: 0.9682182182182182
Accuracy: 0.9706013363028954
Recall: 0.9987093443469283


In [15]:
prediction_on_test_data = pipeline.predict(X_test)


conf_matrix = confusion_matrix(Y_test, prediction_on_test_data)

precision = precision_score(Y_test, prediction_on_test_data)

accuracy = accuracy_score(Y_test, prediction_on_test_data)

recall = recall_score(Y_test, prediction_on_test_data)

print("Confusion Matrix:")
print(conf_matrix)
print("Precision:", precision)
print("Accuracy:", accuracy)
print("Recall:", recall)

Confusion Matrix:
[[100  52]
 [  1 970]]
Precision: 0.949119373776908
Accuracy: 0.9528049866429208
Recall: 0.9989701338825953


In [16]:
def custom_predict(X, threshold):
    probs = pipeline.predict_proba(X) 
    return (probs[:, 1] > threshold).astype(int)
    
    


In [17]:
userInput = ["Urgent! Buy one get one free. Special offer get a product for free!!!"]

prediction = pipeline.predict_proba(userInput)
new_preds = custom_predict(X=userInput, threshold=0.6) 
new_preds

array([0])

In [18]:
prediction

array([[0.68757237, 0.31242763]])

In [19]:
import joblib

joblib.dump(pipeline, './models/pipeline.joblib')

['./models/pipeline.joblib']