df link: https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset/

In [70]:
import numpy as np
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [52]:
data = pd.read_csv('C:\\Users\\Jozef\\Downloads\\email spam classification\\combined_data.csv')

print(f'this is the data size: {data.size} and shape: {data.shape}')

this is the data size: 166896 and shape: (83448, 2)


We can see the size of the data and the shape, for what i'm seeing we will need to tokenize and vectorize this dataset

In [53]:
data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [54]:

def preprocess_emails(mails):
    mails = re.sub(r'[^a-zA-Z0-9\s]', '', mails.lower())
    token = mails.split()
    stop_words = set(stopwords.words('english'))
    tokenM = [word for word in token if not word in stop_words]
    clean_mails = ' '.join(tokenM)
    return(clean_mails)
    
    
emails_prepros = [preprocess_emails(email) for email in data['text']]

data['text'] = emails_prepros
    
print('preprocess completed')    

preprocess completed


In [55]:
df

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [67]:
X = df['text']
Y = df['label']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state= 69)

In [68]:
train_l = np.array(Y_train)
test_l = np.array(Y_test)

V = TfidfVectorizer()

train_data_V = V.fit_transform(X_train)
test_data_V = V.transform(X_test)

print(f'train data: {train_data_V}')
print(f'train labels: {train_l}')

train data:   (0, 178944)	0.038784161371494666
  (0, 94330)	0.04240877888274913
  (0, 236431)	0.026733043880102423
  (0, 225423)	0.01693281957657583
  (0, 249933)	0.06836932422178009
  (0, 105454)	0.0467148139812539
  (0, 249694)	0.026146127841705336
  (0, 186942)	0.0387226671295079
  (0, 4458)	0.11313737050794989
  (0, 11589)	0.09417028824386715
  (0, 4460)	0.11313737050794989
  (0, 8403)	0.18111888805795037
  (0, 12001)	0.11730379095267024
  (0, 26015)	0.0393981532428271
  (0, 41536)	0.034796091825123304
  (0, 75114)	0.044082692096735696
  (0, 41899)	0.021712520986997136
  (0, 254641)	0.015295496066858033
  (0, 37024)	0.03843012778265765
  (0, 121437)	0.01805063756033849
  (0, 210916)	0.026118039467490858
  (0, 166526)	0.06311950227298327
  (0, 61579)	0.0633332298506845
  (0, 207265)	0.02985828604158815
  (0, 113772)	0.040878970181401304
  :	:
  (66757, 111422)	0.02098306211918183
  (66757, 157381)	0.02869818787923447
  (66757, 151310)	0.02589476612330671
  (66757, 23964)	0.042012381

In [72]:
model = LogisticRegression()

model.fit(train_data_V, train_l)

prediction = model.predict(test_data_V)

accuracy= accuracy_score(test_l, prediction)

cm = confusion_matrix(Y_test, prediction)

print(f'this is the accuracy: {accuracy}')
print(f'this is the confusion matrix: \n {cm}')

this is the accuracy: 0.9839424805272619
this is the confusion matrix: 
 [[7651  192]
 [  76 8771]]


In [80]:
visual_samples = ("Hurry up! Sale ends soon! Buy one get one free on all electronics. Visit our store now!",
"You have been selected for an exclusive reward! Claim your free luxury cruise tickets today!",
"Act now to secure your spot in this once-in-a-lifetime investment opportunity. Guaranteed high returns!",
"You won a $1000 gift card! Click here to redeem it immediately before the offer expires.",
"Cheap pharmaceuticals available for a limited time. No prescription needed. Order now!",
"You're the lucky winner of our grand prize. Send your details to receive a brand new car!",
"This is not a joke! You are the 1,000,000th visitor and have won an iPhone. Click to claim!",
"Exclusive deal just for you! 70% off on all designer clothes. Don't miss out, shop now!",
"Hi, can we reschedule tomorrow's meeting to 2 PM? The current time clashes with another appointment.",
"Reminder: Your subscription to the magazine is about to expire. Please renew to continue receiving issues.",
"Dear team, please review the attached project plan and provide your feedback by EOD.",
"Hi Mom, just checking in. Hope everything is fine. Let's catch up over the weekend.",
"John, I found a few interesting articles on data science. I'll forward them to you.",
"Invitation: You're cordially invited to Emma's wedding on June 15th. Please RSVP.",
"Thank you for shopping with us. Your order has been shipped and should arrive in 3-5 business days.",
"Congratulations! You've been selected to win a brand new iPhone! Click here to claim your prize now!")

In [79]:
import random

In [90]:

email_sample = random.choice(visual_samples)

prepros_sample = preprocess_emails(email_sample)

sample_V = V.transform([prepros_sample])

sample_pred = model.predict(sample_V)

if sample_pred[0] == 1:
    print(f'this email is classified as SPAM \n {email_sample} ')
else:    
    print(f'this email is classified as NON-SPAM \n {email_sample} ')

this email is classified as NON-SPAM 
 Dear team, please review the attached project plan and provide your feedback by EOD. 
