In [91]:
import json
import os
import zipfile

In [92]:
with open("kaggle.json") as f:
    kaggle_token= json.load(f)

In [None]:
!kaggle datasets download -d uciml/sms-spam-collection-dataset

In [None]:
with zipfile.ZipFile("sms-spam-collection-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
import pandas as pd
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [67]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

In [68]:
stop_words = set(stopwords.words("english"))

In [69]:
data = pd.read_csv("spam.csv",encoding="latin-1")[["v1","v2"]]

In [70]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [71]:
data.columns = ['label', 'message']

In [72]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [73]:
data['label'] = data["label"].map({"ham": 0, "spam": 1})

In [74]:
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [75]:
def preprocess_text(text):
    text = re.sub(r"\W", " ", text)
    text = text.lower()
    words=text.split()
    words=[stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

In [76]:
data["cleaned_message"] = data["message"].apply(preprocess_text)

In [77]:
data.head()

Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [79]:
vectorizer = TfidfVectorizer(max_features =3000)
X = vectorizer.fit_transform(data["cleaned_message"])

In [80]:
data.head()

Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [81]:
y = data['label']

In [82]:
X_train, X_test,y_train, y_test = train_test_split(X, y,train_size = 0.2, random_state=42)

In [83]:
model = LogisticRegression()

In [84]:
model.fit(X_train, y_train)

In [85]:
y_pred= model.predict(X_test)

In [86]:
print(f"accuracy: {accuracy_score(y_test, y_pred)* 100:.2f}%")

accuracy: 91.68%


In [87]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3868
           1       0.99      0.38      0.54       590

    accuracy                           0.92      4458
   macro avg       0.95      0.69      0.75      4458
weighted avg       0.92      0.92      0.90      4458



In [88]:
def predict_email(email_text):
    processed_text = preprocess_text(email)
    processed_data = preprocess_text(email_text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return "Spam" if prediction[0] == 1 else "Ham - Not Spam"

In [89]:
email ="You have been selected for a chance to get a FREE iPhone! Click here to claim: www.fakeprize.com"

In [90]:
print(f"Email: {email}/n Prediction: {predict_email(email)}")

Email: You have been selected for a chance to get a FREE iPhone! Click here to claim: www.fakeprize.com/n Prediction: Spam
