 # Spam email filtering

In [75]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# Fetch the NLTK resources used later in this notebook: tokenizer data for
# word_tokenize and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): presumably required because newer NLTK releases load tokenizer
# data from 'punkt_tab' rather than 'punkt' — confirm against the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Load the dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell fails on any other
# machine; consider a path relative to the notebook. The source is a .zip
# archive; presumably pandas decompresses it based on the extension — confirm.
df = pd.read_csv(r"C:\Users\Administrator\OneDrive\Documents\Spam_Emails_archive.zip")

In [6]:
# Display the loaded frame (the rendered output shows the columns Category and Message).
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
# Count missing values per column.
df.isnull().sum()

Category    0
Message     0
dtype: int64

 # step 1 : data preprocessing

In [49]:
# Encode the text classes in "Category" as integers and store them in a new
# "label" column (the rows displayed later in this notebook show ham -> 0, spam -> 1).
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Category"])

In [51]:
# Build the English stop-word collection once, as a set, for the membership
# checks performed in preprocess().
stop_words = set(stopwords.words('english'))

In [35]:
def preprocess(Message):
    """Normalise a raw message string for vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``; only tokens
    that are purely alphabetic and not present in ``stop_words`` are kept.

    Args:
        Message: The raw message text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives the filter).
    """
    kept = []
    for token in word_tokenize(Message.lower()):
        # Drop numbers, punctuation and mixed tokens such as "2nd".
        if not token.isalpha():
            continue
        # Drop common words that carry no class signal.
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Clean every message and keep the result alongside the raw text.
df["clean_Message"] = df["Message"].apply(preprocess)
# Show raw and cleaned text side by side as a sanity check.
print(df[["Message" , "clean_Message"]])
    
    

                                                Message  \
0     Go until jurong point, crazy.. Available only ...   
1                         Ok lar... Joking wif u oni...   
2     Free entry in 2 a wkly comp to win FA Cup fina...   
3     U dun say so early hor... U c already then say...   
4     Nah I don't think he goes to usf, he lives aro...   
...                                                 ...   
5567  This is the 2nd time we have tried 2 contact u...   
5568               Will ü b going to esplanade fr home?   
5569  Pity, * was in mood for that. So...any other s...   
5570  The guy did some bitching but I acted like i'd...   
5571                         Rofl. Its true to its name   

                                          clean_Message  
0     go jurong point crazy available bugis n great ...  
1                               ok lar joking wif u oni  
2     free entry wkly comp win fa cup final tkts may...  
3                   u dun say early hor u c already say  
4

It cleans the text data by removing common stopwords and non-alphabetic tokens. It first converts the text to lowercase and splits it into tokens. Then it filters out stopwords and joins the remaining words back into a single string. The cleaned text is saved in a new column called `clean_Message`.

In [53]:
# Turn the cleaned messages into a sparse TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so vocabulary and IDF statistics from the test rows leak
# into the features; fitting on the training rows only would give a cleaner
# evaluation.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df["clean_Message"])
# Target vector: the integer-encoded "label" column created by the label
# encoder. (Previously spelled "lebel", which raised KeyError and left `y`
# undefined for the later split.)
y = df["label"]
# Preview the raw text, cleaned text and encoded label together.
df[['Category', 'Message', 'clean_Message', 'label']].head()

Unnamed: 0,Category,Message,clean_Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts may...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,0


`TfidfVectorizer` is used to convert the cleaned text into numerical values that reflect how important each word is in the text. The resulting feature matrix is stored in `x` and the labels in `y`.

 # step 2 : Model training

In [66]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Fit a multinomial Naive Bayes classifier on the training portion only.
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB is used for classifying text data based on word frequencies or TF-IDF features

# step 3 : model evaluation

In [77]:
# Predict labels for the held-out test set.
y_pred = model.predict(X_test)

# Report overall accuracy, then per-class precision, recall and F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9713004484304932
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



 # step 4 : make prediction

In [89]:
def predict_emotion(Message):
    """Classify a single raw message as ``"spam"`` or ``"ham"``.

    Despite its name, this function does not detect emotions: it runs the
    message through ``preprocess``, vectorises it with the already-fitted
    ``vectorizer`` and asks ``model`` for a class prediction.

    Args:
        Message: The raw message text to classify.

    Returns:
        ``"spam"`` when the predicted class equals 1, otherwise ``"ham"``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess(Message)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "spam"
    return "ham"


In [91]:
# Smoke-test the classifier on a single hand-written message.
print(predict_emotion("Lol your always convencing"))

ham


This function cleans the input text, converts it into a numerical format using the fitted vectorizer, and then uses the trained model to predict and return the label: "spam" or "ham" (despite its name, `predict_emotion` does not detect emotions).