# Spam Mails Dataset [Kaggle]
[Data Set from Kaggle](https://www.kaggle.com/datasets/venky73/spam-mails-dataset)

In [20]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Absolute path to the Kaggle spam/ham CSV; adjust it when running the
# notebook somewhere the file lives in a different location.
DATASET_PATH = '/content/spam_ham_dataset.csv'

# The file is decoded as latin-1, as requested explicitly here.
df = pd.read_csv(DATASET_PATH, encoding='latin-1')
print('Shape:', df.shape)
df.head()

Shape: (5171, 4)


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


> We only need the `label` and `text` columns from `df`; `text` is renamed to `message`.

In [22]:
# Keep only the label and the raw e-mail text, renaming `text` to `message`.
# rename() runs first and returns a new frame (a missing `text` column is
# ignored by default), so this cell can safely be re-run. The trailing copy()
# yields an independent frame, which avoids the SettingWithCopyWarning that an
# in-place rename on a column slice produces, here and when columns are added
# to `df` later.
df = df.rename(columns={'text': 'message'})[['label', 'message']].copy()
df.head()

Unnamed: 0,label,message
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [23]:
# Fetch NLTK's stop-word lists; they are read via stopwords.words() in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

- `SnowballStemmer` reduces each word to its stem by stripping suffixes (e.g. `windows` → `window`, `office` → `offic`). The stem is not always a dictionary word.

- `stop words` are words that are commonly used in a language and do not carry much meaning or significance. Examples of stop words include “the”, “and”, “a”, “an”, etc.

In [24]:
# English stop words held in a set; clean_text checks tokens for membership in it.
stop_words = set(stopwords.words('english'))
# Snowball stemmer configured for English; clean_text uses it to stem tokens.
stemmer = SnowballStemmer('english')

In [25]:
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# (3.9+) load it from the 'punkt_tab' package instead of the pickled 'punkt'
# package, so fetch both to keep the notebook runnable across NLTK versions.
# 'punkt' stays last so the cell's displayed result is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
def clean_text(text):
    """Normalise one message into a space-separated string of stems.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped;
    every remaining token is stemmed with ``stemmer``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces (empty string if no token
        survives the filters).
    """
    kept_stems = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue  # numbers, punctuation and mixed tokens are discarded
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

In [27]:
# Run clean_text on every message and store the result in a new column.
df['cleaned_message'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,label,message,cleaned_message
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,subject enron methanol meter follow note gave ...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",subject hpl nom januari see attach file hplnol...
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",subject neon retreat ho ho ho around wonder ti...
3,spam,"Subject: photoshop , windows , office . cheap ...",subject photoshop window offic cheap main tren...
4,ham,Subject: re : indian springs\r\nthis deal is t...,subject indian spring deal book teco pvr reven...


In [28]:
# Bag-of-words features: learn the vocabulary from the cleaned messages and
# build the document-term count matrix in a single step.
corpus = df['cleaned_message'].values
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus)

In [29]:
# Fit a multinomial Naive Bayes model on the token counts, using the values
# of the `label` column ('ham' / 'spam') as the target.
targets = df['label'].values
classifier = MultinomialNB()
classifier.fit(counts, targets)

In [34]:
# Classify a couple of hand-written messages. They go through the same
# clean_text() preprocessing as the training messages; otherwise unstemmed
# words (e.g. "smartphones") cannot match the stemmed vocabulary learned by
# the vectorizer and are silently ignored.
examples = ['Free smartphones', "I'm going to attend the Linux users group tomorrow."]
cleaned_examples = [clean_text(example) for example in examples]
example_counts = vectorizer.transform(cleaned_examples)
predictions = classifier.predict(example_counts)
print(predictions)

['spam' 'ham']
