In [13]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
%matplotlib inline

In [14]:
raw_data = pd.read_csv('emails.csv', engine = 'python')
raw_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [15]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [16]:
raw_data['spam'].value_counts()


spam
0    4360
1    1368
Name: count, dtype: int64

In [17]:
raw_data.isnull().sum()


text    0
spam    0
dtype: int64

In [18]:
# Convert text to lowercase
raw_data['text'] = raw_data['text'].str.lower()


In [19]:
raw_data.head()

Unnamed: 0,text,spam
0,subject: naturally irresistible your corporate...,1
1,subject: the stock trading gunslinger fanny i...,1
2,subject: unbelievable new homes made easy im ...,1
3,subject: 4 color printing special request add...,1
4,"subject: do not have money , get software cds ...",1


In [20]:

raw_data['text'] = raw_data['text'].apply(
    lambda x: re.sub(r'[^\w\s]', '', x)  # removes everything except word chars and whitespace
)


In [21]:
raw_data.head()

Unnamed: 0,text,spam
0,subject naturally irresistible your corporate ...,1
1,subject the stock trading gunslinger fanny is...,1
2,subject unbelievable new homes made easy im w...,1
3,subject 4 color printing special request addi...,1
4,subject do not have money get software cds fr...,1


In [22]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\well\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

raw_data['text'] = raw_data['text'].apply(remove_stopwords)


In [24]:
raw_data.head()

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject money get software cds software compat...,1


In [25]:
import re

def custom_cleanup(text):
    # 1. Remove the exact word "subject" (case-insensitive if needed)
    #    Using \b ensures we match the standalone word "subject"
    text = re.sub(r'\bsubject\b', '', text, flags=re.IGNORECASE)
    
    # 2. Remove all digits
    text = re.sub(r'\d+', '', text)
    
    # 3. Replace multiple spaces (including newlines) with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

raw_data['text'] = raw_data['text'].apply(custom_cleanup)


In [26]:
raw_data.head()

Unnamed: 0,text,spam
0,naturally irresistible corporate identity lt r...,1
1,stock trading gunslinger fanny merrill muzo co...,1
2,unbelievable new homes made easy im wanting sh...,1
3,color printing special request additional info...,1
4,money get software cds software compatibility ...,1


In [31]:
# 1. Imports
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# 2. Download NLTK stopwords if not already done
nltk.download('stopwords')

# 3. Prepare the stopwords list
stop_words = stopwords.words('english')

# 4. Create the TfidfVectorizer with parameters
vectorizer = TfidfVectorizer(
    max_features=2500,  # keep only top 2500 words
    min_df=5,           # ignore words that appear in <5 docs
    max_df=0.7,         # ignore words that appear in >70% of docs
    stop_words=stop_words
)

# 5. Fit the vectorizer on the text data and transform it to a dense array
X = vectorizer.fit_transform(raw_data['text']).toarray()

# 6. Define the target variable
y = raw_data['spam']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\well\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
from sklearn.model_selection import train_test_split

# Split data: 80% training, 20% testing (you can adjust the test_size as you like)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42  # for reproducibility
)


In [35]:


# Instantiate the model
model = MultinomialNB()


In [36]:
model.fit(X_train, y_train)


In [37]:
y_pred = model.predict(X_test)


In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report (includes precision, recall, F1-score)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.9781849912739965
Confusion Matrix:
 [[849   7]
 [ 18 272]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       856
           1       0.97      0.94      0.96       290

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [47]:
print(raw_data['text'][56])
print(y[56])

localized software languages available hello would like offer localized software versions german french spanish uk many others aii iisted software available immediate downioad need wait week cd deiivery exampies norton lnternet security pro windows xp professionai sp fuil version corei draw graphics suite dreamweaver mx homesite inciudinq macromedia studio mx browse site find software need native ianguaqe best reqards kayieen
1


In [48]:
print (model.predict(vectorizer.transform([raw_data['text'][56]])))

[1]
