# Libraries

In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it further down.
nltk.download('stopwords')
import ipywidgets as widgets
from IPython.display import display
import pickle

ModuleNotFoundError: No module named 'nltk'

# Load & view Data

In [8]:
# Path of the CSV to load (a relative path, resolved against the working directory).
file_path = 'lab11file.csv'
df = pd.read_csv(file_path)
# NOTE(review): later cells index df['text'] and df['spam'], but the recorded
# df.info() output for this file lists neither column — confirm this is the
# intended dataset before running the rest of the notebook.
print("Let's we see first Ten rows of our data")
df.head(10)  


Let's we see first Ten rows of our data


Unnamed: 0,Title,Location,Model,KM,Fuel Type,CC,Specs,Price
0,Honda Accord 2005 CL7 for Sale,Karachi,2005,110000,Petrol,2000.0,Automatic,23lacs
1,Hyundai Sonata 2021 2.5 for Sale,Gujranwala,2021,45000,Petrol,2500.0,Automatic,88lacs
2,Honda Civic Rebirth 2013 VTi Oriel 1.8 i-VTEC ...,Lahore,2013,170000,Petrol,1800.0,Manual,24.75lacs
3,Suzuki Cultus 2016 Limited Edition for Sale,Lahore,2016,96000,Petrol,1000.0,Manual,15.45lacs
4,Suzuki Alto 2019 VXR for Sale,Karachi,2019,55000,Petrol,660.0,Manual,22.25lacs
5,Prince Pearl 2023 MT for Sale,Karachi,2023,11000,Petrol,800.0,Manual,13.45lacs
6,Honda Civic Reborn 2011 VTi Oriel Prosmatec 1....,Rawalpindi,2011,156000,Petrol,1800.0,Automatic,25.3lacs
7,Toyota Raize 2020 Z for Sale,Rawalpindi,2020,80000,Petrol,1000.0,Automatic,59.5lacs
8,Suzuki Alto 2021 L Upgrade for Sale,Rawalpindi,2021,13000,Petrol,660.0,Automatic,32.25lacs
9,Toyota Land Cruiser 2003 for Sale,Lahore,2003,253000,Diesel,4200.0,Manual,1.2crore


In [9]:
# Preview the last ten rows of the loaded frame.
print("Let's we see last Ten rows of our data")
df.tail(10)

Let's we see last Ten rows of our data


Unnamed: 0,Title,Location,Model,KM,Fuel Type,CC,Specs,Price
3685,Toyota Yaris 2021 GLI CVT 1.3 for Sale,Karachi,2021,19500,Petrol,1300.0,Automatic,45lacs
3686,Suzuki Alto 2011 VXR (CNG) for Sale,Risalpur,2011,212000,CNG,1000.0,Manual,9.95lacs
3687,Honda City 5th (GM2) Generation 2015 Aspire Pr...,Lahore,2015,76000,Petrol,1500.0,Automatic,34.85lacs
3688,Toyota Corolla 2016 XLi VVTi for Sale,Sargodha,2016,120,Petrol,1300.0,Manual,33.5lacs
3689,Suzuki Alto 2022 VXR for Sale,Islamabad,2022,49000,Petrol,660.0,Manual,25.65lacs
3690,Proton X70 2021 Premium FWD for Sale,Islamabad,2021,38000,Petrol,1500.0,Automatic,65lacs
3691,Toyota Corolla 2015 Altis Automatic 1.6 for Sale,Islamabad,2015,257000,Petrol,1600.0,Automatic,33.4lacs
3692,Suzuki Alto 2021 VXL AGS for Sale,Rawalpindi,2021,27850,Petrol,660.0,Automatic,28.8lacs
3693,Toyota Corolla 2005 XLi for Sale,Islamabad,2005,114203,Petrol,1300.0,Manual,28.5lacs
3694,Honda Civic 2021 Oriel 1.8 i-VTEC CVT for Sale...,Islamabad,2021,50358,Petrol,1800.0,Automatic,64.75lacs


In [10]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3695 entries, 0 to 3694
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Title      3695 non-null   object 
 1   Location   3695 non-null   object 
 2   Model      3695 non-null   int64  
 3   KM         3695 non-null   object 
 4   Fuel Type  3695 non-null   object 
 5   CC         3695 non-null   float64
 6   Specs      3695 non-null   object 
 7   Price      3639 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 231.1+ KB


In [11]:
# Count how many rows carry each value of the 'spam' column.
# NOTE(review): the recorded output of this cell is KeyError: 'spam' — the frame
# loaded above has no such column; confirm the dataset path.
print(df['spam'].value_counts())

KeyError: 'spam'

### Null values

In [None]:
# Count the missing values in each column.
df.isnull().sum()

text    0
spam    0
dtype: int64

# Clean the text data 

In [None]:
def clean_text(text):
    """Normalize one raw message into lowercase, space-separated word tokens.

    Steps, in order:
      1. Missing values (``None`` or float NaN) become ``''``; any other
         non-string value is converted with ``str()``.
      2. Every occurrence of the literal placeholder ``escapenumber`` is
         deleted, ignoring case, so ``EscapeNumber`` is removed as well.
      3. Every non-word character is replaced by a space.
      4. The text is lowercased.
      5. Each run of whitespace is collapsed to a single space.

    Args:
        text: The raw message, normally a ``str``.

    Returns:
        str: The cleaned text. It is not stripped, so a single leading or
        trailing space can remain when the input started or ended with
        non-word characters.
    """
    # NaN is the only float that is not equal to itself; pandas uses it for
    # missing cells, and re.sub would raise TypeError on it.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Case-insensitive so the placeholder is removed however it was cased;
    # the original exact-case match ran before lowercasing and missed variants.
    text = re.sub(r'escapenumber', '', text, flags=re.IGNORECASE)

    text = re.sub(r'\W', ' ', text)

    text = text.lower()

    return re.sub(r'\s+', ' ', text)

# Overwrite the 'text' column with its cleaned version, then preview the result.
df['text'] = df['text'].apply(clean_text)
df.head(10)

Unnamed: 0,text,spam
0,subject naturally irresistible your corporate ...,1
1,subject the stock trading gunslinger fanny is ...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject do not have money get software cds fro...,1
5,subject great nnews hello welcome to medzonlin...,1
6,subject here s a hot play in motion homeland s...,1
7,subject save your money buy getting this thing...,1
8,subject undeliverable home based business for ...,1
9,subject save your money buy getting this thing...,1


## Removing stopwords

In [None]:
# Materialize NLTK's English stopword list as a set; remove_stopwords tests membership against it.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token that appears in ``stop_words``.

    Args:
        text: A string; it is tokenized with ``str.split()``.

    Returns:
        str: The surviving tokens, in their original order, joined by
        single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Overwrite the 'text' column with its stopword-free version, then preview the result.
df['text'] = df['text'].apply(remove_stopwords)
df.head(10)

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject money get software cds software compat...,1
5,subject great nnews hello welcome medzonline s...,1
6,subject hot play motion homeland security inve...,1
7,subject save money buy getting thing tried cia...,1
8,subject undeliverable home based business grow...,1
9,subject save money buy getting thing tried cia...,1


In [None]:
# Single Porter stemmer instance shared by every stem_text call.
stemmer = PorterStemmer()

def stem_text(text):
    """Stem each whitespace-separated token of ``text`` with ``stemmer``.

    Args:
        text: A string; it is tokenized with ``str.split()``.

    Returns:
        str: The stemmed tokens, in their original order, joined by
        single spaces.
    """
    return ' '.join(map(stemmer.stem, text.split()))

# Overwrite the 'text' column with its stemmed version, then preview the result.
# NOTE(review): 'text' is reassigned in place, so re-running this cell feeds
# already-stemmed text through stem_text again.
df['text'] = df['text'].apply(stem_text)
df.head(10)

Unnamed: 0,text,spam
0,subject natur irresist corpor ident lt realli ...,1
1,subject stock trade gunsling fanni merril muzo...,1
2,subject unbeliev new home made easi im want sh...,1
3,subject 4 color print special request addit in...,1
4,subject money get softwar cd softwar compat gr...,1
5,subject great nnew hello welcom medzonlin sh g...,1
6,subject hot play motion homeland secur invest ...,1
7,subject save money buy get thing tri ciall yet...,1
8,subject undeliver home base busi grownup messa...,1
9,subject save money buy get thing tri ciall yet...,1


### TF-IDF features and labels (Spam = 1, Not Spam = 0)

In [None]:

# Turn the preprocessed text into TF-IDF features, keeping at most 5000 terms.
# The matrix stays in the sparse form returned by fit_transform: train_test_split
# and MultinomialNB both accept sparse input, so calling .toarray() would only
# allocate a dense, mostly-zero (n_documents x 5000) array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so vocabulary/IDF statistics from the test rows leak into the
# features — consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])

# Target labels are taken straight from the 'spam' column.
y = df['spam']


### Split the dataset 

In [None]:

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a multinomial Naive Bayes classifier, with default hyperparameters, on the training split.
model = MultinomialNB()
model.fit(X_train, y_train)


### Evaluate the trained model

In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Score the predictions; each metric is called as metric(y_true, y_pred)
# with its default arguments, in the same order as the names on the left.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# One metric per output line.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep='\n',
)


Accuracy: 0.9781849912739965
Precision: 0.9818181818181818
Recall: 0.9310344827586207
F1 Score: 0.9557522123893805


In [None]:
# Persist the trained classifier; the file name and its contents are unchanged.
with open("spam_email_model.pkl", "wb") as f:
    pickle.dump(model, f)
# The classifier was trained on TF-IDF vectors, so the fitted vectorizer is
# saved next to it. Without it, the pickled model cannot score new raw text
# in a fresh session.
with open("spam_email_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
print("Model has been saved to 'spam_email_model.pkl'")

Model has been saved to 'spam_email_model.pkl'
