<a href="https://colab.research.google.com/github/AmoolyaS/Machine_Learning_mini_projects/blob/main/Email_spam_ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook trains a classifier that labels an email as spam (1) or ham (0).

# Importing Dependencies

In [None]:
# For data preprocessing
import string # To filter out punctuations from the email text
import numpy as np  # For numerical operations on arrays
import pandas as pd  # For data manipulation and analysis

import nltk #importing toolkit
from nltk.corpus import stopwords # to exclude stop words
from nltk.stem.porter import PorterStemmer #to reduce words to stem level

from sklearn.feature_extraction.text import CountVectorizer # to convert text to vectors
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.ensemble import RandomForestClassifier

# Google Colab specific imports for Drive integration
from google.colab import drive  # To mount and interact with Google Drive

# Utility libraries for file handling and operations
import os  # For interacting with the operating system and file management
import shutil  # For high-level file operations
import zipfile  # For working with ZIP archives
import random  # For generating random numbers and shuffling data

In [None]:
nltk.download("stopwords")  # stop-word corpus; used later to filter common English words out of the emails

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Loading Dataset

In [None]:
# Archive location on Google Drive, and the folder its contents are unpacked into
zip_file_path = '/content/drive/MyDrive/spam_ham_email_data.zip'
unzip_dir = 'Spam_ham_email_data'

# The dataset is stored on Drive, so Drive must be mounted before the archive is opened
drive.mount("/content/drive")

# Unpack every member of the archive into unzip_dir
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=unzip_dir)

# List what was extracted
os.listdir(unzip_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['spam_ham_dataset.csv']

In [None]:
# Read the CSV from the folder the archive was extracted into. Building the path
# from unzip_dir (instead of hard-coding /content/...) keeps the load working
# even when the notebook's working directory is not /content.
df = pd.read_csv(os.path.join(unzip_dir, 'spam_ham_dataset.csv'))

In [None]:
# Preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


# Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the loaded frame
df.shape

(5171, 4)

In [None]:
# Put each email on a single line: replace Windows line breaks ("\r\n") with a space.
# regex=False makes this a literal substring replacement.
df["text"] = df["text"].str.replace("\r\n", " ", regex=False)


In [None]:
# Show the first email after the line-break cleanup
df["text"].iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
label,0
text,0
label_num,0


In [None]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


# Data Preprocessing

In [None]:
# Plan for this section: lower-case each email, strip punctuation, drop stop words,
# and reduce every remaining word to its stem.
stemmer = PorterStemmer()
# Quick demonstration of what stemming does to a single word
stemmer.stem("sophistication")

'sophist'

In [None]:
# English stop words as a set for fast membership checks
stopwords_set = set(stopwords.words("english"))
# Table that deletes every character in string.punctuation; built once rather than once per email
punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess_email(raw_text):
    """Normalize one email so it can be vectorized.

    Lower-cases the text, deletes punctuation, splits on whitespace, drops
    English stop words and Porter-stems the words that remain.

    Args:
        raw_text: the email as a single string.

    Returns:
        The surviving stems joined by single spaces.
    """
    tokens = raw_text.lower().translate(punctuation_table).split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stopwords_set)


# Transformed version of every email, in the same row order as df
corpus = [preprocess_email(email) for email in df["text"]]

In [None]:
# Raw text of the first email, for comparison with its cleaned version
df["text"].iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [None]:
# The same first email after lower-casing, punctuation/stop-word removal and stemming
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [None]:
# Vectorize the cleaned emails into word-count features
vectorizer = CountVectorizer()

# NOTE: the vocabulary is learned from the whole corpus before the split below,
# so words that occur only in test emails are still part of the feature space.
X = vectorizer.fit_transform(corpus).toarray()  # dense copy of the count matrix
y = df.label_num

# Hold out 20% of the emails for testing. A fixed random_state makes the split
# (and therefore the reported score) repeatable across runs; stratify=y keeps
# the spam/ham proportion the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Word-count vector of the first email
X[0]

array([1, 0, 0, ..., 0, 0, 0])

# Model Building

In [None]:
# Random forest with default hyper-parameters; n_jobs=-1 lets it use all available cores.
# A fixed random_state makes the fitted model, and so its score, repeatable across runs.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Train the classifier on the training split
clf.fit(X_train,y_train)

In [None]:
# Score the fitted model on the held-out test split
clf.score(X_test,y_test)

0.9845410628019323

# Model Evaluation

In [None]:
# Take the email at row position 10 as the sample to classify, and display it
email_to_classify = df["text"].iloc[10]
email_to_classify

"Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) application solutions , including the award 

## Pre-processing

In [None]:
# Clean the sample with the same steps used to build `corpus`:
# lower-case, delete punctuation, split on whitespace, drop stop words, stem.
# The deletion table is built with empty from/to strings, exactly as in the corpus-building cell.
email_text = email_to_classify.lower().translate(str.maketrans('', '', string.punctuation)).split()
email_text = [stemmer.stem(word) for word in email_text if word not in stopwords_set]
email_text = " ".join(email_text)

# Wrap the single cleaned string in a list so it can be passed to the vectorizer as a one-document corpus.
# (The variable name is kept as-is because the next cell refers to it.)
email_coprus = [email_text]

In [None]:
# transform (not fit_transform): encode the sample with the vocabulary already learned from `corpus`
X_email= vectorizer.transform(email_coprus)

In [None]:
clf.predict(X_email)  # predicted label for the sample: 1 = spam, 0 = ham

array([1])

In [None]:
# Ground truth for the sample at row position 10: the text label, then its numeric code
print(df["label"].iloc[10])
df["label_num"].iloc[10]

spam


1