In [None]:
# Importing necessary libraries from the NLTK toolkit
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Extra support for tokenization
nltk.download('stopwords')  # Predefined stopword lists for various languages

# Downloading the dataset from Kaggle using Kaggle CLI
!kaggle datasets download -d abdallahwagih/spam-emails  # Dataset containing spam emails
!unzip spam-emails.zip  # Extracting the downloaded dataset

# Importing pandas for working with data in tabular format
import pandas as pd

# Loading the CSV dataset into a DataFrame
df = pd.read_csv("spam.csv")  # CSV contains columns like 'Message' and labels indicating spam or not

# Step to clean the text data:
# - Removing punctuation, special characters, and multiple spaces
# - Preparing data for tokenization and further text processing

import re  # Regular expressions for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Message']:  # Looping through each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing all characters except words and spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Stripping leading and trailing whitespace
    cleaned.append(cleaned_data)  # Adding the cleaned text to the list

# Tokenizing the cleaned text into words
# This step splits each cleaned text into a list of words
tokens = [word_tokenize(x) for x in cleaned]
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stpktn = []  # List to store stopword-removed tokens
for k in range(len(df['Message'])):  # Loop through the tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filter out tokens that are in the stopword list
    stpktn.append(p)
    """
stop=set(stopwords.words("english"))
s=[]
for i in tokens:
  for j in i:
    if j not in stop:
      s.append(j) """
# Removing stopwords from tokenized words
# Stopwords are commonly used words like "is", "the", "and", etc., which are removed to reduce noise
 # Append the filtered tokens to the list

# Summary of steps:
# 1. Dataset is downloaded and loaded into a pandas DataFrame.
# 2. Text messages are cleaned by removing punctuation, special characters, and extra spaces.
# 3. The cleaned text is tokenized into words.
# 4. Stopwords are removed to focus on meaningful words.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
Downloading spam-emails.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 39.1MB/s]
Archive:  spam-emails.zip
  inflating: spam.csv                


In [None]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [None]:
stpktn

[['Go',
  'jurong',
  'point',
  'crazy',
  'Available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'Cine',
  'got',
  'amore',
  'wat'],
 ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'],
 ['Free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  'Text',
  'FA',
  '87121',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'rateTCs',
  'apply',
  '08452810075over18s'],
 ['U', 'dun', 'say', 'early', 'hor', 'U', 'c', 'already', 'say'],
 ['Nah', 'I', 'dont', 'think', 'goes', 'usf', 'lives', 'around', 'though'],
 ['FreeMsg',
  'Hey',
  'darling',
  '3',
  'weeks',
  'word',
  'back',
  'Id',
  'like',
  'fun',
  'still',
  'Tb',
  'ok',
  'XxX',
  'std',
  'chgs',
  'send',
  '150',
  'rcv'],
 ['Even',
  'brother',
  'like',
  'speak',
  'They',
  'treat',
  'like',
  'aids',
  'patent'],
 ['As',
  'per',
  'request',
  'Melle',
  'Melle',
  'Oru',
  'Minnaminunginte',
  'Nurungu',
  'Vettam',
  'set',

In [None]:
# Alias used by the following cells for the stopword-filtered tokens
stop_token=stpktn

# Quick sanity check of the stemmer; kept as the last expression so the
# notebook actually displays the result instead of discarding it.
ps.stem("Studied")

In [None]:
# Stemming: reduce every token of every message to its root form.
# The result keeps the nested shape of stop_token (one list per message).
stemdata = [
    [ps.stem(token) for token in token_list]
    for token_list in stop_token
]


In [None]:
stemdata

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'i', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '150',
  'rcv'],
 ['even',
  'brother',
  'like',
  'speak',
  'they',
  'treat',
  'like',
  'aid',
  'patent'],
 ['as',
  'per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'call

In [None]:
#applying pos_tagging to data
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
pos_tag(["studying","running","ram","in","and"])

[('studying', 'VBG'),
 ('running', 'VBG'),
 ('ram', 'NN'),
 ('in', 'IN'),
 ('and', 'CC')]

In [None]:
# Tag every message in a single batch call: pos_tag_sents obtains the tagger
# once for the whole corpus rather than once per message, and returns the same
# list-of-lists of (token, tag) tuples as calling pos_tag on each message.
pos_token = nltk.pos_tag_sents(stop_token)

In [None]:
pos_token

[[('Go', 'VB'),
  ('jurong', 'JJ'),
  ('point', 'NN'),
  ('crazy', 'NN'),
  ('Available', 'NNP'),
  ('bugis', 'NN'),
  ('n', 'RB'),
  ('great', 'JJ'),
  ('world', 'NN'),
  ('la', 'NN'),
  ('e', 'VBP'),
  ('buffet', 'JJ'),
  ('Cine', 'NNP'),
  ('got', 'VBD'),
  ('amore', 'RB'),
  ('wat', 'JJ')],
 [('Ok', 'NNP'),
  ('lar', 'JJ'),
  ('Joking', 'NNP'),
  ('wif', 'NN'),
  ('u', 'NN'),
  ('oni', 'NN')],
 [('Free', 'JJ'),
  ('entry', 'NN'),
  ('2', 'CD'),
  ('wkly', 'JJ'),
  ('comp', 'NN'),
  ('win', 'VBP'),
  ('FA', 'NNP'),
  ('Cup', 'NNP'),
  ('final', 'JJ'),
  ('tkts', 'NN'),
  ('21st', 'CD'),
  ('May', 'NNP'),
  ('2005', 'CD'),
  ('Text', 'NNP'),
  ('FA', 'NNP'),
  ('87121', 'CD'),
  ('receive', 'JJ'),
  ('entry', 'NN'),
  ('questionstd', 'NN'),
  ('txt', 'NN'),
  ('rateTCs', 'NN'),
  ('apply', 'VBP'),
  ('08452810075over18s', 'CD')],
 [('U', 'JJ'),
  ('dun', 'NNS'),
  ('say', 'VBP'),
  ('early', 'JJ'),
  ('hor', 'NN'),
  ('U', 'NNP'),
  ('c', 'NN'),
  ('already', 'RB'),
  ('say', 'VB')],

23-01-25

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download("omw-1.4")
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
lemmatizer.lemmatize("studying","v")

'study'

In [None]:
# Function to convert nltk's POS tags to WordNet's POS tags
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    """Map an NLTK POS tag (e.g. 'VB', 'JJ') to the matching WordNet POS constant.

    Only the first letter of the tag is significant:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # Unknown POS tag: default to noun
    return wordnet.NOUN

In [None]:
pos_token[0][0][1]

'VB'

In [None]:
get_wordnet_pos(pos_token[0][0][1])

'v'

In [None]:
lemmatizer.lemmatize(pos_token[0][0][0],get_wordnet_pos(pos_token[0][0][1]))

'Go'

In [None]:

[lemmatizer.lemmatize(pos_token[0][i][0],get_wordnet_pos(pos_token[0][i][1])) for i in range(len(pos_token[0]))]

['Go',
 'jurong',
 'point',
 'crazy',
 'Available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'get',
 'amore',
 'wat']

In [None]:
# Lemmatize every (token, tag) pair, translating the NLTK tag to its WordNet
# equivalent first. The output mirrors pos_token: one list of lemmas per message.
lem_data = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_message]
    for tagged_message in pos_token
]

In [None]:
lem_data

[['Go',
  'jurong',
  'point',
  'crazy',
  'Available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'Cine',
  'get',
  'amore',
  'wat'],
 ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'],
 ['Free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  'Text',
  'FA',
  '87121',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'rateTCs',
  'apply',
  '08452810075over18s'],
 ['U', 'dun', 'say', 'early', 'hor', 'U', 'c', 'already', 'say'],
 ['Nah', 'I', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['FreeMsg',
  'Hey',
  'darling',
  '3',
  'week',
  'word',
  'back',
  'Id',
  'like',
  'fun',
  'still',
  'Tb',
  'ok',
  'XxX',
  'std',
  'chgs',
  'send',
  '150',
  'rcv'],
 ['Even',
  'brother',
  'like',
  'speak',
  'They',
  'treat',
  'like',
  'aid',
  'patent'],
 ['As',
  'per',
  'request',
  'Melle',
  'Melle',
  'Oru',
  'Minnaminunginte',
  'Nurungu',
  'Vettam',
  'set',
  'c

24-01-25

In [None]:
# Bag of words: used to convert text into numerical data (count vectorization)
# NOTE(review): the saved output of this cell is a NameError (`lem_data` was not
# defined in that kernel session), so it only works after the lemmatization
# cells above have been run.
# NOTE(review): `stemedata` is not read anywhere else in the visible notebook -
# the following cell reads `stemdata`. Possibly a typo; confirm whether the
# lemmatized tokens were meant to replace the stemmed ones.
stemedata=lem_data

NameError: name 'lem_data' is not defined

In [None]:
# Rebuild one string per message for CountVectorizer. The tokens must be
# separated by a space: joining with '' glues the whole message into a single
# token, so the vectorizer would learn one feature per message instead of one
# feature per word.
stem_vec = [' '.join(message) for message in stemdata]

In [None]:
stem_vec

['gojurongpointcraziavailbugingreatworldlaebuffetcinegotamorwat',
 'oklarjokewifuoni',
 'freeentri2wklicompwinfacupfinaltkt21stmay2005textfa87121receiventriquestionstdtxtratetcappli08452810075over18',
 'udunsayearlihorucalreadisay',
 'nahidontthinkgoeusflivearoundthough',
 'freemsgheydarl3weekwordbackidlikefunstilltbokxxxstdchgsend150rcv',
 'evenbrotherlikespeaktheytreatlikeaidpatent',
 'asperrequestmellmelloruminnaminungintnurunguvettamsetcallertuncallerpress9copifriendcallertun',
 'winnerasvalunetworkcustomselectreceivea900prizerewardtoclaimcall09061701461claimcodekl341valid12hour',
 'hadmobil11monthurentitlupdatlatestcolourmobilcamerafreecallthemobilupdatcofree08002986030',
 'imgonnahomesoondontwanttalkstuffanymortonightkivecrienoughtoday',
 'sixchancwincashfrom10020000poundtxtcsh11send87575cost150pday6day16tsandcapplireplihl4info',
 'urgentyou1weekfreemembership100000prizejackpottxtwordclaimno81010tcwwwdbuknetlccltdpobox4403ldnw1a7rw18',
 'ivesearchrightwordthankbreatheripromiswont

In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
cv =CountVectorizer()

In [None]:
x=cv.fit_transform(stem_vec)

In [None]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
x=cv.fit_transform(stem_vec).toarray()

In [None]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y = df["Category"]

In [None]:
y

Unnamed: 0,Category
0,ham
1,ham
2,spam
3,ham
4,ham
...,...
5567,spam
5568,ham
5569,ham
5570,ham


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the forest (and the score reported below) is reproducible
# across kernel restarts.
rc = RandomForestClassifier(random_state=42)

In [None]:
rc.fit(x,y)

In [None]:
# Mean accuracy of the random forest on (x, y) - the same data it was fitted
# on in the previous cell, so this is a training score.
# NOTE(review): no held-out split is visible in this notebook; this value should
# not be read as an estimate of performance on unseen messages.
rc.score(x,y)

0.9996410624551328

In [None]:
#importing multinomialnb from sklearn (naive_bayes)
from sklearn.naive_bayes import MultinomialNB

In [None]:
mb=MultinomialNB()

In [None]:
mb.fit(x,y)

In [None]:
df["Message"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
x[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
mb.predict([x[0]])

array(['ham'], dtype='<U4')

In [None]:
# Predict all rows with a single batched call instead of one predict() call per
# row (the original loop also contained a no-op `i+1` statement).
# reshape(-1, 1) keeps every entry of `hs` a 1-element array, i.e. the same
# shape the per-row mb.predict([row]) call produced, so the following cells
# see identical values.
hs = list(mb.predict(x).reshape(-1, 1)) if len(x) else []

In [None]:
# Pair every original message with the prediction made for it:
# a list of (message, prediction) tuples in row order.
mhs = list(zip(df['Message'], hs))

In [None]:
mhs

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  array(['ham'], dtype='<U4')),
 ('Ok lar... Joking wif u oni...', array(['ham'], dtype='<U4')),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  array(['ham'], dtype='<U4')),
 ('U dun say so early hor... U c already then say...',
  array(['ham'], dtype='<U4')),
 ("Nah I don't think he goes to usf, he lives around here though",
  array(['ham'], dtype='<U4')),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
  array(['ham'], dtype='<U4')),
 ('Even my brother is not like to speak with me. They treat me like aids patent.',
  array(['ham'], dtype='<U4')),
 ("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune f

In [None]:
# Labels of interest; hx[0] is the spam label
hx = ["spam", "ham"]
# Keep only the (message, prediction) pairs whose prediction equals the spam label
spam = [entry for entry in mhs if entry[1] == hx[0]]

In [None]:
spam

[('Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!',
  array(['spam'], dtype='<U4')),
 ('Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!',
  array(['spam'], dtype='<U4')),
 ('HMV BONUS SPECIAL 500 pounds of genuine HMV vouchers to be won. Just answer 4 easy questions. Play Now! Send HMV to 86688 More info:www.100percent-real.com',
  array(['spam'], dtype='<U4')),
 ('December only! Had your mobile 11mths+? You are entitled to update to the latest colour camera mobile for Free! Call The Mobile Update Co FREE on 08002986906',
  array(['spam'], dtype='<U4')),
 ("Loan for any purpose £500 - £75,000. Homeowners + Tenants welcome. Have you been previously refused? We can still help. Call Free 0800 1956669 or text back 'help'",
  array(['spam'], dtype='<U4')),
 ('Congrats! 1 year spec

In [None]:
import pickle

# Persist the fitted CountVectorizer (`cv`), not the feature matrix `x`:
# new messages must be transformed with the same learned vocabulary before
# they can be passed to the saved model.
with open("vectorizer.pickle", "wb") as vec_file:
    pickle.dump(cv, vec_file)

In [None]:
import pickle

# Serialize the fitted MultinomialNB classifier to disk
with open("model.pickle", "wb") as model_out:
    pickle.dump(mb, model_out)


In [None]:
stop = set(stopwords.words('english'))