### بسم الله الرحمن الرحيم

In [302]:
import re
import nltk
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


### Data Pre-Processing

In [303]:
# nltk.download('stopwords')

In [304]:
# nltk.download('punkt')

In [305]:
# nltk.download('wordnet')

In [306]:
# Build the English stop-word collection as a set (used for membership tests
# in the stemming step) and print it for inspection.
stopwords_list = set(stopwords.words('english'))
print(stopwords_list)

{"isn't", 'than', 'should', 'him', 'ourselves', 'all', 'how', 'hers', 'if', 'were', 'that', 'am', 'doing', 'down', 'our', 's', 'didn', 'from', 'myself', 'very', 'my', 'once', 'again', 'as', "weren't", 'who', 'itself', 'it', 'a', "hadn't", 'yourself', 'are', "haven't", 'its', 'same', 'through', 'mightn', 'ma', "that'll", 't', 'had', 'couldn', 'between', 'few', 'about', 'while', 'too', 'you', 'have', 'not', 'can', 'which', 'y', 'was', 'these', 'above', 'up', 'me', 'nor', 'theirs', 'i', 'hasn', 'herself', 'with', 'the', 'her', 'don', 'm', "mightn't", 'weren', 'further', "doesn't", 'off', "should've", 'themselves', 'of', 'such', 'what', "needn't", 'their', 'will', "aren't", "don't", 've', 'this', 'needn', 'after', 'his', 'those', 'he', 'more', 'now', 're', 'to', 'most', "you'd", 'into', 'has', "couldn't", 'then', 'shan', 'ours', 'there', "you're", 'but', 'when', 'own', 'here', 'so', 'they', 'won', 'wasn', "didn't", 'on', 'during', 'before', 'ain', 'himself', "you'll", 'she', "it's", 'them'

In [307]:
# Load the raw dataset from the local CSV file.
# NOTE(review): no encoding is passed to read_csv — confirm the default
# decodes this file correctly.
data_frame = pd.read_csv('./Data_Set/data_spam.csv')
# pd.set_option('display.max_rows', data_frame.shape[0]+1)

In [308]:
# Rename the raw 'v1'/'v2' columns to 'label'/'text' and keep only those two.
data_frame = data_frame.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]

In [309]:
# show information about the dataset (columns, non-null counts, dtypes)
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5571 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [310]:
# display summary statistics of the DataFrame
data_frame.describe()

Unnamed: 0,label,text
count,5572,5571
unique,2,5163
top,ham,"Sorry, I'll call later"
freq,4825,30


In [311]:
# display summary statistics of the text column only
data_frame['text'].describe()

count                       5571
unique                      5163
top       Sorry, I'll call later
freq                          30
Name: text, dtype: object

In [312]:
# display (rows, columns)
data_frame.shape

(5572, 2)

In [313]:
# show the first 5 rows
data_frame.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [314]:
# count the missing values in each column of the dataset
data_frame.isnull().sum()

label    0
text     1
dtype: int64

In [315]:
# replace the null values with an empty string so the later string-based
# text-processing steps can be applied to every row
data_frame = data_frame.fillna('')

In [316]:
# count the fully duplicated rows in data_frame
data_frame.duplicated().sum()

408

In [317]:
# remove the duplicated rows from data_frame (the row index is not reset)
data_frame = data_frame.drop_duplicates()

In [318]:
# Encode the labels numerically: "spam" -> 0, "ham" -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data_frame['label']: the in-place form acts on
# an intermediate Series and is not guaranteed to update data_frame (it has no
# effect under pandas Copy-on-Write).
# NOTE: with map(), any label other than "spam"/"ham" becomes NaN.
data_frame['label'] = data_frame['label'].map({'spam': 0, 'ham': 1})

In [319]:
# check that the labels are now numeric
data_frame.head()

Unnamed: 0,label,text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [320]:
# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call of removing_punctuations().
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def removing_punctuations(content: str) -> str:
    """Return *content* lower-cased with all ASCII punctuation removed.

    Punctuation characters are deleted, not replaced by a space, so words
    separated only by punctuation are merged (e.g. "so.any" -> "soany").
    """
    return content.lower().translate(_PUNCTUATION_TABLE)

In [321]:
# add a lower-cased, punctuation-free copy of every message
data_frame['unpunctuated_text'] = data_frame['text'].apply(removing_punctuations)

In [322]:
# preview the new unpunctuated_text column
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [323]:
# shared stemmer / lemmatizer instances used by stemming() and lemetize()
port_stem = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [324]:
def stemming(content):
    """Lower-case *content*, drop stop words and Porter-stem what remains.

    Words are taken by splitting on whitespace; every word found in
    stopwords_list is skipped, the others are stemmed with port_stem and the
    stems are joined back together with single spaces.
    """
    kept_stems = []
    for token in content.lower().split():
        if token in stopwords_list:
            continue
        kept_stems.append(port_stem.stem(token))
    return ' '.join(kept_stems)

In [325]:
# add a stop-word-free, stemmed version of every message
# NOTE(review): the model features below are taken from 'lemetized_text';
# this column does not appear to be used for training — confirm it is needed.
data_frame['stemmed_text'] = data_frame['unpunctuated_text'].apply(stemming)

In [326]:
# preview the new stemmed_text column
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,stemmed_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go jurong point crazi avail bugi n great world...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say earli hor u c alreadi say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,nah dont think goe usf live around though


In [327]:
# lemmatization function
def lemetize(content):
    """Lemmatize each whitespace-separated word of *content*.

    Every word is passed to lemmatizer.lemmatize() without a part-of-speech
    argument, and the results are joined back together with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in content.split())

In [328]:
# execute the lemmatization function on the punctuation-free text
data_frame['lemetized_text'] = data_frame['unpunctuated_text'].apply(lemetize)

In [329]:
# preview the new lemetized_text column
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,stemmed_text,lemetized_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go jurong point crazi avail bugi n great world...,go until jurong point crazy available only in ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,free entry in 2 a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say earli hor u c alreadi say,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,nah dont think goe usf live around though,nah i dont think he go to usf he life around h...


In [330]:
# apply the tokenization function to the lemmatized text
data_frame['tokenized_words'] = data_frame['lemetized_text'].apply(word_tokenize)

In [331]:
# show the first 10 rows, including the new tokenized_words column
data_frame.head(10)

Unnamed: 0,label,text,unpunctuated_text,stemmed_text,lemetized_text,tokenized_words
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go jurong point crazi avail bugi n great world...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say earli hor u c alreadi say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,nah dont think goe usf live around though,nah i dont think he go to usf he life around h...,"[nah, i, dont, think, he, go, to, usf, he, lif..."
5,0,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling its been 3 weeks now...,freemsg hey darl 3 week word back id like fun ...,freemsg hey there darling it been 3 week now a...,"[freemsg, hey, there, darling, it, been, 3, we..."
6,1,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,even brother like speak treat like aid patent,even my brother is not like to speak with me t...,"[even, my, brother, is, not, like, to, speak, ..."
7,1,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnaminun...,per request mell mell oru minnaminungint nurun...,a per your request melle melle oru minnaminung...,"[a, per, your, request, melle, melle, oru, min..."
8,0,WINNER!! As a valued network customer you have...,winner as a valued network customer you have b...,winner valu network custom select receivea �90...,winner a a valued network customer you have be...,"[winner, a, a, valued, network, customer, you,..."
9,0,Had your mobile 11 months or more? U R entitle...,had your mobile 11 months or more u r entitled...,mobil 11 month u r entitl updat latest colour ...,had your mobile 11 month or more u r entitled ...,"[had, your, mobile, 11, month, or, more, u, r,..."


In [332]:
# separate the model inputs (lemmatized message text) from the targets
# (numeric label column) as plain arrays
X = data_frame['lemetized_text'].values # input data
Y = data_frame['label'].values # result

In [333]:
# inspect the text inputs before vectorization
print(X)

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
 'ok lar joking wif u oni'
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'
 ... 'pity wa in mood for that soany other suggestion'
 'the guy did some bitching but i acted like id be interested in buying something else next week and he gave it to u for free'
 'rofl it true to it name']


In [334]:
# convert the textual data to numerical TF-IDF features
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so its vocabulary and IDF weights also reflect the
# test messages — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [335]:
# inspect the vectorized features
print(X)

  (0, 8491)	0.2309218750279521
  (0, 8263)	0.18687785983494606
  (0, 8021)	0.2309218750279521
  (0, 7632)	0.15791531910805345
  (0, 5964)	0.22351212751204189
  (0, 5597)	0.160260545944459
  (0, 4527)	0.27583716379007334
  (0, 4383)	0.3270259593188449
  (0, 4115)	0.10983626863124553
  (0, 3642)	0.18408220757620894
  (0, 3604)	0.15362936454639892
  (0, 3552)	0.14535610284318143
  (0, 2366)	0.25262584515728814
  (0, 2104)	0.27583716379007334
  (0, 1796)	0.27583716379007334
  (0, 1794)	0.3120541963981378
  (0, 1363)	0.25262584515728814
  (0, 1143)	0.3270259593188449
  (1, 8393)	0.430072403170494
  (1, 5591)	0.5460452394462293
  (1, 5561)	0.2795386833865282
  (1, 4559)	0.4065704703661188
  (1, 4353)	0.5229062072934929
  (2, 8446)	0.18893457134886524
  (2, 8407)	0.14402125725756823
  :	:
  (5162, 7041)	0.1854374037896603
  (5162, 5352)	0.20965061537346655
  (5162, 4658)	0.1602873349093203
  (5162, 4247)	0.11554552633627473
  (5162, 4191)	0.2862026300577176
  (5162, 4115)	0.11396371876094434


### Model

In [336]:
# splitting the dataset into (80%) training data & (20%) test data;
# stratify=Y keeps the label proportions the same in both splits and
# random_state=2 makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

#### Logistic Regression

In [337]:
# logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [338]:
# train the classifier on the training split
model.fit(X_train, Y_train)

In [339]:

# accuracy score on the training data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first,
# the model's predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)


In [340]:
# report the accuracy measured on the training split
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9670781893004116


In [341]:

# accuracy score on the test data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first,
# the model's predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)


In [342]:
# report the accuracy measured on the held-out test split
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9603097773475314


#### SVM

#### Decision Tree

#### Naive Bayes