### بسم الله الرحمن الرحيم

In [60]:
import re
import nltk
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [3]:
# reade Data Set

data_frame = pd.read_csv('./Data_Set/data_spam.csv')

### Data Pre-Processing

#### get info of Data-Set

In [4]:
# display (rows, colmuns)

data_frame.shape

(5572, 38)

In [5]:
# show information about dataset

data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 38 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   v1           5572 non-null   object
 1   v2           5571 non-null   object
 2   Unnamed: 2   255 non-null    object
 3   Unnamed: 3   167 non-null    object
 4   Unnamed: 4   32 non-null     object
 5   Unnamed: 5   23 non-null     object
 6   Unnamed: 6   10 non-null     object
 7   Unnamed: 7   10 non-null     object
 8   Unnamed: 8   5 non-null      object
 9   Unnamed: 9   4 non-null      object
 10  Unnamed: 10  4 non-null      object
 11  Unnamed: 11  1 non-null      object
 12  Unnamed: 12  1 non-null      object
 13  Unnamed: 13  1 non-null      object
 14  Unnamed: 14  1 non-null      object
 15  Unnamed: 15  1 non-null      object
 16  Unnamed: 16  1 non-null      object
 17  Unnamed: 17  1 non-null      object
 18  Unnamed: 18  1 non-null      object
 19  Unnamed: 19  1 non-null    

In [6]:
# change columns name
data_frame.rename(columns={'v1': 'label', 'v2':'text'}, inplace=True)
data_frame = data_frame[['label', 'text']]

In [7]:
# show description information of data_frame['label']

data_frame['label'].describe()

count     5572
unique       2
top        ham
freq      4825
Name: label, dtype: object

In [8]:
# show description information of data_frame['text']

data_frame['text'].describe()

count                       5571
unique                      5163
top       Sorry, I'll call later
freq                          30
Name: text, dtype: object

In [9]:
data_frame.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# counting the number of missing values in the dataset

data_frame.isnull().sum()

label    0
text     1
dtype: int64

In [11]:
# counting the number of duplicated values in the data_frame

data_frame.duplicated().sum()

408

#### Data Cleaning

In [12]:
# replacing the null values with empty string

data_frame = data_frame.fillna('')

In [13]:
# remove duplicated values in the data_frame

data_frame = data_frame.drop_duplicates()

In [14]:
# replace "spam" with 0 & "ham" with 1

data_frame['label'].replace({'spam': 0, 'ham': 1}, inplace=True)

In [15]:
data_frame.head()

Unnamed: 0,label,text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# function take text then return the same text without punctuation.

def remove_punctuations(content):
    content = content.lower()
    return content.translate(str.maketrans("","",string.punctuation))

In [17]:
# apply above function on data_frame['text']
# recive new text in new column named unpunctuated_text

data_frame['unpunctuated_text'] = data_frame['text'].apply(remove_punctuations)

In [18]:
# showing new column [unpunctuated_text]

data_frame.head()

Unnamed: 0,label,text,unpunctuated_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [19]:
""""
 apply tokenization on data_frame['unpunctuated_text'] to split text into list of words then return this list
 recive list_of_words in new column named tokenized_words
"""


data_frame['tokenized_words'] = data_frame['unpunctuated_text'].apply(word_tokenize)

In [20]:
# showing new column [tokenized_words]

data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


#### remove stop words

In [21]:
# get stop words in english to remove from my text

stopwords_list = set(stopwords.words('english')) 

print(stopwords_list)

{'of', 'doing', 'd', 'for', 'be', 'll', 'its', 'about', 't', 'your', 'both', "doesn't", 'ours', 'just', "shan't", "mustn't", 'yourselves', 'how', 'there', 'all', 'itself', 'being', 'does', "aren't", 'these', 'each', 'against', 'can', 'am', 'so', 'same', "wouldn't", "shouldn't", 'doesn', 'myself', 'her', 'and', 'from', 'more', 'herself', 'have', 'over', 'their', 'off', 'me', "you'll", 'them', 'been', 'isn', 'our', 'if', 'now', 'that', 'is', 'a', 'hasn', 'when', 'had', 'did', 's', "don't", 'other', 'above', 'under', 'while', 'during', 'up', 'because', 'theirs', 'this', 'few', 'o', "weren't", 'haven', 'but', 'which', "needn't", 'themselves', "it's", 'any', 'will', "hasn't", 'than', 'where', 'his', 'aren', 'once', 'out', 'hadn', 'they', 'were', 'or', 'y', 'he', 'no', 'shouldn', 'yourself', 'those', 'an', "couldn't", 'was', 'are', 'until', 'then', 'into', 'should', "you're", 'm', "won't", 'nor', 'very', 'down', 'with', 'yours', 'in', "haven't", 'again', 'after', "should've", 'why', 'has', "

In [22]:
def remove_stopwords(content):
    content = [word for word in content if not word in stopwords_list]
    return content

In [23]:
# apply remove_stopwords function to remove from tokenized_words _list

data_frame['tokenized_words'] = data_frame['tokenized_words'].apply(remove_stopwords)

 #### stemming

In [24]:
port_stem = PorterStemmer()

In [25]:
"""
 function for stemming take tokenizd_words_list then make stem for very word in tokenizd_words_list
 return Text (not List) after stemming
"""

def stemming(content):
    stemmed_content = [port_stem.stem(word) for word in content]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [26]:
# Apply stemming function on tokenized words

data_frame['stemmed_text'] = data_frame['tokenized_words'].apply(stemming)

In [27]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words,stemmed_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...",go jurong point crazi avail bugi n great world...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",ok lar joke wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entri 2 wkli comp win fa cup final tkt 21...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]",u dun say earli hor u c alreadi say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...",nah dont think goe usf live around though


#### lemmatization

In [28]:
lemmatizer = WordNetLemmatizer()

In [29]:
"""
 function for lemetize text take tokenizd_words_list then make stem for very word in tokenizd_words_list
 return Text (not List) after lemetizing
"""

def lemetize(content):
    lemetized_content = [lemmatizer.lemmatize(word) for word in content]
    lemetized_content = ' '.join(lemetized_content)
    return lemetized_content

In [30]:
# excute lemtization function on data

data_frame['lemetized_text'] = data_frame['tokenized_words'].apply(lemetize)

In [31]:
data_frame.head()

Unnamed: 0,label,text,unpunctuated_text,tokenized_words,stemmed_text,lemetized_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...",go jurong point crazi avail bugi n great world...,go jurong point crazy available bugis n great ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",ok lar joke wif u oni,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entri 2 wkli comp win fa cup final tkt 21...,free entry 2 wkly comp win fa cup final tkts 2...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]",u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...",nah dont think goe usf live around though,nah dont think go usf life around though


#### divide data into X, Y

In [32]:
# put text in X

X = data_frame['lemetized_text'].values # input data

# put label value to Y

Y = data_frame['label'].values # result

#### Bag Of Words

In [33]:
bow = CountVectorizer(stop_words='english')

bow.fit(X)

In [34]:
# show Extraction Feature 

print(bow.get_feature_names_out())

['008704050406' '0089my' '0121' ... 'zoom' 'zouk' 'zyada']


In [35]:
# transform X to numerical data

X = bow.transform(X)

In [36]:
# show X after apply Bag of Words

print(X)

  (0, 1125)	1
  (0, 1331)	1
  (0, 1746)	1
  (0, 1748)	1
  (0, 2050)	1
  (0, 2308)	1
  (0, 3498)	1
  (0, 3535)	1
  (0, 4250)	1
  (0, 4392)	1
  (0, 5780)	1
  (0, 8016)	1
  (0, 8222)	1
  (1, 4220)	1
  (1, 4424)	1
  (1, 5394)	1
  (1, 5421)	1
  (1, 8129)	1
  (2, 71)	1
  (2, 436)	1
  (2, 449)	1
  (2, 850)	1
  (2, 1199)	1
  (2, 2157)	1
  (2, 2359)	1
  :	:
  (5159, 5296)	1
  (5159, 5841)	1
  (5159, 5933)	1
  (5159, 7481)	1
  (5159, 7634)	1
  (5160, 2924)	1
  (5160, 3249)	1
  (5160, 3463)	1
  (5160, 3804)	1
  (5161, 5015)	1
  (5161, 5710)	1
  (5161, 6825)	1
  (5161, 7155)	1
  (5162, 967)	1
  (5162, 1553)	1
  (5162, 1780)	1
  (5162, 3260)	1
  (5162, 3367)	1
  (5162, 3588)	1
  (5162, 3939)	1
  (5162, 4063)	1
  (5162, 4521)	1
  (5162, 8060)	1
  (5163, 6336)	1
  (5163, 7645)	1


In [37]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Models

In [38]:
# spliting the dataset to (80%) training data & (20%) test data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

#### Logistic Regression

In [39]:
model = LogisticRegression()

In [40]:
model.fit(X_train, Y_train)

In [41]:

# accuracy score on the training data
X_train_prediction = model.predict(X_train)
precision = accuracy_score(X_train_prediction, Y_train)
recall = recall_score(X_train_prediction, Y_train)
f1score = f1_score(X_train_prediction, Y_train)


In [42]:
print('Accuracy score of the training data : ', precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Accuracy score of the training data :  0.9937061244250787
Recall :  0.992847317744154
f1socre :  0.996410822749862


In [43]:

# accuracy score on the test data
X_test_prediction = model.predict(X_test)
precision = accuracy_score(X_test_prediction, Y_test)
recall = recall_score(X_test_prediction, Y_test)
f1score = f1_score(X_test_prediction, Y_test)


In [44]:
print('Accuracy score of the test data : ', precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Accuracy score of the test data :  0.9757986447241046
Recall :  0.9750812567713976
f1socre :  0.9863013698630138


#### SVM

In [45]:
from sklearn import svm

In [46]:
svm = svm.SVC()

In [47]:
svm.fit(X_train, Y_train)

In [48]:
# accuracy score on the training data
X_train_prediction = svm.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [49]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9946744129750665


In [50]:
# accuracy score on the test data
X_test_prediction = svm.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)


In [51]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9661181026137464


In [52]:
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1score = f1_score(Y_test, X_test_prediction)
print("Precision : ",precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Precision :  0.9636363636363636
Recall :  0.9988913525498891
f1socre :  0.9809471965160589


#### Decision Tree

In [53]:
clf = DecisionTreeClassifier()

In [54]:
clf.fit(X_train, Y_train)

In [55]:
# accuracy score on the training data
X_train_prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [56]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [57]:
# accuracy score on the test data
X_test_prediction = clf.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)


In [58]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.957405614714424


In [59]:
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1score = f1_score(Y_test, X_test_prediction)
print("Precision : ",precision)
print("Recall : ",recall)
print("f1socre : ",f1score)

Precision :  0.9693654266958425
Recall :  0.9822616407982262
f1socre :  0.9757709251101322
