#### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhumu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bhumu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### Load the dataset

In [2]:
dataset = pd.read_csv(r"C:\Users\bhumu\Downloads\SPAM text message 20170820 - Data.csv")

In [3]:
dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
dataset.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
dataset.dtypes

Category    object
Message     object
dtype: object

In [6]:
dataset.shape

(5572, 2)

In [7]:
mess = dataset["Message"][0]

In [8]:
mess

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

#### Removing unwanted punctuations

In [9]:
import re
re_punt = "[^A-Za-z\s]"
mess = re.sub(re_punt, "",mess)
mess

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

#### Normalising the case

In [10]:
mess = mess.lower()
mess

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

#### Tokenization

In [11]:
import spacy

In [12]:
nlp = spacy.load("en_core_web_sm")
mess = nlp(mess)
msg = []
for token in mess:
    msg.append(token.text)

msg

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

#### Stopwords Removal

In [13]:
import nltk

In [14]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhumu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords

In [16]:
sw_list = stopwords.words("english")

In [17]:
sw_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
msg = [word for word in msg if word not in sw_list]

In [19]:
msg

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [20]:
lemma = WordNetLemmatizer()

In [21]:
# msgs = []
# for word in msg:
#     msgs.append(lemma.lemmatize(word, pos='v'))

# msgs

#### Entire text preprocessing using defining a function

In [22]:
def text_preprocessing(message):
    re_punt = "[^A-Za-z\s]"
    message = re.sub(re_punt, "",message) #Removal of unwanted punctuations
    message = message.lower() #Normalising the case
    message = nlp(message) 
    
    msg = [] # Tokenization
    for token in message:
        msg.append(token.text)
        
    sw_list = stopwords.words("english") #Removal of stop words
    msg = [word for word in msg if word not in sw_list]
    
    
    msgs = ' ' #Lemmatization
    for word in msg:
        msgs += ' ' + lemma.lemmatize(word, pos='v')
    
    return msgs



In [23]:
dataset["Message1"] = dataset["Message"].apply(text_preprocessing)

In [24]:
dataset

Unnamed: 0,Category,Message,Message1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n grea...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think go usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time try contact u u pound prize cl...
5568,ham,Will ü b going to esplanade fr home?,b go esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy something el...


#### Vectorization

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
cv = CountVectorizer()
tfv = TfidfVectorizer()
X = tfv.fit_transform(dataset['Message1']).toarray()

In [27]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
X = pd.DataFrame(X, columns = tfv.get_feature_names_out())

In [29]:
X

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,zero,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Encoding of target variable

In [58]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [55]:
ohe = OneHotEncoder()

In [56]:
enc_data = ohe.fit_transform(np.array(dataset['Category']).reshape(len(dataset['Category']), 1)).toarray()

In [57]:
enc_data.shape

(5572, 2)

In [64]:
ct = ColumnTransformer(transformers = [("ohe", ohe, ['Category'])], remainder = "passthrough") 

In [67]:
new_df = ct.fit_transform(dataset)
# new_df1 = pd.DataFrame(new_df, columns = ct.get_feature_names_out())
# new_df1

In [69]:
new_df1 = pd.DataFrame(new_df, columns = ct.get_feature_names_out())
new_df1

Unnamed: 0,ohe__Category_ham,ohe__Category_spam,remainder__Message,remainder__Message1
0,1.0,0.0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n grea...
1,1.0,0.0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,0.0,1.0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts...
3,1.0,0.0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,1.0,0.0,"Nah I don't think he goes to usf, he lives aro...",nah nt think go usf live around though
...,...,...,...,...
5567,0.0,1.0,This is the 2nd time we have tried 2 contact u...,nd time try contact u u pound prize cl...
5568,1.0,0.0,Will ü b going to esplanade fr home?,b go esplanade fr home
5569,1.0,0.0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions
5570,1.0,0.0,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy something el...


In [70]:
y = new_df1["ohe__Category_ham"]

In [83]:
y

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
5567    0.0
5568    1.0
5569    1.0
5570    1.0
5571    1.0
Name: ohe__Category_ham, Length: 5572, dtype: object

In [85]:
y = y.astype(int)

In [86]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: ohe__Category_ham, Length: 5572, dtype: int32

In [87]:
np.array(y)

array([1, 1, 0, ..., 1, 1, 1])

#### Splitting the data into train-set and test-set

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, np.array(y), test_size = 0.2, random_state = 10)

#### Building the model

In [89]:
from sklearn.naive_bayes import GaussianNB

In [90]:
nb = GaussianNB()

In [91]:
nb.fit(X_train, y_train)

GaussianNB()

#### Prediction

In [92]:
y_pred = nb.predict(X_test)

In [95]:
y_pred

array([1, 1, 1, ..., 1, 0, 1])

#### Evaluation

In [93]:
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_pred)

test_accuracy

0.8484304932735426

#### Classification Report

In [97]:
from sklearn.metrics import classification_report
cls_names = ['spam', 'ham']
print(classification_report(y_test, y_pred, target_names = cls_names))

              precision    recall  f1-score   support

        spam       0.45      0.82      0.58       143
         ham       0.97      0.85      0.91       972

    accuracy                           0.85      1115
   macro avg       0.71      0.84      0.74      1115
weighted avg       0.90      0.85      0.87      1115



#  Balanced data classification

#### Preparation of balanced datasets

In [208]:
spam_indices = y[y == 0].index.tolist()
ham_indices = y[y == 1].index.tolist()

In [218]:
# separating into two datasets, Spam or ham
df_spam = X.iloc[spam_indices]
df_ham = X.iloc[ham_indices]

In [219]:
# dividing total spam data into 80:20 ratio i.e., 597:150 because it has less count and ham data into 3860:965
# Since we have more ham samples let's create 5 combined datasets (597 + 772) where ham set is divided into 5-sets and spam is present in 
# every set. 

#'''''''''''''' test_set ''''''''''''''#
test_data = pd.concat([df_spam[:150], df_ham[:965]])


#'''''''''''''' train sets preparation ''''''''''''''''''#
train_data1 = pd.concat([df_spam[150:], df_ham[965:965+772]])

train_data2 = pd.concat([df_spam[150:], df_ham[965+772:965+772*2]])

train_data3 = pd.concat([df_spam[150:], df_ham[965+772*2:965+772*3]])

train_data4 = pd.concat([df_spam[150:], df_ham[965+772*3:965+772*4]])

train_data5 = pd.concat([df_spam[150:], df_ham[965+772*4:965+772*5]])

In [225]:
len(df_spam[150:]), len(df_ham[965:965+772])

(597, 772)

In [227]:
spam_labels = np.zeros(len(df_spam[150:]), dtype = int)
ham_labels = np.ones(len(df_ham[965:965+772]), dtype = int)

train_labels = np.hstack((spam_labels, ham_labels))

In [236]:
spam_test = np.zeros(150, dtype = int)
ham_test = np.ones(965, dtype = int)

test_labels = np.hstack((spam_test, ham_test))

#### Building Models

In [235]:
nb1 = GaussianNB()
nb1.fit(train_data1, train_labels)

nb2 = GaussianNB()
nb2.fit(train_data2, train_labels)

nb3 = GaussianNB()
nb3.fit(train_data3, train_labels)

nb4 = GaussianNB()
nb4.fit(train_data4, train_labels)

nb5 = GaussianNB()
nb5.fit(train_data5, train_labels)

GaussianNB()

#### Prediction and Evaluation

In [240]:
y_pred1 = nb1.predict(test_data)
print(accuracy_score(test_labels, y_pred1))

y_pred2 = nb2.predict(test_data)
print(accuracy_score(test_labels, y_pred2))

y_pred3 = nb3.predict(test_data)
print(accuracy_score(test_labels, y_pred3))

y_pred4 = nb4.predict(test_data)
print(accuracy_score(test_labels, y_pred4))

y_pred5 = nb5.predict(test_data)
print(accuracy_score(test_labels, y_pred5))

0.7847533632286996
0.8062780269058296
0.7820627802690583
0.8026905829596412
0.7811659192825112


#### Ensemble predictions through Majority Voting

In [243]:
pred_sum = y_pred1 + y_pred2 + y_pred3 + y_pred4 + y_pred5

pred_maj = [0 if i<=2 else 1 for i in pred_sum]

In [246]:
accuracy_score(test_labels, pred_maj)

0.8017937219730942

In [247]:
# Ensemble approach classification report
cls_names = ['spam', 'ham']
print(classification_report(test_labels, pred_maj, target_names = cls_names))

              precision    recall  f1-score   support

        spam       0.40      0.94      0.56       150
         ham       0.99      0.78      0.87       965

    accuracy                           0.80      1115
   macro avg       0.69      0.86      0.72      1115
weighted avg       0.91      0.80      0.83      1115



In [248]:
# Naive Bayes classification report
print(classification_report(y_test, y_pred, target_names = cls_names))

              precision    recall  f1-score   support

        spam       0.45      0.82      0.58       143
         ham       0.97      0.85      0.91       972

    accuracy                           0.85      1115
   macro avg       0.71      0.84      0.74      1115
weighted avg       0.90      0.85      0.87      1115



# <font size = 2.8> Observation: We could see significant improvement in recall of spam classes through balanced data classification.