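# **Case Study 5: NLP Classifier (Email Spam)**

This notebook loads a labelled ham/spam message dataset (`data/spam.csv`), cleans and vectorises the text with spaCy, and trains gensim Word2Vec embeddings on the training split.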

### **Installing required libraries**

In [1]:
!pip install gensim



In [2]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=9023c7bc5146dc5deb09ee5eebbb5055b61db7ea59fc47d4fb4c5bf629d5d4bf
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\04\5f\3e\46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)


2022-10-18 02:08:57.494556: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-10-18 02:08:57.494759: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-18 02:09:04.838577: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-10-18 02:09:04.839443: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublas64_11.dll'; dlerror: cublas64_11.dll not found
2022-10-18 02:09:04.840274: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublasLt64_11.dll'; dlerror: cublasLt64_11.dll not found
2022-10-18 02:09:04.843077: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cu

Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


## **Importing Libraries**

In [4]:
import numpy as np
import pandas as pd

In [5]:
import spacy

In [6]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from nltk import sent_tokenize

## **Dataset**

In [8]:
# Load the labelled messages, decoding the file explicitly as ISO-8859-1.
# NOTE(review): the "å£100" visible in the X_train preview further down suggests
# some characters are still mis-decoded — confirm the file's real encoding.
df = pd.read_csv('data/spam.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [10]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [11]:
# Drop the three mostly-empty overflow columns (50 / 12 / 6 non-null values per
# df.info()). Rebinding `df` with errors='ignore' instead of mutating in place
# keeps this cell re-runnable: a second run is a no-op rather than a KeyError.
# NOTE(review): the non-null values look like message text that spilled past the
# second column — confirm that discarding them is acceptable.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [12]:
# Give the two remaining columns descriptive names. `rename` returns a new
# frame, so no previously displayed object is mutated and the cell can be
# re-run safely (labels that are already renamed are simply ignored).
df = df.rename(columns={'v1': 'category', 'v2': 'text'})

In [13]:
df.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## **Preprocessing**

In [14]:
# Load the large English spaCy pipeline downloaded in the setup cells.
# `nlp` is the shared pipeline object used by the preprocessing cells below.
nlp = spacy.load('en_core_web_lg')

In [15]:
# Peek at the first raw message (row label 0, 'text' column).
df.loc[0, 'text']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [16]:
# Run the same message through the spaCy pipeline; the resulting Doc is
# displayed as its text.
nlp(df.loc[0, 'text'])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

In [17]:
def preprocess(text):
    """Return the lemmas of the informative tokens in ``text`` as one string.

    The text is run through the notebook-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace tokens are discarded, as are tokens for
    which spaCy reports no word vector. The lemmas of the remaining tokens are
    joined with single spaces, in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if tok.has_vector and not (tok.is_stop or tok.is_punct or tok.is_space)
    ]
    return " ".join(kept_lemmas)

In [18]:
# Clean every message with `preprocess` and keep the result alongside the raw text.
df['spacy_filtered'] = df['text'].map(preprocess)

In [19]:
df.head()

Unnamed: 0,category,text,spacy_filtered
0,ham,"Go until jurong point, crazy.. Available only ...",point crazy available n great world la e buffe...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 comp win FA Cup final 21st 2005 t...
3,ham,U dun say so early hor... U c already then say...,U dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go live


In [20]:
# Represent each cleaned message by its spaCy document vector
# (300 values per message, as the shape check below shows).
df['spacy_vector'] = df['spacy_filtered'].map(lambda cleaned: nlp(cleaned).vector)

In [21]:
df['spacy_vector'][0].shape

(300,)

## **Label Encoding**

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Encode the string labels as integers. The class counts shown below
# (4825 / 747) match ham -> 0 and spam -> 1.
le = LabelEncoder().fit(df['category'])
y = le.transform(df['category'])

In [24]:
# Use the raw message text as the model input.
# NOTE(review): the spacy_filtered / spacy_vector columns built above are not
# used from here on in the visible cells — confirm that is intended.
X = df['text']

In [25]:
np.unique(y, return_counts=True)

(array([0, 1]), array([4825,  747], dtype=int64))

## **Splitting Dataset**

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the ham/spam
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [28]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [29]:
X_train

184                            Going on nothing great.bye
2171                        I wont. So wat's wit the guys
5422              Ok k..sry i knw 2 siva..tats y i askd..
4113    Where are you ? What do you do ? How can you s...
4588         Have you not finished work yet or something?
                              ...                        
1932                            Jus finished avatar nigro
5316                         Jus finish watching tv... U?
2308    Moby Pub Quiz.Win a å£100 High Street prize if...
1903    Free entry in 2 a weekly comp for a chance to ...
763     Nothing but we jus tot u would ask cos u ba gu...
Name: text, Length: 4457, dtype: object

In [30]:
# Look up the message whose row label is 0. This is a label lookup, not "the
# first training row": the split shuffled the index, and the lookup only
# succeeds because label 0 happens to be in X_train.
X_train.loc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [31]:
# Tokenise that same message (row label 0) with gensim. In the output below the
# tokens are lower-cased and punctuation and single-character tokens are gone.
simple_preprocess(X_train.loc[0])

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [32]:
# Tokenise every training message; each entry becomes a list of tokens that
# serves as one "sentence" for Word2Vec below.
text_data = X_train.map(simple_preprocess)
text_data

184                      [going, on, nothing, great, bye]
2171                      [wont, so, wat, wit, the, guys]
5422                     [ok, sry, knw, siva, tats, askd]
4113    [where, are, you, what, do, you, do, how, can,...
4588    [have, you, not, finished, work, yet, or, some...
                              ...                        
1932                       [jus, finished, avatar, nigro]
5316                          [jus, finish, watching, tv]
2308    [moby, pub, quiz, win, high, street, prize, if...
1903    [free, entry, in, weekly, comp, for, chance, t...
763     [nothing, but, we, jus, tot, would, ask, cos, ...
Name: text, Length: 4457, dtype: object

## **Word2Vec Model**

In [33]:
# Create an untrained Word2Vec model: context window of 10 tokens, words with a
# total frequency below 2 are ignored, 4 worker threads. All other
# hyperparameters are left at gensim's defaults.
# NOTE(review): multi-threaded training is generally not exactly repeatable —
# consider workers=1 if the similarity results below need to be reproducible.
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [34]:
# Build the vocabulary from the tokenised training messages only;
# progress_per controls how often progress is logged during the scan.
model.build_vocab(text_data, progress_per=1000)

In [35]:
model.epochs

5

In [36]:
# Train on the same corpus that was used for the vocabulary. The example count
# and number of epochs are read back from the model (epochs is 5, as displayed
# above) rather than hard-coded.
model.train(text_data, total_examples=model.corpus_count, epochs=model.epochs)

(237203, 312390)

In [37]:
# Sanity-check the embeddings by listing the nearest neighbours of "good".
# NOTE(review): in the recorded output every neighbour scores ≈0.9997 and most
# are function words, which suggests the vectors have barely separated —
# possibly too little training data or too few epochs; confirm before relying
# on these embeddings.
model.wv.most_similar("good")

[('all', 0.9997727870941162),
 ('amp', 0.9997497200965881),
 ('was', 0.9997419118881226),
 ('up', 0.9997297525405884),
 ('and', 0.9997207522392273),
 ('of', 0.9997079372406006),
 ('here', 0.9997063279151917),
 ('is', 0.9997038841247559),
 ('nice', 0.9996875524520874),
 ('got', 0.999681293964386)]