Case study NLP classifier

Email Classification

In [7]:
## Import the dependencies

In [8]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec as wtv
from gensim.models import KeyedVectors

Read the data

In [9]:
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [10]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [11]:
data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [12]:
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [13]:
data.shape

(5572, 5)

In [14]:
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

****Data preprocessing****

In [15]:
## since more than 80% of the datas are missing in the last 3 columns we can simply drop them

In [16]:
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
data.rename({'v1':'Class', 'v2':'Text'}, axis=1, inplace=True)

In [18]:
data

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


NLTK Preprocessing

In [19]:
import nltk
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [20]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [21]:
def strip_html(text): #html strips removal
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text): #square brackets removal
    return re.sub('\[[^]]*\]', '', text)


def to_lower(text): #converting to lower 
    return text.lower()

In [22]:
#Removing Special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    return(text)

Text tokenization

In [23]:
def simple_tokenize(text):
    return nltk.word_tokenize(text)

Text lematization

In [24]:
def simple_lemmatizer(token_list):
    wlemma = WordNetLemmatizer()
    return [wlemma.lemmatize(token) for token in token_list]

In [25]:
def remove_punct(token_list):# Remove Punctuation
    return [token for token in token_list if token not in string.punctuation]

Stop Words Removal

In [26]:
stop_words = stopwords.words('english')
def remove_stopwords(token_list):
    return [token for token in token_list if token not in stop_words]

In [27]:
# NLTK Preprocessor
def nltk_preprocess(text):
    text = to_lower(text)
    text = remove_special_characters(text)
    text = simple_tokenize(text)
    text = remove_punct(text)
    text = remove_stopwords(text)
    text = simple_lemmatizer(text)
    return text

In [28]:
data['nltktext'] = data['Text'].apply(nltk_preprocess)

In [29]:
data['nltktext'][0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [30]:
tokens = pd.Series(data.nltktext.values)

# 1. CBOW Model

Training

In [31]:
cbow_model = wtv(tokens, size=300, window=9, min_count=2, sg=0)

In [32]:
 # extract vectrs from words
def get_embedding_cbow(doc_tokens):
    embeddings = []
    model = cbow_model
        
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
    return np.mean(embeddings, axis=0)

In [33]:
data['cbow_vectors'] = data['Text'].apply(lambda x: get_embedding_cbow(x))

  out=out, **kwargs)


In [34]:
data.isna().sum()

Class            0
Text             0
nltktext         0
cbow_vectors    54
dtype: int64

In [35]:
data.head()

Unnamed: 0,Class,Text,nltktext,cbow_vectors
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...","[0.10343209, -0.16199186, 0.24350882, -0.05605..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[0.09912243, -0.15490437, 0.23387703, -0.05369..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[0.09431396, -0.14916258, 0.22399727, -0.05155..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[0.13475178, -0.21078396, 0.31635448, -0.07276..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]","[0.102684975, -0.16070558, 0.24136566, -0.0554..."


In [36]:
data = data.dropna().reset_index(drop=True)
data.head()

Unnamed: 0,Class,Text,nltktext,cbow_vectors
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...","[0.10343209, -0.16199186, 0.24350882, -0.05605..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[0.09912243, -0.15490437, 0.23387703, -0.05369..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[0.09431396, -0.14916258, 0.22399727, -0.05155..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[0.13475178, -0.21078396, 0.31635448, -0.07276..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]","[0.102684975, -0.16070558, 0.24136566, -0.0554..."


In [37]:
# create X from w2vec
X_cbow = pd.DataFrame(data['cbow_vectors'].values.tolist())
X_cbow.shape

(5518, 300)

## Encoding

In [38]:
#label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(data.Class)

In [39]:
y

array([0, 0, 1, ..., 0, 0, 0])

## Train-Test split

In [40]:
from sklearn.model_selection import train_test_split
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=24)

## model

In [41]:
from sklearn.svm import SVC
model1 = SVC()
model1 = model1.fit(X_train_cb, y_train_cb)


In [42]:
pred1 = model1.predict(X_test_cb)

In [43]:
from sklearn.metrics import accuracy_score
a1 = accuracy_score(y_test_cb, pred1)
print("Accuracy:", a1*100, "%")


Accuracy: 87.5 %


## Skipgram Model

In [44]:
skgram_model = wtv(tokens, size=300, window=9, min_count=2, sg=1)

In [45]:
def get_embedding_sg(doc_tokens):
    embeddings = []
    model = skgram_model
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
   
    return np.mean(embeddings, axis=0)

In [46]:
data['sgram_vectors'] = data['Text'].apply(lambda x: get_embedding_sg(x))

In [47]:
# create X from w2vec
X_skg = pd.DataFrame(data['sgram_vectors'].values.tolist())
X_skg.shape

(5518, 300)

In [48]:
 # train and test split
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skg, y, test_size=0.2, random_state=42) 

In [49]:
# model
model2 = SVC()
model2 = model2.fit(X_train_sg, y_train_sg)
pred2 = model2.predict(X_test_sg)

a2 = accuracy_score(y_test_sg, pred2)
print("Accuracy:", a2*100, "%")

Accuracy: 86.59420289855072 %


## Pretrained Google Word2Vec Model Based


In [2]:
file_name = '/content/drive/MyDrive/Word2vec/GoogleNews-vectors-negative300.bin.gz'

In [4]:
from gensim.models import KeyedVectors
google_w2vec = KeyedVectors.load_word2vec_format(file_name, binary=True)

In [5]:
# extract vectors from all words in doc
def get_embedding_ggl(doc_tokens):
    embeddings = []
    model = google_w2vec
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
   
    return np.mean(embeddings, axis=0)

In [51]:
data['google_vectors'] = data['Text'].apply(lambda x: get_embedding_ggl(x))

  
  import sys


In [53]:
X_ggl = pd.DataFrame(data['google_vectors'].values.tolist())
X_ggl.shape

(5518, 300)

# Label Encoding

In [55]:
le = LabelEncoder()
y = le.fit_transform(data.Class)

In [57]:
X_train_gl, X_test_gl, y_train_gl, y_test_gl = train_test_split(X_ggl, y, test_size=0.2, random_state=42)

# Text classification model

In [58]:
model3 = SVC()
model3 = model3.fit(X_train_gl, y_train_gl)
pred3 = model3.predict(X_test_gl)

a3 = accuracy_score(y_test_gl, pred3)
print("Accuracy:", a3*100, "%")

Accuracy: 97.01086956521739 %
