# NLTK library training

In [1]:
import nltk

In [2]:
dir(nltk)[:10]

['ARLSTem',
 'ARLSTem2',
 'AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures']

In [3]:
from nltk.corpus import stopwords

stopwords.words('english')[:100:10]

['i',
 "you've",
 'himself',
 'they',
 'that',
 'been',
 'a',
 'while',
 'through',
 'in']

Reading spam/ham massages:

In [4]:
import pandas as pd

messages = pd.read_csv('spam.csv', encoding='latin-1')

In [5]:
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
messages.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
messages.rename(columns={'v1': 'label', 'v2': 'text'}, inplace=True)

In [7]:
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
messages.shape

(5572, 2)

In [9]:
messages['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
print(f'Number of null values in labels is: {messages["label"].isnull().sum()}')
print(f'Number of null values in text is: {messages["text"].isnull().sum()}')

Number of null values in labels is: 0
Number of null values in text is: 0


## 1. Pre-processing data:

In [11]:
pd.set_option('display.max.colwidth', 100)

### 1.1. Removing punctuation:

In [12]:
import string
print(string.punctuation)

def remove_punctuation(text):
    text = ''.join([sign for sign in text if sign not in string.punctuation])
    return text

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
messages['cleaned'] = messages['text'].apply(lambda x: remove_punctuation(x))

In [14]:
messages.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


### 1.2. Tokenizing:

In [15]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [16]:
messages['tokenized'] = messages['cleaned'].apply(lambda y: tokenize(y.lower()))

In [17]:
messages.head()

Unnamed: 0,label,text,cleaned,tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


### 1.3. Removing stopwords:

In [18]:
def remove_stopwords(text):
    text = [char for char in text if char not in stopwords]
    return text

stopwords = nltk.corpus.stopwords.words('english')

messages['non_stop'] = messages['tokenized'].apply(lambda a: remove_stopwords(a))
messages.head()

Unnamed: 0,label,text,cleaned,tokenized,non_stop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


Term Frequency - Inverse Document Frequency
TF-IDF:

- Creates a document-term matrix with 1 row per document and 1 column per word in the corpus
- Generates a weight for each word-document pair which corresponds to how important the word is in the document within the context of its frequency within a larger corpus;

1.4. Vectorizing using TF-IDF:

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

def text_cleaner(text):
    text = ''.join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text


tfidf_vec = TfidfVectorizer(analyzer=text_cleaner)

In [20]:
X_tfidf = tfidf_vec.fit_transform(messages['text'])

In [21]:
X_tfidf.shape

(5572, 9395)

In [22]:
tfidf_vec.get_feature_names_out()

array(['', '0', '008704050406', ..., 'ûïharry', 'ûò', 'ûówell'],
      dtype=object)

In [23]:
X_tfidf

<5572x9395 sparse matrix of type '<class 'numpy.float64'>'
	with 50453 stored elements in Compressed Sparse Row format>

In [24]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [25]:
X_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Modelling

### 2.1. Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
print(RandomForestClassifier().__dict__)

{'estimator': DecisionTreeClassifier(), 'n_estimators': 100, 'estimator_params': ('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), 'base_estimator': 'deprecated', 'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'max_samples': None, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}


In [27]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
from sklearn.model_selection import train_test_split

In [28]:
# splitting data into training and testing sets:

X_train, X_test, y_train, y_test = train_test_split(X_features, messages['label'], test_size=0.2)

In [29]:
rfc = RandomForestClassifier()

rfc_model = rfc.fit(X_train, y_train)  # fitted-model 

In [30]:
y_hat = rfc_model.predict(X_test)  # predicting labels on test set

In [31]:
# Evaluating model with precision and recall scores:
precision = precision_score(y_true=y_test, y_pred=y_hat, pos_label='spam')
recall = recall_score(y_true=y_test, y_pred=y_hat, pos_label='spam')

In [32]:
print(precision)
print(recall)

1.0
0.8424657534246576


Precision of 100% means that all of the examples which the model assigned to be 'spam' where actually spam. So the model did not assign any ham as 'spam'.

Recall of ~80% means that 80% of the examples which were actually spam the model correctly assigned to be 'spam'. This mean that 20% of spam examples were labelled incorrectly as 'ham'.

## 2.2. word2vec

First, it is good to explore pre-trained embeddings:
- glove-twitter-{25/50/100/200}
- glove-wiki-gigaword-{50/200/300}
- word2vec-google-news-300
- word2vec-ruscorpora-news-300

In [33]:
import gensim.downloader as api

wiki_embeddings = api.load('glove-twitter-100')

In [34]:
wiki_embeddings['boy']  # vector of length 100

array([ 1.3953e-01, -4.1967e-01, -1.5159e-01,  1.6103e-01,  7.2636e-02,
       -8.8888e-02,  3.8640e-01,  6.2376e-01,  1.2745e-03, -7.5120e-02,
       -4.4217e-01,  2.6583e-01, -3.7124e+00,  4.1712e-01,  4.3375e-01,
        1.7227e-01,  4.6666e-01, -4.0818e-01, -1.0639e-01, -5.2362e-01,
        3.3871e-01, -1.8730e-02,  3.9345e-02,  2.2960e-01,  1.3549e-01,
       -8.4601e-01, -1.2257e-01, -2.5892e-01, -9.8715e-01, -9.2446e-01,
        1.0380e-01,  1.6817e-02, -3.4140e-01, -6.2911e-02,  3.5725e-01,
        7.5862e-01,  8.2661e-02, -9.5920e-02, -1.5026e-01, -1.4088e-01,
       -1.1398e+00,  2.3303e-01,  6.5569e-02,  1.2191e-01,  2.0027e-01,
       -4.5072e-02, -7.6274e-02,  1.8245e-01,  2.2045e-01, -1.7153e-01,
       -6.7000e-01,  3.3304e-01,  4.7586e-01, -3.2999e-01,  8.2177e-02,
        4.3802e-02, -3.6812e-01,  9.1956e-01,  1.8130e-01,  2.2137e-01,
       -4.1335e-01,  4.2237e-01,  1.6803e-01, -2.2091e-01, -3.8649e-01,
       -9.9980e-01, -6.6202e-01,  2.4330e-01, -1.1965e-01,  3.11

In [35]:
wiki_embeddings.most_similar('idiot')

[('stupid', 0.76278156042099),
 ('idiots', 0.7308687567710876),
 ('annoying', 0.7250351905822754),
 ('dumbass', 0.7217867970466614),
 ('asshole', 0.7147734761238098),
 ('moron', 0.6978527307510376),
 ('kidding', 0.6939009428024292),
 ('loser', 0.6928614377975464),
 ('piss', 0.6862791180610657),
 ('dumb', 0.6856788992881775)]

In [36]:
from gensim.utils import simple_preprocess
messages['cleaned_text'] = messages['text'].apply(lambda x: simple_preprocess(x))

In [37]:
messages.head()

Unnamed: 0,label,text,cleaned,tokenized,non_stop,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [38]:
X_train, X_test, y_train, y_test = train_test_split(messages['cleaned_text'], messages['label'], test_size=0.2)

In [39]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [40]:
w2v_model.wv['boy']

array([-1.45659417e-01,  1.91729099e-01,  4.96371053e-02,  1.21490076e-01,
        1.35902122e-01, -3.34450632e-01,  1.20811820e-01,  5.29424071e-01,
       -2.58313835e-01, -1.62302032e-01, -9.77684334e-02, -3.55696470e-01,
       -5.24468683e-02,  1.33679822e-01,  1.72329873e-01, -1.52294755e-01,
        5.12429401e-02, -2.47607961e-01,  7.39076314e-03, -4.30651516e-01,
        1.37692347e-01,  8.39523971e-02,  1.98822498e-01, -1.32835194e-01,
       -4.80864011e-02, -5.19449152e-02, -1.77740991e-01, -1.38990700e-01,
       -1.57179534e-01, -2.44505540e-03,  2.98438966e-01,  2.08388269e-02,
        1.55801564e-01, -1.86148167e-01, -1.22334555e-01,  2.06548333e-01,
        3.92116681e-02, -2.16806605e-01, -1.32471487e-01, -4.18110013e-01,
        3.32814194e-02, -3.01791459e-01, -4.32790257e-02,  2.89284736e-02,
        1.84741154e-01, -6.34280592e-02, -1.80797264e-01, -4.32794802e-02,
        1.81309819e-01,  1.10381693e-01,  1.07713148e-01, -2.13687792e-01,
        5.04756607e-02, -

In [41]:
w2v_model.wv.most_similar('idiot')

[('order', 0.9879282712936401),
 ('most', 0.9876950979232788),
 ('true', 0.9876198768615723),
 ('store', 0.9874565005302429),
 ('march', 0.9873875379562378),
 ('afternoon', 0.9873272776603699),
 ('is', 0.9873237013816833),
 ('gd', 0.987308144569397),
 ('life', 0.9872430562973022),
 ('before', 0.9872103333473206)]

This result clearly shows that the wikipedia embeddings are fit better than the w2v model trained on 5k samples.