# GloVe Vectors

Downloaded Golve Vectors from here : https://nlp.stanford.edu/projects/glove/

In [1]:
## we set the seed so that we receive the same random weights at each time we train the program
import tensorflow as tf
tf.random.set_seed(1)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation,Dropout
from tensorflow.keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam

import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

### Load Data

In [4]:
df = pd.read_csv('datasets/spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [6]:
df.shape

(5572, 5)

In [7]:
df.drop(columns = ['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace = True)
df.columns = ['sentiment','review']
df.head()

Unnamed: 0,sentiment,review
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df['spam'] = pd.get_dummies(df['sentiment'],drop_first=True)
df.head()

Unnamed: 0,sentiment,review,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
df.drop(columns = ['sentiment'], inplace=True)
df.head() 
# 0 for ham and 1 for spam

Unnamed: 0,review,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
df['review'].isnull().sum()

0

In [11]:
df['spam'].value_counts()

0    4825
1     747
Name: spam, dtype: int64

### Data Cleaning

In [12]:
import contractions as cont # to fix the contractions we use this library

print(cont.fix("u"))
print(cont.fix("i'd"))
print(cont.fix("we'll've"))

you
I would
we will have


In [13]:
from bs4 import BeautifulSoup
import re
import unicodedata
import contractions as cont # to fix the contractions we use this library

def DataCleaner(x):
    x = BeautifulSoup(x, 'html.parser').get_text() # remove html tags
    x = re.sub(r'(http|ftp|https)\S+\s*', '', x)  # remove URLs
    x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x) # remove Emails
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore') # remove Accented Text
    x = " ".join([cont.fix(word.lower()) for word in x.split()]) # we expand the contraction of words
    x = re.sub('[^a-zA-Z]+', ' ', x) # here we replace all with a space character except for alphabets.
    x = " ".join([word.lower() for word in x.split()])  # remove extra whitespace around the words.
    return x

# we haven't removed the stop words here because glove vectors also contain vector representation for the stop words

In [14]:
df['review'] = df['review'].apply(lambda x: DataCleaner(x))
df.head()

Unnamed: 0,review,spam
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif you oni,0
2,free entry in a wkly comp to win fa cup final ...,1
3,you dun say so early hor you c already then say,0
4,nah i do not think he goes to usf he lives aro...,0


In [15]:
df.tail()

Unnamed: 0,review,spam
5567,this is the nd time we have tried contact you ...,1
5568,will b going to esplanade fr home,0
5569,pity was in mood for that so any other suggest...,0
5570,the guy did some bitching but i acted like i w...,0
5571,rofl its true to its name,0


In [16]:
text = df['review'].tolist()
text[:3]

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif you oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s']

In [17]:
y = df['spam']

In [18]:
token = Tokenizer()
token.fit_on_texts(text)

In [19]:
vocab_size = len(token.word_index)+1
vocab_size

7636

In [20]:
#token.index_word

In [21]:
encoded_text = token.texts_to_sequences(text)
encoded_text[:2]

[[45,
  444,
  3859,
  777,
  701,
  619,
  65,
  9,
  1226,
  73,
  117,
  319,
  939,
  143,
  2676,
  1227,
  58,
  51,
  3860,
  130],
 [43, 320, 1374, 445, 1, 1766]]

In [22]:
# getting maximum sentence length and its position 

max_length = len(encoded_text[0])
max_ind = 0
for i, arr in enumerate(encoded_text):
    if max_length < len(arr):
        max_length = len(arr)
        max_ind = i
print("maximum array length {} and it is present at index of {}".format(max_length, max_ind))

maximum array length 190 and it is present at index of 1084


In [23]:
## it shows an array of maximum length 
#encoded_text[max_ind]

In [24]:
max_len = max_length + 10  # just adding extra length to make it 200 dimension
max_len

200

In [25]:
X = pad_sequences(encoded_text, maxlen = max_len, padding = 'pre')
X.shape

(5572, 200)

In [26]:
X[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# GloVe Vectors

In [27]:
glove_vectors = dict()

In [28]:
### Make a dictionary of glove vectors.

file = open('GloVe models/glove.6B.200d.txt', encoding = 'utf-8')

for line in file:
    values = line.split()
    word = values[0]
    vectors = np.asarray(values[1:])
    glove_vectors[word] = vectors
file.close()

In [29]:
keys = glove_vectors.keys()
list(keys)[:15]

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is']

In [30]:
len(glove_vectors.keys())

400000

In [31]:
glove_vectors.get('you').shape

(200,)

In [32]:
a =glove_vectors.get('asciii')
print(a)

None


## matching and making a vector of words present in glove_vectors

In [33]:
word_vector_matrix = np.zeros(shape=(vocab_size, max_len))   
print(word_vector_matrix.shape)

(7636, 200)


In [34]:
unMatched_words = []
for word, index in token.word_index.items():
    vector = glove_vectors.get(word)
    
    if vector is not None:
        word_vector_matrix[index] = vector
    else:
        unMatched_words.append(word)

In [35]:
print("{} words present in the dataset that are not matched with glove vectors.".format(len(unMatched_words)))
unMatched_words[:10]

1300 words present in the dataset that are not matched with glove vectors.


['pobox',
 'aight',
 'thanx',
 'optout',
 'chikku',
 'mths',
 'msgs',
 'knw',
 'frnd',
 'boytoy']

* word_vector_matrix -- contains the weights from glove vectors for each matched word in X and remaining unmatched words make into zeroes...


In [36]:

word_vector_matrix[:2]  ## these are the pretained-weights that can be used for our model building. 

array([[ 0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0. 

In [37]:
word_vector_matrix.shape

(7636, 200)

word_vector_matrix --- It has the weights for corresponding numbers in X

## Model Building with Keras

In [38]:
X[:2] # this is previously built with pad sequences.

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [39]:
X_train ,X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2, shuffle=False)

In [40]:
vec_size=200 
#changing each word into a vector of size 200

In [41]:
model = Sequential()
model.add(Embedding(vocab_size, vec_size, input_length = max_len, weights = [word_vector_matrix], trainable = False))

model.add(Conv1D(64, 8, activation = 'relu'))
model.add(Conv1D(64, 8, activation = 'relu'))
model.add(MaxPooling1D(2))

model.add(Flatten())
model.add(Dense(32, kernel_initializer = 'he_normal',activation = 'relu'))
model.add(Dropout(0.25))

model.add(Dense(16, kernel_initializer = 'he_normal', activation='relu'))
#model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))



In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          1527200   
_________________________________________________________________
conv1d (Conv1D)              (None, 193, 64)           102464    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 186, 64)           32832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 93, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 5952)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                190496    
_________________________________________________________________
dropout (Dropout)            (None, 32)                0

In [43]:
model.compile(optimizer=Adam(learning_rate=0.001), loss = 'binary_crossentropy',metrics = ['accuracy'])

In [44]:
# implementing early stopping and model check point 

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

# stop training process if the accuracy of the train data is not increased after 5 epochs
es = EarlyStopping(monitor= "accuracy" , min_delta= 0.01, patience= 5, verbose=1)

# do not save the weights until and unless the validation accuracy is not increased.
mc = ModelCheckpoint(filepath= "saved models/Spam detection with Glove Vectors.h5", monitor="val_accuracy", verbose=1, save_best_only= True)

In [45]:
X_train.shape

(4457, 200)

In [46]:
model.fit(X_train, y_train, epochs = 20, validation_data = (X_test,y_test), callbacks=[es,mc])

Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.97489, saving model to saved models/Spam detection with Glove Vectors.h5
Epoch 2/20
Epoch 00002: val_accuracy did not improve from 0.97489
Epoch 3/20
Epoch 00003: val_accuracy improved from 0.97489 to 0.98475, saving model to saved models/Spam detection with Glove Vectors.h5
Epoch 4/20
Epoch 00004: val_accuracy improved from 0.98475 to 0.98655, saving model to saved models/Spam detection with Glove Vectors.h5
Epoch 5/20
Epoch 00005: val_accuracy did not improve from 0.98655
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.98655
Epoch 7/20
Epoch 00007: val_accuracy did not improve from 0.98655
Epoch 8/20
Epoch 00008: val_accuracy improved from 0.98655 to 0.98744, saving model to saved models/Spam detection with Glove Vectors.h5
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.98744
Epoch 10/20
Epoch 00010: val_accuracy did not improve from 0.98744
Epoch 11/20
Epoch 00011: val_accuracy did not improve from

<tensorflow.python.keras.callbacks.History at 0x1ad84ec1ac0>

# load saved model and Evaluate predictions

In [48]:
from tensorflow.keras.models import load_model

In [49]:
Model = load_model('saved models/Spam detection with Glove Vectors.h5')

In [53]:
from sklearn.metrics import confusion_matrix, classification_report

def MetricEvaluation(x_data, y_data):
    pred = Model.predict(x_data)
    pred = pred > 0.5
    print(confusion_matrix(y_data, pred))
    print(classification_report(y_data, pred))

In [55]:
# metric evaluation for train data
MetricEvaluation(X_train, y_train)


[[3855    0]
 [   3  599]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3855
           1       1.00      1.00      1.00       602

    accuracy                           1.00      4457
   macro avg       1.00      1.00      1.00      4457
weighted avg       1.00      1.00      1.00      4457



In [56]:
# metric evaluation for test data
MetricEvaluation(X_test, y_test)


[[967   3]
 [ 11 134]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       970
           1       0.98      0.92      0.95       145

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### Predicting New Data
* **spam = 1**
* **ham = 0**

In [59]:
def get_encode(x):
    x = DataCleaner(x)
    x = token.texts_to_sequences([x])
    x = pad_sequences(x, maxlen = max_len, padding = 'pre')
    return x

In [60]:
result = get_encode('u won a free ticket to singapore for a tour')

In [64]:
pred = (model.predict(result) > 0.5).astype('int32')
if pred == 1:
    print("The message is Spam!")
else:
    print("The message is not a Spam!")

The message is Spam!
