# Word Embeddings Using Keras Embedding Layer

# Importing Required Libraries

In [94]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import SimpleRNN

# Loading The Data 

In [95]:
data=pd.read_csv(r'/Users/arunamballa/Desktop/NLP/Projects/Word_Embeddings/Fake.csv')

In [96]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# Size of Data

In [97]:
data.shape

(72134, 4)

# Checking For Null Values

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


# Dropping the Null Values

In [99]:
data=data.dropna()

# Size of Data After Dropping Null Values

In [100]:
data.shape

(71537, 4)

# Checking Again for Null Values

In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71537 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  71537 non-null  int64 
 1   title       71537 non-null  object
 2   text        71537 non-null  object
 3   label       71537 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB


# Dependent and Independent Variables

In [102]:
X=data[['title']]
Y=data[['label']]

# Size of Independent Variable 

In [103]:
X.shape

(71537, 1)

In [104]:
X.head()

Unnamed: 0,title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...
5,About Time! Christian Group Sues Amazon and SP...


# Size of Dependent Variable

In [105]:
Y.shape

(71537, 1)

In [106]:
Y.head()

Unnamed: 0,label
0,1
2,1
3,0
4,1
5,1


# Vocabulary Size

In [107]:
voc_size=5000

In [108]:
messages=X.copy()

In [109]:
messages

Unnamed: 0,title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...
5,About Time! Christian Group Sues Amazon and SP...
...,...
72129,Russians steal research on Trump in hack of U....
72130,WATCH: Giuliani Demands That Democrats Apolog...
72131,Migrants Refuse To Leave Train At Refugee Camp...
72132,Trump tussle gives unpopular Mexican leader mu...


In [110]:
messages.reset_index(inplace=True)

In [111]:
messages['title']

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2        Bobby Jindal, raised Hindu, uses story of Chri...
3        SATAN 2: Russia unvelis an image of its terrif...
4        About Time! Christian Group Sues Amazon and SP...
                               ...                        
71532    Russians steal research on Trump in hack of U....
71533     WATCH: Giuliani Demands That Democrats Apolog...
71534    Migrants Refuse To Leave Train At Refugee Camp...
71535    Trump tussle gives unpopular Mexican leader mu...
71536    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: title, Length: 71537, dtype: object

In [112]:
import nltk
import re
from nltk.corpus import stopwords

# Text Preprocessing

In [113]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arunamballa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [114]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [word for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Pre-Processed Text

In [115]:
corpus

['law enforcement high alert following threats cops whites blacklivesmatter fyf terrorists video',
 'unbelievable obama attorney general says charlotte rioters peaceful protesters home state north carolina video',
 'bobby jindal raised hindu uses story christian conversion woo evangelicals potential bid',
 'satan russia unvelis image terrifying new supernuke western world takes notice',
 'time christian group sues amazon splc designation hate group',
 'dr ben carson targeted irs never audit spoke national prayer breakfast',
 'house intel chair trump russia fake story evidence anything video',
 'sports bar owner bans nfl games show true american sports like speak rural america video',
 'latest pipeline leak underscores dangers dakota access pipeline',
 'gop senator smacked punchable alt right nazi internet',
 'may brexit offer would hurt cost eu citizens eu parliament',
 'schumer calls trump appoint official oversee puerto rico relief',
 'watch hilarious ad calls question health aging c

# One Hot Representation of Text

In [116]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr[13]

[4306, 1959, 3748, 4512, 2865, 1475, 3587, 2535, 2731, 1701]

# Padding the One Hot Vectors 

In [117]:
sent_length=25
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1902 2014 4287]
 [   0    0    0 ... 2512 3298 4287]
 [   0    0    0 ... 1871  723 3655]
 ...
 [   0    0    0 ... 4004 2447 3944]
 [   0    0    0 ... 4420 1443 3459]
 [   0    0    0 ... 2306  235  580]]


# Building the Model

In [118]:
embeded_vector_size = 5
model = Sequential()
model.add(Embedding(voc_size, embeded_vector_size, input_length=sent_length,name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Model Summary

In [119]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 5)             25000     
                                                                 
 flatten_2 (Flatten)         (None, 125)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 126       
                                                                 
Total params: 25,126
Trainable params: 25,126
Non-trainable params: 0
_________________________________________________________________
None


In [120]:
x=embedded_docs

# Training the Model

In [124]:
model.fit(x, Y, epochs=10,validation_data=(x,Y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb6205960d0>

In [126]:
loss, accuracy = model.evaluate(x, Y)
accuracy



0.9908159375190735

# Vector Representation of words

In [127]:
weights = model.get_layer('embedding').get_weights()[0]

# Embedding Matrix

In [128]:
weights

array([[ 0.14594033, -0.12557507,  0.16180767, -0.5552409 , -0.01409557],
       [ 0.25719252,  0.250373  , -0.70995235, -0.2943045 , -0.43104315],
       [ 0.05882793,  0.13476382,  0.17608473, -1.3089788 , -0.0503479 ],
       ...,
       [ 0.31923202,  0.17728499, -0.5192481 ,  0.9203519 , -0.13629559],
       [-0.22885236,  0.2732947 ,  0.17715603, -0.27341366,  0.25535563],
       [ 0.6892309 ,  0.69419754, -0.9245471 , -0.836947  ,  0.35552913]],
      dtype=float32)

In [129]:
weights[623]

array([ 0.4343438 ,  0.5049921 ,  0.00836909,  0.21064806, -0.02364959],
      dtype=float32)

In [130]:
weights[3587]

array([ 0.44354784, -0.03145494, -0.523498  , -0.04465419,  0.47175002],
      dtype=float32)