# **Text Classification Using Learned Word Embeddings**

We will download 3 books:


*   The Call of the Wild, by Jack London
*   Dracula, by Bram Stoker
*   The Adventures of Sherlock Holmes, by Arthur Conan Doyle

We will split the books into a collection of paragraphs and train a machine learning model to determine the book a paragraph was taken from.

In [None]:
import bs4 as bs
import urllib.request
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.activations import *
from tensorflow.keras.initializers import *

from keras.models import Model
from keras.regularizers import l2

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, Callback

In [None]:
def get_paragraph_list(url,skip):
    """Download an HTML page and return the text of its paragraphs.

    Args:
        url: Address of the HTML document to fetch.
        skip: Number of leading paragraphs (counted after filtering) to drop.

    Returns:
        A list with the text of every <p> element whose text is at least
        25 characters long, in document order, minus the first `skip` entries.
    """
    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    soup = bs.BeautifulSoup(data, 'lxml')
    texts = (paragraph.get_text() for paragraph in soup.find_all('p'))
    # The length test also rejects empty strings, so no separate check is needed.
    paragraphs = [par for par in texts if len(par) >= 25]
    return paragraphs[skip:]

# One URL per book; the position in this list is used as the class label.
url_list = ['http://www.gutenberg.org/files/215/215-h/215-h.htm', 'http://www.gutenberg.org/files/345/345-h/345-h.htm', 'http://www.gutenberg.org/files/1661/1661-h/1661-h.htm']

paragraphs = []
targets = []
first_par = []
# Number of leading paragraphs to discard for each book, aligned with url_list.
skip = [1,4,0]
for u, url in enumerate(url_list):
    par = get_paragraph_list(url, skip[u])
    # Extend in place rather than rebuilding the full lists on every iteration.
    paragraphs.extend(par)
    targets.extend([u] * len(par))
    print('\nBook {} contains {} paragraphs'.format(u, len(par)))
    # Paragraph lengths are measured in characters.
    lengths = np.array([len(text) for text in par])
    print('Paragraph length stats:')
    # '{:.4f}' (four decimals) replaces '{:4f}', which only set a minimum width.
    print('min = {} max = {} mean = {:.4f}'.format(np.min(lengths), np.max(lengths), np.mean(lengths)))
    print('First paragraph:')
    print(par[0])


In [None]:
# Shuffle paragraph indices with a fixed seed; the first fifth of the
# permutation becomes the test set and the remainder the training set.
np.random.seed(5361)
n = len(paragraphs)
ind = np.random.permutation(n)
test_ind, train_ind = ind[:n//5], ind[n//5:]

x_test = [paragraphs[i] for i in test_ind]
x_train = [paragraphs[i] for i in train_ind]

# Labels are one-hot encoded over the 3 books.
y_test = tf.keras.utils.to_categorical([targets[i] for i in test_ind], num_classes=3)
y_train = tf.keras.utils.to_categorical([targets[i] for i in train_ind], num_classes=3)

In [None]:
# Report the number of paragraphs and the label-array shape for each split.
for texts, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(texts))
    print(labels.shape)

Now let's extract the integer sequences to describe the data. 

Notice that the vocabulary must be extracted from x_train only, since we are not allowed to use x_test for anything before testing.

Since all sequences must have the same length, we choose a length and pad with zeros the shorter sequences and truncate the longer ones.

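We also choose max_words, the maximum vocabulary size. This means that only the most common words (those whose index is below max_words) will be included in the sequence description of the data. Less common words are simply dropped from the sequences, because the Tokenizer is created without an oov_token; index 0 is never assigned to a word and is used only for padding.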

In [None]:
# Vocabulary cap and fixed sequence length used by every model below.
max_words = 15000
seq_len = 250

# The filter set strips punctuation (including curly quotes) and whitespace
# control characters; characters not listed here are kept inside tokens.
tokenizer = Tokenizer(
    num_words=max_words,
    filters='’‘”“!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n\r',
)

# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(x_train)

# Texts -> variable-length lists of integer word indices.
x_train_seq0, x_test_seq0 = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

# Force every sequence to seq_len entries: long ones lose their tail
# (truncating='post'); the padding side is left at the library default.
x_train_seq, x_test_seq = (
    pad_sequences(seqs, maxlen=seq_len, truncating='post')
    for seqs in (x_train_seq0, x_test_seq0)
)

In [None]:
# Display the raw training paragraphs as the cell output.
x_train

In [None]:
# Shapes of the padded integer-sequence arrays (train first, then test).
for seqs in (x_train_seq, x_test_seq):
    print(seqs.shape)

Remove sequences that have no in-vocabulary words. 

In [None]:
# Number of rows that sum to zero (no word indices at all) before filtering.
for seqs in (x_train_seq, x_test_seq):
    print(np.sum(seqs.sum(axis=1) == 0))

# Keep only the training rows that contain at least one non-zero index,
# and keep the raw texts and labels aligned with them.
has_valid_words = np.flatnonzero(x_train_seq.max(axis=1) > 0)
x_train_seq = x_train_seq[has_valid_words]
x_train = [x_train[i] for i in has_valid_words]
y_train = y_train[has_valid_words]

# Same filtering for the test split.
has_valid_words = np.flatnonzero(x_test_seq.max(axis=1) > 0)
x_test_seq = x_test_seq[has_valid_words]
x_test = [x_test[i] for i in has_valid_words]
y_test = y_test[has_valid_words]

# Recount the all-zero rows after filtering.
for seqs in (x_train_seq, x_test_seq):
    print(np.sum(seqs.sum(axis=1) == 0))

In [None]:
# Mapping from word to integer index built by the tokenizer; the count below
# is the size of that mapping.
word_index = tokenizer.word_index
print('Found {} unique words'.format(len(word_index)))

We may also need to perform reverse queries such as 'which word has index i?'. For this, we build a list such that if i == word_index[w], word w is stored in position i in the list.

In [None]:
# Inverse lookup table: word_list[i] is the word whose index is i.
# Slots that no word maps to keep the 'UNKNOWN' placeholder.
word_list = ['UNKNOWN'] * (len(word_index) + 1)
for w, idx in word_index.items():
    word_list[idx] = w

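Position 0 is never assigned to a word by the tokenizer (0 is the padding value), so it keeps the 'UNKNOWN' placeholder. After that, words are sorted by how frequently they appear in the training text, most common first.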

In [None]:
# First 20 entries of the index -> word table, one per line.
print('\n'.join(word_list[:20]))

Near the end we have the least-common words. 

In [None]:
# Last 20 entries of the index -> word table, one per line.
print('\n'.join(word_list[-20:]))

Let's see a random training example and its corresponding sequence representation.

In [None]:
# Pick one training example at random and show its text and its padded
# integer sequence.
r = np.random.randint(len(x_train))
example_text, example_seq = x_train[r], x_train_seq[r]
print('r=',r)
print(example_text)
print(example_seq)

# Decode the sequence back to words, skipping the zero entries; each word is
# followed by a single space and no newline is emitted.
print(''.join(word_list[i] + ' ' for i in example_seq if i > 0), end='')

In [None]:
def plot_results(history):
    """Draw accuracy and loss curves from a training history.

    Args:
        history: Object whose `history` attribute is a mapping containing the
            keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.

    Two figures are created (accuracy first, then loss). In each, the training
    curve is labelled 'train' and the validation curve 'test'.
    """
    record = history.history
    # Fetch every series up front, in the same order as they are needed below.
    series = {key: record[key] for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy')}

    panels = (
        ('Accuracy', 'accuracy', 'val_accuracy', 'lower right'),
        ('Loss', 'loss', 'val_loss', 'upper right'),
    )
    for title, train_key, val_key, legend_loc in panels:
        fig, ax = plt.subplots()
        ax.plot(series[train_key], label='train')
        ax.plot(series[val_key], label='test')
        ax.set_title(title)
        ax.legend(loc=legend_loc)
    plt.show()

We will create an embedding layer. An embedding layer receives the parameters (max_words, emb_len, seq_len), where max_words is the number of words in the vocabulary, emb_len is the chosen number of dimensions used to represent each word embedding, and seq_len is the number of words in each paragraph (padded or truncated, as described above).

The parameters, or weights, of an embedding layer consist of a 2D array with max_words rows and emb_len columns such that row i contains the embedding of the word with index i, that is, word_list[i].


In [None]:
def cnn1D(vocab_size,emb_len=50, seq_len=250, n_classes=3,dropout=0.5,n=128):
    """Build a three-stage 1D convolutional classifier over learned embeddings.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_len: Length of each embedding vector.
        seq_len: Length of the input integer sequences.
        n_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after every pooling layer.
        n: Filters in the first convolution; the next two use 2*n.

    Returns:
        An uncompiled Sequential model.
    """
    ks = 7
    model = tf.keras.models.Sequential()
    model.add(Embedding(vocab_size, emb_len, input_length=seq_len, name='embeddings'))
    # Each stage: convolution -> max pooling by 4 -> dropout.
    for filters in (n, 2*n, 2*n):
        model.add(Conv1D(filters, kernel_size=ks, padding='same', activation="relu"))
        model.add(MaxPooling1D(4, padding='same'))
        model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [None]:
# Halve the learning rate after 2 epochs without val_loss improvement and
# stop training after 5 epochs without val_accuracy improvement.
rop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
es = EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

model = cnn1D(max_words, emb_len=50, seq_len=x_train_seq.shape[1], n=64)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The test split doubles as the validation data monitored by the callbacks.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=32,
    epochs=30,
    verbose=1,
    callbacks=[rop, es],
    validation_data=(x_test_seq, y_test),
)

acc = history.history['val_accuracy']
best_epoch = np.argmax(acc) + 1
print('max accuracy = {:.4f} in epoch {}, final accuracy = {:.4f}'.format(np.amax(acc), best_epoch, acc[-1]))
plot_results(history)

We will also try a simpler network that uses a single convolutional layer and global average pooling before the classification layer.

In [None]:
def cnn1D_small(vocab_size,emb_len=100, seq_len=128, n_classes=3,dropout=0.5,n=128,kernel_size = 4):
    """Build a single-convolution classifier with global average pooling.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_len: Length of each embedding vector.
        seq_len: Length of the input integer sequences.
        n_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after the convolution.
        n: Number of convolution filters.
        kernel_size: Width of the convolution window.

    Returns:
        An uncompiled Sequential model.
    """
    model = tf.keras.models.Sequential()
    stack = (
        Embedding(vocab_size, emb_len, input_length=seq_len, name='embeddings'),
        Conv1D(n, kernel_size=kernel_size, padding='same', activation="relu"),
        Dropout(dropout),
        GlobalAveragePooling1D(),
        Dense(n_classes, activation='softmax'),
    )
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# Halve the learning rate after 3 epochs without val_loss improvement and
# stop training after 6 epochs without val_accuracy improvement.
rop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
es = EarlyStopping(monitor='val_accuracy', patience=6, verbose=1)

model = cnn1D_small(max_words, emb_len=50, seq_len=x_train_seq.shape[1], n=128)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The test split doubles as the validation data monitored by the callbacks.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=32,
    epochs=30,
    verbose=1,
    callbacks=[rop, es],
    validation_data=(x_test_seq, y_test),
)

acc = history.history['val_accuracy']
best_epoch = np.argmax(acc) + 1
print('max accuracy = {:.4f} in epoch {}, final accuracy = {:.4f}'.format(np.amax(acc), best_epoch, acc[-1]))
plot_results(history)

We can also experiment with various kernel sizes.

In [None]:
# Train one small CNN per kernel size and record the validation accuracy of
# the last epoch of each run.
accuracies = []
for kernel_size in [2,4,8,16,32,64,128]:
    # Fresh callbacks for every run, so no state carries over between runs.
    rop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
    es = EarlyStopping(monitor='val_accuracy', patience=6, verbose=1)

    model = cnn1D_small(
        max_words,
        emb_len=50,
        seq_len=x_train_seq.shape[1],
        n=128,
        kernel_size=kernel_size,
    )
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(
        x_train_seq,
        y_train,
        batch_size=32,
        epochs=30,
        verbose=1,
        callbacks=[rop, es],
        validation_data=(x_test_seq, y_test),
    )

    acc = history.history['val_accuracy']
    best_epoch = np.argmax(acc) + 1
    print('kernel size=',kernel_size)
    print('max accuracy = {:.4f} in epoch {}, final accuracy = {:.4f}'.format(np.amax(acc), best_epoch, acc[-1]))
    accuracies.append(acc[-1])
    plot_results(history)

In [None]:
# Last-epoch validation accuracy of each run, in the order the kernel sizes were tried.
print(accuracies)

Now let's observe the similarity/dissimilarity of the embeddings of various words to see if they match our expectations. Words with similar meanings and syntactic roles should have positive cosine similarity.

In [None]:
# First weight array of the most recently trained model; every model built in
# this notebook starts with an Embedding layer, so this is presumably its
# embedding table. The same extraction is repeated in the next cell.
embedding_matrix = model.get_weights()[0] 
print(embedding_matrix.shape)

In [None]:
# Define the cosine similarity of two words.
def cosine_similarity(w1,w2,emb,word_index):
    """Print and return the cosine similarity between two word embeddings.

    Args:
        w1: First word.
        w2: Second word.
        emb: 2D array whose row i is the embedding of the word with index i.
        word_index: Mapping from word to row index in `emb`.

    Returns:
        The cosine of the angle between the two embedding vectors. The value
        is also printed with three decimals.

    Raises:
        KeyError: If `word_index` is a dict and either word is missing from it.
    """
    e1, e2 = emb[word_index[w1]], emb[word_index[w2]]
    cs = np.dot(e1, e2)/np.linalg.norm(e1)/np.linalg.norm(e2)
    print('cosine similarity({},{})={:4.3f}'.format(w1,w2,cs))
    # Returning the value lets callers reuse it; statement-style calls are unaffected.
    return cs

# Take the first weight array of the trained network as the embedding table.
embedding_matrix = model.get_weights()[0]
print(embedding_matrix.shape)

# Compare each anchor word in W1 against every probe word in W2.
W1 = ['buck','dracula','holmes']
W2 = ['deer','wolf','spitz','blood','vampire','detective','watson']
for w1, w2 in ((anchor, probe) for anchor in W1 for probe in W2):
    cosine_similarity(w1, w2, embedding_matrix, word_index)

# **Text Classification Using Pretrained Word Embeddings**

We will see how we can also initialize our embedding matrix with pretrained values. 

We can download a zip file containing word embeddings of lengths 50,100,200, and 300. 

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

Now we'll build a dictionary where embedding[w] contains the embedding of word w. We will use embeddings of length 50. 

In [None]:
import numpy as np

def read_embeddings(n=1000, path='glove.6B.50d.txt'):
    """Read up to `n` word embeddings from a GloVe-style text file.

    Each line of the file is expected to hold a word followed by its
    space-separated vector components.

    Args:
        n: Maximum number of lines to read. Values <= 0 yield an empty dict.
        path: Location of the embeddings file.

    Returns:
        A dictionary where embedding[w] is the float32 NumPy vector of string w.
    """
    from itertools import islice

    embedding = {}
    with open(path, encoding="utf8") as f:
        # islice stops after n lines without a manual counter and, unlike a
        # "count >= n" check made after the first read, reads nothing for n <= 0.
        for line in islice(f, max(n, 0)):
            word, *values = line.rstrip('\n').split(" ")
            embedding[word] = np.array(values, dtype=np.float32)
    return embedding

# Upper bound on the number of embeddings to read from the file.
vocabulary_size = 10**9
embedding = read_embeddings(n=vocabulary_size)

In [None]:
# Stack every pretrained vector into one matrix (one row per word, in
# dictionary order); the width is taken from the vector of the word 'a'.
n_vectors, n_dims = len(embedding), len(embedding['a'])
emb_mat = np.zeros((n_vectors, n_dims))
for row, vector in zip(emb_mat, embedding.values()):
    row[:] = vector

# Mean of each embedding dimension across all words.
plt.plot(emb_mat.mean(axis=0))

In [None]:
# Mean of each embedding dimension across all words.
plt.plot(emb_mat.mean(axis=0))

In [None]:
# Standard deviation of each embedding dimension across all words.
plt.plot(emb_mat.std(axis=0))

In [None]:
# Standard deviation and mean over all entries of the matrix.
print(emb_mat.std())
print(emb_mat.mean())

In [None]:
# One row per tokenizer index, plus row 0, which no word maps to.
num_tokens = len(tokenizer.word_index)+1
embedding_dim = len(embedding['a'])
hits = 0
misses = 0

# Every row starts as the per-dimension mean of the pretrained vectors, so
# rows that are never overwritten below (row 0 and words missing from the
# pretrained set) hold that mean vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim)) + np.mean(emb_mat,axis=0,keepdims=True)
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    # Word found in the pretrained set: copy its vector into row i.
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))
print(embedding_matrix.shape)

In [None]:
# Frozen embedding layer whose weights start as the matrix built above.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    trainable=False,
    embeddings_initializer=Constant(embedding_matrix),
)

In [None]:
def cnn1D(embedding_matrix, seq_len=250, n_classes=3,dropout=0.5,n=128):
    """Build a 1D convolutional classifier on top of frozen embeddings.

    Args:
        embedding_matrix: 2D array used to initialize the embedding layer;
            its shape gives the table size and the embedding length.
        seq_len: Length of the input integer sequences.
        n_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after every pooling layer.
        n: Filters in the first pair of convolutions; the second pair uses 2*n.

    Returns:
        An uncompiled Sequential model whose embedding layer is not trainable.
    """
    ks = 6
    vocab_rows = embedding_matrix.shape[0]
    emb_len = embedding_matrix.shape[1]

    model = tf.keras.models.Sequential()
    model.add(Embedding(
        vocab_rows,
        emb_len,
        input_length=seq_len,
        name='embeddings',
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,
    ))
    # Each stage: two convolutions -> max pooling by 5 -> dropout.
    for filters in (n*1, n*2):
        for _repeat in range(2):
            model.add(Conv1D(filters, kernel_size=ks, padding='same', activation="relu"))
        model.add(MaxPooling1D(5, padding='same'))
        model.add(Dropout(dropout))

    model.add(Flatten())
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [None]:
# Halve the learning rate after 3 epochs without val_loss improvement and
# stop training after 6 epochs without val_accuracy improvement.
rop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
es = EarlyStopping(monitor='val_accuracy', patience=6, verbose=1)

model = cnn1D(embedding_matrix, seq_len=seq_len, n=128)
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# The test split doubles as the validation data monitored by the callbacks.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=32,
    epochs=100,
    verbose=1,
    callbacks=[rop, es],
    validation_data=(x_test_seq, y_test),
)

acc = history.history['val_accuracy']
best_epoch = np.argmax(acc) + 1
print('max accuracy = {:.4f} in epoch {}, final accuracy = {:.4f}'.format(np.amax(acc), best_epoch, acc[-1]))

In [None]:
def cnn1D_small(embedding_matrix, seq_len=250, n_classes=3,dropout=0.5,n=128):
    """Build a single-convolution classifier initialized from given embeddings.

    Args:
        embedding_matrix: 2D array used to initialize the embedding layer;
            its shape gives the table size and the embedding length.
        seq_len: Length of the input integer sequences.
        n_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after the convolution.
        n: Number of convolution filters (the kernel size is fixed at 32).

    Returns:
        An uncompiled Sequential model whose embedding layer remains trainable.
    """
    vocab_rows = embedding_matrix.shape[0]
    emb_len = embedding_matrix.shape[1]

    model = tf.keras.models.Sequential()
    stack = (
        Embedding(
            vocab_rows,
            emb_len,
            input_length=seq_len,
            name='embeddings',
            embeddings_initializer=Constant(embedding_matrix),
            trainable=True,
        ),
        Conv1D(n, kernel_size=32, padding='same', activation="relu"),
        Dropout(dropout),
        GlobalAveragePooling1D(),
        Dense(n_classes, activation='softmax'),
    )
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# Halve the learning rate after 3 epochs without val_loss improvement and
# stop training after 6 epochs without val_accuracy improvement.
rop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
es = EarlyStopping(monitor='val_accuracy', patience=6, verbose=1)

model = cnn1D_small(embedding_matrix, seq_len=seq_len, n=64)
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# The test split doubles as the validation data monitored by the callbacks.
history = model.fit(
    x_train_seq,
    y_train,
    batch_size=32,
    epochs=100,
    verbose=1,
    callbacks=[rop, es],
    validation_data=(x_test_seq, y_test),
)

acc = history.history['val_accuracy']
best_epoch = np.argmax(acc) + 1
print('max accuracy = {:.4f} in epoch {}, final accuracy = {:.4f}'.format(np.amax(acc), best_epoch, acc[-1]))
plot_results(history)