In [0]:
!git clone https://github.com/adobe-research/deft_corpus.git

Cloning into 'deft_corpus'...
remote: Enumerating objects: 894, done.[K
remote: Counting objects: 100% (894/894), done.[K
remote: Compressing objects: 100% (462/462), done.[K
remote: Total 2196 (delta 601), reused 669 (delta 424), pack-reused 1302[K
Receiving objects: 100% (2196/2196), 42.39 MiB | 5.42 MiB/s, done.
Resolving deltas: 100% (1386/1386), done.


In [0]:
!unzip src.zip

Archive:  src.zip
   creating: scripts/
  inflating: scripts/__init__.py     
  inflating: scripts/task1_converter.py  
   creating: source/
  inflating: source/__init__.py      
  inflating: source/classifiers.py   
  inflating: source/data_loader.py   
  inflating: source/text_vectorizers.py  
  inflating: Data Loading and Preparation.ipynb  
  inflating: README.md               


# Loading The Data

In [0]:
from source.data_loader import DeftCorpusLoader

In [0]:
loader = DeftCorpusLoader('deft_corpus/data')

In [0]:
train_df, dev_df = loader.load_classification_data()

In [0]:
train_df.head()

Unnamed: 0,Sentence,HasDef
0,6110 . Defining obscenity has been something ...,0
1,"Into the early twentieth century , written wo...",0
2,"In 1973 , the Supreme Court established the M...",1
3,"Miller v. California , 413 U.S. 15 ( 1973 ) .",0
4,"However , the application of this standard ha...",0


# Imports

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, GlobalMaxPooling1D, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

In [0]:
loader.preprocess_data(train_df)

In [0]:
loader.clean_data(train_df)

In [0]:
loader.preprocess_data(dev_df)

In [0]:
loader.clean_data(dev_df)

In [0]:
train_df.head()

Unnamed: 0,Sentence,HasDef,Parsed
0,6110 . Defining obscenity has been something ...,0,"[defining, obscenity, challenge, court, suprem..."
1,"Into the early twentieth century , written wo...",0,"[early, 20, century, write, work, frequently, ..."
2,"In 1973 , the Supreme Court established the M...",1,"[supreme, court, establish, miller, test, deci..."
5,"In particular , the concept of “ contemporary...",0,"[particular, concept, contemporary, community,..."
6,6113 . Free expression includes the right to ...,0,"[free, expression, include, right, assemble, p..."


In [0]:
vocab = np.unique([y for x in train_df['Parsed'] for y in x])

In [0]:
vocab_size = len(vocab)

In [0]:
max_length = np.max([np.count_nonzero(x) for x in train_df['Parsed']])

In [0]:
avg_length = int(np.ceil(np.average([np.count_nonzero(x) for x in train_df['Parsed']])))

In [0]:
len(train_df['HasDef'])

16165

In [0]:
train_positive_class_length = np.count_nonzero([x for x in train_df['HasDef'] if x == 1])
train_negative_class_length = np.abs(len(train_df['HasDef']) - train_positive_class_length)

In [0]:
MAX_NB_WORDS = vocab_size    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = avg_length # max length of each entry (sentence), including padding
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
GLOVE_DIR = "glove.6B."+str(EMBEDDING_DIM)+"d.txt"

# LSTM

Long short-term memory (LSTM) is an artificial recurrent neural network (RNN) architecture. A common LSTM unit is composed of a cell, an input gate, an output gate and a forget gate. The cell remembers values over arbitrary time intervals and the three gates regulate the flow of information into and out of the cell. 
LSTM networks are well-suited to classifying, processing and making predictions based on time series data, since there can be lags of unknown duration between important events in a time series. 
Intuitively, the cell is responsible for keeping track of the dependencies between the elements in the input sequence. The input gate controls the extent to which a new value flows into the cell, the forget gate controls the extent to which a value remains in the cell and the output gate controls the extent to which the value in the cell is used to compute the output activation of the LSTM unit. The activation function of the LSTM gates is often the logistic sigmoid function. 

![LSTM](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

## Data Preprocessing for Model

To use keras and make a model with an embedding layer as an input layer we need each word in the vocab to be represented by a number so the Tokenizer class is used. The input, also, has to be a vector of numbers and that vector has to be of the same size for all the documents. The average length is taken and extra words are removed form the vectors and padding is used if the length is less than the average.

In [0]:
tokenizer = Tokenizer(num_words=vocab_size)

In [0]:
tokenizer.fit_on_texts(train_df['Parsed'])

In [0]:
train_sequences = tokenizer.texts_to_sequences(train_df['Parsed'])

In [0]:
word_index = tokenizer.word_index

In [0]:
trunc_type = 'post'
padding_type = 'post'

In [0]:
train_padded = pad_sequences(train_sequences, maxlen=avg_length, padding=padding_type, truncating=trunc_type)
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

19
13
24
13
18
13


In [0]:
print(train_padded[10])

[ 756 2521  145   17   20   22  573 1134  114   19  648   24   27]


In [0]:
dev_sequences = tokenizer.texts_to_sequences(dev_df['Parsed'])
dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(dev_sequences))
print(dev_padded.shape)

780
(780, 57)


## Word2Vec Model

The first approach to use with the LSTM is Word2Vec. It's used as the pretrained, freezed embeddings to the embeddings layer. Firstly, the word2vec continous bag of words model is trained on the train dataset. The embeddings of the dataset's vocab are added to the embedding layer and freezed.

In [0]:
w2v_model = Word2Vec(size=100, min_count=2, window=5, iter=100)

In [0]:
w2v_model.build_vocab(train_df['Parsed'])
w2v_model.train(train_df['Parsed'], total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
w2v_model.wv.init_sims(replace=True)

In [0]:
w2v_model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(10102, 100)

In [0]:
w2v_pretrained_weights = w2v_model.wv.syn0

vocab_size, emdedding_size = w2v_pretrained_weights.shape

print('Result embedding shape:', w2v_pretrained_weights.shape)

print('Checking similar words:')

for word in ['model', 'network', 'train', 'learn']:

  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in w2v_model.most_similar(word)[:8])

  print('  %s -> %s' % (word, most_similar))

Result embedding shape: (10102, 100)
Checking similar words:
  model -> bohr (0.45), informational (0.44), concept (0.43), economics (0.41), planetary (0.41), algorithm (0.40), acquisitive (0.39), rendition (0.38)
  network -> adenohypophysis (0.53), capillary (0.48), peritubular (0.48), buren (0.43), infundibulum (0.43), gyrus (0.42), mantle (0.42), anchor (0.41)
  train -> persuade (0.45), rebel (0.44), postdoctoral (0.43), volunteer (0.42), toilet (0.41), weaponry (0.41), nurse (0.41), online (0.41)
  learn -> associative (0.53), habituation (0.49), earlier (0.48), helplessness (0.47), powerless (0.46), tolman (0.45), thought (0.45), biopsychology (0.44)


  """Entry point for launching an IPython kernel.
  # This is added back by InteractiveShellApp.init_path()
  if np.issubdtype(vec.dtype, np.int):


In [0]:
def word2idx(word):
  return word_model.wv.vocab[word].index

def idx2word(idx):
  return word_model.wv.index2word[idx]

## Glove Pretrained Embeddings

The second approach used with LSTM is using Glove's embeddings as the fixed embeddings of the embeddings layer.GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. The network is used with freezing the weights and with training them to experiment on the different results.

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-01-17 19:17:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-01-17 19:17:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-01-17 19:17:08--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
word, i = zip(*word_index.items())

In [0]:
embeddings_index = {}
f = open(GLOVE_DIR)
print('Loading GloVe from:', GLOVE_DIR,'...', end='')
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()
print("Done.\n Proceeding with Embedding Matrix...", end="")

glove_embedding_matrix = np.random.random((len(word_index), EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        glove_embedding_matrix[i-1] = embedding_vector
print(" Completed!")

Loading GloVe from: glove.6B.100d.txt ...Done.
 Proceeding with Embedding Matrix... Completed!


## Model Defintition

In [0]:
def get_lstm_embeddings_model(weights=[], weights_trainable=False):
  model = Sequential()
  if weights != []:
    weights = [weights]
  model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, weights=weights, trainable=weights_trainable))
  model.add(Bidirectional(LSTM(units=EMBEDDING_DIM)))
  model.add(Dense(vocab_size, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [0]:
validation_index = int(len(dev_padded)/2)

## Training Using Our Word2Vec Pretrained Weights

In [0]:
num_epochs = 3
word2VecLSTMModel = get_lstm_embeddings_model(w2v_pretrained_weights, True)
history = word2VecLSTMModel.fit(train_padded, train_df['HasDef'].values, epochs=num_epochs, validation_data=(dev_padded[:validation_index], dev_df['HasDef'].values[:validation_index]), verbose=2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 16165 samples, validate on 390 samples
Epoch 1/3
16165/16165 - 21s - loss: 0.5794 - acc: 0.6993 - val_loss: 0.5235 - val_acc: 0.7590
Epoch 2/3
16165/16165 - 19s - loss: 0.4357 - acc: 0.8002 - val_loss: 0.5617 - val_acc: 0.7000
Epoch 3/3
16165/16165 - 20s - loss: 0.2923 - acc: 0.8767 - val_loss: 0.6748 - val_acc: 0.7051


### Inference

In [0]:
from sklearn.metrics import classification_report

predicted_labels_word2vec_lstm = [1 if x>0.5 else 0 for x in word2VecLSTMModel.predict(dev_padded[validation_index:])]
print('Dev classification report:\n {}'.format(classification_report(dev_df['HasDef'].values[validation_index:], predicted_labels_word2vec_lstm)))


Dev classification report:
               precision    recall  f1-score   support

           0       0.71      0.88      0.79       239
           1       0.70      0.42      0.53       151

    accuracy                           0.71       390
   macro avg       0.70      0.65      0.66       390
weighted avg       0.70      0.71      0.69       390



## Training Using Glove 6B 100 Dim Pretrained Weights

In [0]:
vocab_size = len(vocab)

### Trainable Weights

In [0]:
num_epochs = 3
glove_lstm_trainable_weights_model = get_lstm_embeddings_model(glove_embedding_matrix, True)
history = glove_lstm_trainable_weights_model.fit(train_padded, train_df['HasDef'].values, epochs=num_epochs, validation_data=(dev_padded[:validation_index], dev_df['HasDef'].values[:validation_index]), verbose=2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 16165 samples, validate on 390 samples
Epoch 1/3
16165/16165 - 22s - loss: 0.6021 - acc: 0.6864 - val_loss: 0.5650 - val_acc: 0.7000
Epoch 2/3
16165/16165 - 20s - loss: 0.4949 - acc: 0.7613 - val_loss: 0.9451 - val_acc: 0.6923
Epoch 3/3
16165/16165 - 20s - loss: 0.3645 - acc: 0.8387 - val_loss: 0.7191 - val_acc: 0.7359


#### Inference

In [0]:
from sklearn.metrics import classification_report

predicted_labels_glove_lstm = [1 if x>0.5 else 0 for x in glove_lstm_trainable_weights_model.predict(dev_padded[validation_index:])]
print('Dev classification report:\n {}'.format(classification_report(dev_df['HasDef'].values[validation_index:], predicted_labels_glove_lstm)))


Dev classification report:
               precision    recall  f1-score   support

           0       0.66      0.96      0.78       239
           1       0.77      0.23      0.35       151

    accuracy                           0.67       390
   macro avg       0.72      0.59      0.57       390
weighted avg       0.70      0.67      0.61       390



### Non Trainable Weights

In [0]:
num_epochs = 3
glove_lstm_freezed_weights_model = get_lstm_embeddings_model(glove_embedding_matrix, False)
history = glove_lstm_freezed_weights_model.fit(train_padded, train_df['HasDef'].values, epochs=num_epochs, validation_data=(dev_padded[:validation_index], dev_df['HasDef'].values[:validation_index]), verbose=2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 16165 samples, validate on 390 samples
Epoch 1/3
16165/16165 - 20s - loss: 0.6222 - acc: 0.6617 - val_loss: 0.5924 - val_acc: 0.6949
Epoch 2/3
16165/16165 - 17s - loss: 0.5922 - acc: 0.6889 - val_loss: 0.5981 - val_acc: 0.7077
Epoch 3/3
16165/16165 - 17s - loss: 0.5457 - acc: 0.7280 - val_loss: 0.5753 - val_acc: 0.7256


#### Inference

In [0]:
from sklearn.metrics import classification_report

predicted_labels_glove_lstm_freezed_weights = [1 if x>0.5 else 0 for x in glove_lstm_freezed_weights_model.predict(dev_padded[validation_index:])]
print('Dev classification report:\n {}'.format(classification_report(dev_df['HasDef'].values[validation_index:], predicted_labels_glove_lstm_freezed_weights)))


Dev classification report:
               precision    recall  f1-score   support

           0       0.65      0.92      0.76       239
           1       0.61      0.21      0.31       151

    accuracy                           0.64       390
   macro avg       0.63      0.56      0.53       390
weighted avg       0.63      0.64      0.58       390

