In [2]:
# Select the Keras backend BEFORE importing keras: Keras reads the KERAS_BACKEND
# environment variable when the package is imported, so assigning it afterwards has no effect.
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import keras
import sys
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Activation, Dropout

import torch

from allennlp.commands.elmo import ElmoEmbedder
import scipy
# Imported explicitly because later cells call scipy.spatial.distance.cosine; on some SciPy
# versions a bare `import scipy` does not load the subpackage.
import scipy.spatial.distance

In [9]:
# Quick sanity check: embed two phrases with ELMo and compare them by cosine distance.
# ElmoEmbedder is created with its default arguments here.
elmo = ElmoEmbedder()

# Earlier variant of the experiment, kept for reference (not executed):
# token1 = ["data mining"]
# token2 = ["zxcxzc"]

# vector1 = elmo.embed_sentence(token1)
# vector2 = elmo.embed_sentence(token2)

# scipy.spatial.distance.cosine(vector1[0], vector2[1])

# NOTE(review): each phrase is passed as ONE list element, i.e. as a single token that
# contains a space — confirm this is intended rather than ["data", "mining"].
tokens = ["data mining"]
vectors = elmo.embed_sentence(tokens)

vectors2 = elmo.embed_sentence(["aerospace engineering"])
# Index [2] picks the last entry along the first axis (the last ELMo layer), [0] the first token.
scipy.spatial.distance.cosine(vectors[2][0], vectors2[2][0]) # cosine distance between "data mining" and "aerospace engineering" in the last layer



0.3859228491783142

In [11]:
# Environment check. torch is already imported in the first cell; the re-import is harmless.
import torch

# print (torch.__version__)
# Display the CUDA version string reported by the installed torch build.
torch.version.cuda

'9.0.176'

In [2]:
def load_directory_data(directory, label):
    """Read every file in ``directory`` into a list of labelled records.

    Args:
        directory: Path of the folder whose entries are read.
        label: Value stored under the "label" key of every record.

    Returns:
        A list with one ``{"text": ..., "label": label}`` dict per directory entry,
        ordered by file name. Every "<br />" in a file's contents is replaced by a space.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting makes the result (and any
    # seeded shuffle applied to it later) reproducible across machines and runs.
    for file_name in sorted(os.listdir(directory)):
        # Decode explicitly as UTF-8 (the review files are presumably UTF-8) instead of the
        # platform default encoding, which can fail or mis-decode non-ASCII text.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            data.append({"text": f.read().replace("<br />", " "), "label": label})
    return data

def load_dataset(directory):
    """Collect the labelled examples stored under ``directory``.

    The "pos" sub-folder is loaded with label 1 and the "neg" sub-folder with label 0.
    The positive records come first in the returned list.
    """
    examples = []
    for subfolder, label in (("pos", 1), ("neg", 0)):
        examples.extend(load_directory_data(os.path.join(directory, subfolder), label))
    return examples

def download_and_load_datasets(force_download=False):
    """Fetch the IMDB review archive through Keras and load its train and test splits.

    Args:
        force_download: Accepted by the signature but never read by the body.

    Returns:
        A ``(train_data, test_data)`` tuple: the results of ``load_dataset`` on the
        "aclImdb/train" and "aclImdb/test" folders located in the directory of the path
        returned by ``keras.utils.get_file``.
    """
    archive_path = keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    # The extracted "aclImdb" folder is looked up beside the path that get_file returned.
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_split = load_dataset(os.path.join(corpus_root, "train"))
    test_split = load_dataset(os.path.join(corpus_root, "test"))
    return train_split, test_split

train_data, test_data = download_and_load_datasets()

# Shuffle with a dedicated, seeded generator so the document order is identical on every run.
# Later cells only embed the leading documents of each list, so an unseeded shuffle would
# silently change which reviews are used for training and evaluation.
shuffle_rng = random.Random(42)
shuffle_rng.shuffle(train_data)
shuffle_rng.shuffle(test_data)

In [4]:
# Display the first training record (a dict with "text" and "label" keys).
train_data[0]

{'text': 'The first "side-story" in the universal century Gundam universe presents a refreshing new look at the war between earth and the space colonies. The focus is no longer on a small group of individuals who would go on to play pivotal roles in the conflict, but on the everyday civilian population and how the war is seen through their eyes.  The story does contain some Gundam staples, its premise being the attempts by a ZEON squad to capture an experimental Gundam, but it the execution of the plot that made this show so interesting to watch. This series focuses on the experiences of a young boy named Alfred and the relationship between his neighbor, Christina Mckenzie who is secretly a Federation pilot and a newbie Zeon pilot named Bernie Wiseman. Alfred develops a sort of "brotherly love" for Bernie while our young Zeon pilot also falls for Christina.  "War in the Pocket" proves that you do not need a sweeping epic tale about special individuals to make for a good war story. Ther

In [3]:
def tokenize_text(documents, max_tokens):
    """Attach a truncated token list to every document, in place.

    Each document's 'text' is split with Keras' ``text_to_word_sequence`` (``lower=False``)
    and the slice ``[0:max_tokens]`` of the result is stored under the 'tokens' key.
    Note: it would be better to split the text into sentences first.
    """
    for doc in documents:
        words = keras.preprocessing.text.text_to_word_sequence(doc['text'], lower=False)
        doc['tokens'] = words[:max_tokens]
 
# Number of leading tokens kept per document. pad_x_matrix also reads this global
# when it pads the embedding matrices.
max_tokens = 100
# Adds a 'tokens' entry to every record of both splits (the dicts are modified in place).
tokenize_text(train_data, max_tokens)
tokenize_text(test_data, max_tokens)

In [8]:
# Print the first training record, which now also carries its 'tokens' list.
print (train_data[0])

{'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.  One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).  One might better spend one's time staring out a window at a tree growing.  ", 'label': 0, 'tokens': ['If', 'only', 'to', 'avoid', 'making', 'this', 'type', 'of', 'film', 'in', 'the', 'future', 'This', 'film', 'is', 'interesting', 'as', 'an', 'experiment', 'but', 'tells', 'no', 'cogent', 'story', 'One', 'might', 'feel', 'virtuous', 'for', 'sitting', 'thru', 'it', 'because', 'it', 'touches', 'on', 'so', 'many', 'IMPORTANT', 'issues', 'but', 'it', 'does', 'so', 'without', 'any', 'discernable', 'motive', 'The', 'viewer', 'comes', 'away', 'with', 'no', 'new', 'persp

In [12]:
def create_elmo_embeddings(elmo, documents, max_sentences = 1000):
    """Compute one averaged ELMo matrix per document, together with its label.

    The embeddings are looked up once here so that later cells can reuse them without
    running ELMo again.

    Args:
        elmo: Embedder exposing ``embed_sentences(list_of_token_lists)``, which yields one
            array per input. The first axis of each array (the ELMo layers) is averaged away.
        documents: Sequence of dicts, each with a 'tokens' list and a 'label'.
        max_sentences: Maximum number of leading documents to embed. A value <= 0 means
            "embed every document".

    Returns:
        ``(embeddings, labels)``: two lists of equal length holding, per processed document,
        the layer-averaged embedding array and the document's label.
    """
    num_sentences = min(max_sentences, len(documents)) if max_sentences > 0 else len(documents)
    print("\n\n:: Lookup of "+str(num_sentences)+" ELMo representations. This takes a while ::")

    # Hand the embedder only the documents that are actually needed. Passing the full corpus
    # and breaking out of the loop makes it embed unused documents in its final batch.
    selected = documents[:num_sentences]
    tokens = [document['tokens'] for document in selected]

    embeddings = []
    labels = []
    # zip keeps each embedding paired with its source document without a manual index.
    for done, (document, elmo_embedding) in enumerate(zip(selected, elmo.embed_sentences(tokens)), start=1):
        # Average the layers returned from ELMo (axis 0)
        embeddings.append(np.average(elmo_embedding, axis=0))
        labels.append(document['label'])

        # Some progress info, redrawn on the same line via '\r'
        percent = 100.0 * done / num_sentences
        line = '[{0}{1}]'.format('=' * int(percent / 2), ' ' * (50 - int(percent / 2)))
        status = '\r{0:3.0f}%{1} {2:3d}/{3:3d} sentences'
        sys.stdout.write(status.format(percent, line, done, num_sentences))
        # Flush so the bar is visible while the loop is still running.
        sys.stdout.flush()

    return embeddings, labels


# Run ELMo on GPU 0 when CUDA is available, otherwise fall back to the CPU (cuda_device=-1),
# so the cell also works on machines without a GPU. Change the 0 to use a different GPU.
cuda_device = 0 if torch.cuda.is_available() else -1
elmo = ElmoEmbedder(cuda_device=cuda_device)
# Only the first 1000 documents of each split are embedded, to keep the runtime manageable.
train_x, train_y = create_elmo_embeddings(elmo, train_data, 1000)
test_x, test_y  = create_elmo_embeddings(elmo, test_data, 1000)



:: Lookup of 1000 ELMo representations. This takes a while ::

:: Lookup of 1000 ELMo representations. This takes a while ::

In [7]:
# Number of embedded training documents.
print (len(train_x))

1000


In [25]:
def pad_x_matrix(x_matrix, max_length=None):
    """Stack per-sentence embedding matrices into one zero-padded float32 array.

    Args:
        x_matrix: Sequence of 2-D arrays shaped (num_tokens, dim); num_tokens may differ
            between entries. The input is not modified.
        max_length: Size of the token axis of the result. Defaults to the notebook-level
            ``max_tokens`` setting when omitted.

    Returns:
        A float32 array of shape (len(x_matrix), max_length, dim). Shorter sentences are
        padded with zeros at the end; longer ones are truncated to ``max_length`` rows.
    """
    if max_length is None:
        # Backward compatible default: the global token budget used for tokenization.
        max_length = max_tokens

    sentences = [np.asarray(sentence, dtype=np.float32) for sentence in x_matrix]
    # Take the embedding width from the first proper 2-D entry (0 if there is none).
    dim = next((s.shape[1] for s in sentences if s.ndim == 2), 0)

    # Pre-allocating the result avoids mutating the caller's list and avoids the ragged
    # array that results when one sentence is longer than max_length.
    matrix = np.zeros((len(sentences), max_length, dim), dtype=np.float32)
    for row, sentence_vec in enumerate(sentences):
        if sentence_vec.size == 0:
            continue  # nothing to copy; the row stays all zeros
        length = min(sentence_vec.shape[0], max_length)
        matrix[row, :length] = sentence_vec[:length]
    return matrix

# Turn the per-document embedding lists and label lists into dense arrays for Keras.
train_x = pad_x_matrix(train_x)
train_y = np.array(train_y)

test_x = pad_x_matrix(test_x)
test_y = np.array(test_y)

print("Shape Train X:", train_x.shape)
# Label corrected: this line prints the shape of test_x, not test_y.
print("Shape Test X:", test_x.shape)

Shape Train X: (1000, 100, 1024)
Shape Test Y: (1000, 100, 1024)


In [8]:
# Sentence / document classifier: 1-D convolution over the token axis, global max pooling,
# then a small dense head with a single sigmoid output.
model = Sequential([
    Conv1D(filters=250, kernel_size=3, padding='same'),
    GlobalMaxPooling1D(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the reported validation
# accuracy is not an independent estimate if it is used to tune the model.
model.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=10,
    validation_data=(test_x, test_y),
)

Train on 1000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efc57f135c0>