### Tutorial on Keras with Gensim
https://www.depends-on-the-definition.com/guide-to-word-vectors-with-gensim-and-keras/

In [35]:
import datetime
import os
import re
import urllib
import sys
import math
import numpy as np
import pandas as pd
from scipy.stats import describe
from sklearn.model_selection import train_test_split

from nltk.tokenize import WordPunctTokenizer
from collections import Counter
from string import punctuation, ascii_lowercase
from tqdm import tqdm

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cmap
%matplotlib inline

from wordcloud import WordCloud
import networkx as nx
from gensim.models import KeyedVectors

import utils

In [2]:
# Constants

OUTPUT_DIR = './week-6-plots'

#### Preprocess data

In [3]:
data_articles, data_articles_pol, data_authors, data_comments_pol = utils.load_data()

In [4]:
X = data_comments_pol['comment_text']
y = data_comments_pol['upvotes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [42]:
y_test = ['pos' if x > 2 else 'neg' for x in y_test]  # 2 -> ? vs. ?
y_test = pd.get_dummies(y_test).values

In [43]:
y_train = ['pos' if x > 2 else 'neg' for x in y_train]  # 2 -> ? vs. ?
y_train = pd.get_dummies(y_train).values

In [5]:
# We have no NaN words but it could be useful
list_sentences_train = list(X_train.fillna("NAN_WORD").values)
list_sentences_test = list(X_test.fillna("NAN_WORD").values)

In [18]:
# Replace urls
re_url = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\
                    .([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                    re.MULTILINE|re.UNICODE)
# Replace ips
re_ip = re.compile("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")


# Setup tokenizer
tokenizer = WordPunctTokenizer()

# Use a counter for selecting the X most common words (therefore tokenize)
vocab = Counter()

def text_to_wordlist(text, lower=False):
    # Looks like all URLs are removed
    text = re_url.sub("URL", text)
    
    # But there some IPs we'd like to replace
    text = re_ip.sub("IPADDRESS", text)
    
    # Tokenize
    text = tokenizer.tokenize(text)
    
    # optional: lower case
    if lower:
        text = [t.lower() for t in text]
    
    # Return a list of words
    vocab.update(text)
    
    # Return a list of words
    return text

def process_comments(list_sentences, lower=False):
    comments = []
    for text in tqdm(list_sentences):
        txt = text_to_wordlist(text, lower=lower)
        comments.append(txt)
    return comments

comments = process_comments(list_sentences_train + list_sentences_test, lower=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 665523/665523 [05:15<00:00, 2108.62it/s]


#### Load embedding model

In [9]:
ft_model = utils.load_embedding()

Loading embeddings...


In [10]:
word_vectors = ft_model.wv

In [11]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 1286151


In [12]:
word_vectors.most_similar_cosmul(positive=['president', 'german'])

[('president,', 0.708040177822113),
 ('presdident', 0.7070919871330261),
 ('presiden', 0.6998500227928162),
 ('president,a', 0.6993380784988403),
 ('foreiner', 0.697151243686676),
 ('Bundespresident', 0.6964511275291443),
 ('presidento', 0.6927176117897034),
 ('indident', 0.6924108266830444),
 ('Bundespraesident', 0.6900625228881836),
 ('president,but', 0.6889559626579285)]

In [13]:
word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('queen', 0.9264782071113586),
 ('regnant', 0.901670515537262),
 ('king/queen', 0.8998395800590515),
 ('monarhy', 0.8881024718284607),
 ('royal', 0.8818458914756775),
 ('regent', 0.8790533542633057),
 ('virgina', 0.8765912652015686),
 ('monarch', 0.8760467171669006),
 ('empress', 0.8739269375801086),
 ('prince', 0.8708903789520264)]

#### Pad/Cut tokenized comments to a certain length

In [14]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

In [19]:
word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NB_WORDS))}
sequences = [[word_index.get(t, 0) for t in comment]
             for comment in comments[:len(list_sentences_train)]]
test_sequences = [[word_index.get(t, 0)  for t in comment] 
                  for comment in comments[len(list_sentences_train):]]

# pad
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_label.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)


Shape of data tensor: (532418, 200)
Shape of label tensor: (665523,)
Shape of test_data tensor: (133105, 200)


In [20]:
WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass

#### Prepare Keras Model

In [36]:
embed_dim = 128
lstm_out = 64
batch_size = 16

model = Sequential()
model.add(Embedding(nb_words,
                    WV_DIM,
                    mask_zero=False,
                    weights=[wv_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# model.add(Embedding(vocabulary, embed_dim, input_length = X.shape[1]))
# model.add(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'))
model.add(SpatialDropout1D(0.1))
# model.add(LSTM(lstm_out, return_sequences=True, recurrent_dropout=0.3, dropout=0.3))
model.add(LSTM(lstm_out, recurrent_dropout=0, dropout=0.1))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [37]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          128615100 
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 200, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 128,657,470
Trainable params: 42,370
Non-trainable params: 128,615,100
_________________________________________________________________


In [None]:
epochs = 20
batch_size = 32

# Here we train the Network.
try:
    history = model.fit([data[:50000]], y_train[:50000], validation_split=0.1,
                        batch_size = batch_size, epochs = epochs,
                        verbose = 2, shuffle=True)
except KeyboardInterrupt:
    print("Fitting stopped manually")

Train on 45000 samples, validate on 5000 samples
Epoch 1/20
 - 208s - loss: 0.6897 - acc: 0.5296 - val_loss: 0.6865 - val_acc: 0.5410
Epoch 2/20
 - 212s - loss: 0.6853 - acc: 0.5505 - val_loss: 0.6953 - val_acc: 0.5052
Epoch 3/20
 - 212s - loss: 0.6863 - acc: 0.5458 - val_loss: 0.6842 - val_acc: 0.5536
Epoch 4/20
 - 165s - loss: 0.6843 - acc: 0.5546 - val_loss: 0.6857 - val_acc: 0.5452
Epoch 5/20
 - 138s - loss: 0.6829 - acc: 0.5574 - val_loss: 0.6847 - val_acc: 0.5610
Epoch 6/20
 - 136s - loss: 0.6820 - acc: 0.5595 - val_loss: 0.6861 - val_acc: 0.5490
Epoch 7/20
 - 149s - loss: 0.6805 - acc: 0.5660 - val_loss: 0.6860 - val_acc: 0.5540
Epoch 8/20


In [None]:
plot_history(history)

In [None]:
# Measuring score and accuracy on test set

score, acc = model.evaluate(X_test, y_test, verbose = 2,
                            batch_size = batch_size)
print("Logloss score: %.2f" % (score))
print("Test set Accuracy: %.2f" % (acc))