### Tutorial on Keras with Gensim
https://www.depends-on-the-definition.com/guide-to-word-vectors-with-gensim-and-keras/

- http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/
- https://github.com/keras-team/keras/issues/853
- http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://stats.stackexchange.com/questions/320701/how-to-use-keras-pre-trained-embedding-layer
- https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# from nltk.tokenize import WordPunctTokenizer
from collections import Counter

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.text import Tokenizer

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cmap
%matplotlib inline

import importlib
import utils
importlib.reload(utils)
import text_utils
importlib.reload(text_utils)

<module 'text_utils' from 'C:\\Users\\Thomas\\HPI\\Text Mining in Practice\\analysis\\text_utils.py'>

In [5]:
# Constants

OUTPUT_DIR = './week-6-plots'
ORIG_COMMENTS = '../data/guardian-all/sorted_comments-standardized-pol-all.csv'
TOKENIZED_COMMENTS = '../data/embedding/sorted_comments-standardized-tokenized.csv'

In [6]:
tokenized_comments = pd.read_csv(TOKENIZED_COMMENTS, ';')

FileNotFoundError: File b'../data/guardian-all/sorted_comments-standardized-tokenized.csv' does not exist

In [7]:
tokenized_comments.shape

NameError: name 'tokenized_comments' is not defined

#### Preprocess data

In [14]:
data_articles, data_articles_pol, data_authors, data_comments_pol = utils.load_data()

In [15]:
X = data_comments_pol['comment_text']
y = pd.get_dummies(['pos' if x > 2 else 'neg' for x in data_comments_pol['upvotes']]).values  # 2 -> ? vs. ?
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#### Load embedding

In [None]:
ft_model = text_utils.load_embedding()
word_vectors = ft_model.wv
EMBEDDING_DIM = 50
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

In [6]:
# Setup tokenizer
# tokenizer = WordPunctTokenizer()
# Use a counter for selecting the X most common words (therefore tokenize)
# vocab = Counter()
# comments = text_utils.process_comments(tokenizer, vocab, X, lower=True)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
index_dict = tokenizer.word_index

100%|████████████████████████████████████████████████████████| 665523/665523 [01:58<00:00, 5629.16it/s]


In [80]:
backtranslater = dict((i, w) for w, i in index_dict.items())

In [41]:
# assemble the embedding_weights in one numpy array
n_symbols = len(index_dict) + 1 # adding 1 to account for 0th index (for masking)
embedding_weights = np.zeros((n_symbols, EMBEDDING_DIM))
for word, index in index_dict.items():
    if word in word_vectors:
        embedding_weights[index, :] = word_vectors[word]

# define inputs here
# embedding_layer = Embedding(output_dim=vocab_dim, input_dim=n_symbols, weights=[embedding_weights], trainable=False)
# embedded = embedding_layer(input_layer)
# embedding_layer = model.wv.get_keras_embedding(train_embeddings=False)

In [42]:
word_vectors.most_similar_cosmul(positive=['president', 'german'])

[('president,', 0.708040177822113),
 ('presdident', 0.7070919871330261),
 ('presiden', 0.6998500227928162),
 ('president,a', 0.6993380784988403),
 ('foreiner', 0.697151243686676),
 ('Bundespresident', 0.6964511275291443),
 ('presidento', 0.6927176117897034),
 ('indident', 0.6924108266830444),
 ('Bundespraesident', 0.6900625228881836),
 ('president,but', 0.6889559626579285)]

In [43]:
word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('queen', 0.9264782071113586),
 ('regnant', 0.901670515537262),
 ('king/queen', 0.8998395800590515),
 ('monarhy', 0.8881024718284607),
 ('royal', 0.8818458914756775),
 ('regent', 0.8790533542633057),
 ('virgina', 0.8765912652015686),
 ('monarch', 0.8760467171669006),
 ('empress', 0.8739269375801086),
 ('prince', 0.8708903789520264)]

#### Pad/Cut tokenized comments to a certain length

In [68]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200
train_size = len(X_train)
most_common = sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True)
# word_index tells ous the
X_train, X_test, word_index = text_utils.pad_or_cut_tokenized_comments(
    most_common, comments, train_size, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)

Shape of training data tensor: (532418, 200)
Shape of test_data tensor: (133105, 200)


In [70]:
# https://codekansas.github.io/blog/2016/gensim.html
WV_DIM = 50
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
wv_matrix = word_vectors.syn0 #  text_utils.get_weights_matrix(word_index, word_vectors, nb_words, WV_DIM)

#### Prepare Keras Model

In [71]:
embed_dim = 128
lstm_out = 64
batch_size = 16

model = Sequential()
model.add(Embedding(nb_words,
                    WV_DIM,
                    mask_zero=False,
                    weights=[wv_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# model.add(Embedding(vocabulary, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.1))
# model.add(LSTM(lstm_out, return_sequences=True, recurrent_dropout=0.3, dropout=0.3))
model.add(LSTM(lstm_out, recurrent_dropout=0, dropout=0.1))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [72]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 50)           64307550  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 50)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 64,337,120
Trainable params: 29,570
Non-trainable params: 64,307,550
_________________________________________________________________


In [74]:
epochs = 20
batch_size = 32

# Here we train the Network.
try:
    history = model.fit([X_train[:50000]], y_train[:50000], validation_split=0.1,
                        batch_size = batch_size, epochs = epochs,
                        verbose = 2, shuffle=True)
except KeyboardInterrupt:
    print("Fitting stopped manually")

Train on 45000 samples, validate on 5000 samples
Epoch 1/20
 - 379s - loss: 0.6946 - acc: 0.5004 - val_loss: 0.6934 - val_acc: 0.5014
Epoch 2/20
 - 171s - loss: 0.6933 - acc: 0.5081 - val_loss: 0.6941 - val_acc: 0.4850
Epoch 3/20
 - 184s - loss: 0.6930 - acc: 0.5076 - val_loss: 0.6927 - val_acc: 0.5166
Epoch 4/20
 - 188s - loss: 0.6928 - acc: 0.5095 - val_loss: 0.6943 - val_acc: 0.4840
Epoch 5/20
 - 191s - loss: 0.6925 - acc: 0.5148 - val_loss: 0.6935 - val_acc: 0.5096
Epoch 6/20
 - 193s - loss: 0.6921 - acc: 0.5148 - val_loss: 0.6932 - val_acc: 0.5148
Epoch 7/20
 - 184s - loss: 0.6916 - acc: 0.5204 - val_loss: 0.6948 - val_acc: 0.5086
Epoch 8/20
 - 168s - loss: 0.6910 - acc: 0.5231 - val_loss: 0.6960 - val_acc: 0.5072
Epoch 9/20
 - 148s - loss: 0.6903 - acc: 0.5282 - val_loss: 0.6961 - val_acc: 0.5076
Epoch 10/20
 - 144s - loss: 0.6887 - acc: 0.5340 - val_loss: 0.6982 - val_acc: 0.5076
Epoch 11/20
 - 145s - loss: 0.6874 - acc: 0.5364 - val_loss: 0.6992 - val_acc: 0.5066
Epoch 12/20
 -

In [94]:
def inspect_preprocessed_comment(data_comments_pol, X_train, backtranslater, idx):
    print('Original:')
    print(data_comments_pol.iloc[idx])
    print('\nAfter preprocessing (& backtranslating):')
    print(' '.join([backtranslater[x] for x in X_train[idx] if x in backtranslater]))

inspect_preprocessed_comment(data_comments_pol, X_train, backtranslater, 0)

Original:
Unnamed: 0                                                      159895
article_id                                                        1731
author_id                                                        10860
comment_id                                                    61083280
comment_text         Actually it has - it was last night, I was the...
timestamp                                         2015-10-09T08:30:36Z
parent_comment_id                                          6.10705e+07
upvotes                                                              2
Name: 0, dtype: object

After preprocessing (& backtranslating):
actually it has it was last night i was there unless he ' s going to give another speech at the university of sheffield in which he says these things


In [None]:
utils.plot_history(history)

In [None]:
# Measuring score and accuracy on test set

score, acc = model.evaluate([X:test], y_test, verbose = 2,
                            batch_size = batch_size)
print("Logloss score: %.2f" % (score))
print("Test set Accuracy: %.2f" % (acc))

In [None]:
plt.hist(data_comments_pol['comment_text'].str.split().len)