### Tutorial on Keras with Gensim
https://www.depends-on-the-definition.com/guide-to-word-vectors-with-gensim-and-keras/

- http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/
- https://github.com/keras-team/keras/issues/853
- http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://stats.stackexchange.com/questions/320701/how-to-use-keras-pre-trained-embedding-layer
- https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from nltk.tokenize import WordPunctTokenizer
from collections import Counter

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cmap
%matplotlib inline

import importlib
import utils
importlib.reload(utils)
import text_utils
importlib.reload(text_utils)

<module 'text_utils' from 'C:\\Users\\Thomas\\HPI\\Text Mining in Practice\\analysis\\text_utils.py'>

In [29]:
# Constants

OUTPUT_DIR = './week-7-plots'

SRC_ENHANCED_COMMENTS = '../data/pol/comments-root-all-pol-enhanced.csv'

SRC_GENSIM_EMBEDDING = '../data/embedding/gensim-guardian-comments-50-tokenized.bin'

#### Preprocess data

In [21]:
comments = pd.read_csv(SRC_ENHANCED_COMMENTS)

In [22]:
X = comments['comment_text']
y = to_categorical(comments['bin'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#### Load embedding

In [34]:
ft_model = text_utils.load_embedding()
word_vectors = ft_model.wv
EMBEDDING_DIM = 50
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Loading embeddings...
Number of word vectors: 1286151


In [37]:
# Setup tokenizer
tokenizer = WordPunctTokenizer()
# Use a counter for selecting the X most common words (therefore tokenize)
vocab = Counter()
comments = text_utils.process_comments(tokenizer, vocab, X, lower=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:20<00:00, 4959.65it/s]


In [42]:
vocab

Counter({'i': 56393,
         '’': 6603,
         've': 3994,
         'not': 37931,
         'really': 6893,
         'been': 12181,
         'payng': 2,
         'too': 5986,
         'close': 985,
         'attention': 521,
         ',': 201618,
         'but': 26776,
         'have': 40745,
         'got': 4014,
         'this': 41349,
         'right': 8407,
         'so': 21028,
         'far': 3717,
         '?': 39091,
         'the': 340201,
         'european': 1708,
         'union': 2525,
         'is': 89643,
         'mistaken': 74,
         'when': 12595,
         'it': 71883,
         'says': 2750,
         'scotland': 7068,
         'would': 21443,
         'to': 177810,
         'apply': 414,
         'join': 807,
         'spaniard': 2,
         'barrosso': 2,
         'lying': 795,
         'he': 35710,
         'points': 748,
         'out': 15447,
         'spain': 219,
         'likely': 1627,
         'veto': 134,
         'attemtps': 1,
         'eu': 9204,
   

In [45]:
backtranslater = dict((i, w) for w, i in vocab.items())  # Tokenizer -> .word_index.items()

In [None]:
# assemble the embedding_weights in one numpy array
n_symbols = len(index_dict) + 1 # adding 1 to account for 0th index (for masking)
embedding_weights = np.zeros((n_symbols, EMBEDDING_DIM))
for word, index in index_dict.items():
    if word in word_vectors:
        embedding_weights[index, :] = word_vectors[word]

# define inputs here
# embedding_layer = Embedding(output_dim=vocab_dim, input_dim=n_symbols, weights=[embedding_weights], trainable=False)
# embedded = embedding_layer(input_layer)
# embedding_layer = model.wv.get_keras_embedding(train_embeddings=False)

In [None]:
word_vectors.most_similar_cosmul(positive=['president', 'german'])

In [None]:
word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

#### Pad/Cut tokenized comments to a certain length

In [None]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200
train_size = len(X_train)
most_common = sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True)
# word_index tells ous the
X_train, X_test, word_index = text_utils.pad_or_cut_tokenized_comments(
    most_common, comments, train_size, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH)

In [None]:
# https://codekansas.github.io/blog/2016/gensim.html
WV_DIM = 50
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
wv_matrix = word_vectors.syn0 #  text_utils.get_weights_matrix(word_index, word_vectors, nb_words, WV_DIM)

#### Prepare Keras Model

In [None]:
embed_dim = 128
lstm_out = 64
batch_size = 16

model = Sequential()
model.add(Embedding(nb_words,
                    WV_DIM,
                    mask_zero=False,
                    weights=[wv_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
# model.add(Embedding(vocabulary, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.1))
# model.add(LSTM(lstm_out, return_sequences=True, recurrent_dropout=0.3, dropout=0.3))
model.add(LSTM(lstm_out, recurrent_dropout=0, dropout=0.1))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [None]:
model.summary()

In [None]:
epochs = 20
batch_size = 32

# Here we train the Network.
try:
    history = model.fit([X_train[:50000]], y_train[:50000], validation_split=0.1,
                        batch_size = batch_size, epochs = epochs,
                        verbose = 2, shuffle=True)
except KeyboardInterrupt:
    print("Fitting stopped manually")

In [None]:
def inspect_preprocessed_comment(data_comments_pol, X_train, backtranslater, idx):
    print('Original:')
    print(data_comments_pol.iloc[idx])
    print('\nAfter preprocessing (& backtranslating):')
    print(' '.join([backtranslater[x] for x in X_train[idx] if x in backtranslater]))

inspect_preprocessed_comment(data_comments_pol, X_train, backtranslater, 0)

In [None]:
utils.plot_history(history)

In [None]:
# Measuring score and accuracy on test set

score, acc = model.evaluate([X:test], y_test, verbose = 2,
                            batch_size = batch_size)
print("Logloss score: %.2f" % (score))
print("Test set Accuracy: %.2f" % (acc))

In [None]:
plt.hist(data_comments_pol['comment_text'].str.split().len)