## Import

In [1]:
# this is for google colab
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import os 

# tool_path = "/content/drive/Shared drives/Shared Task SentiMix/tools"

# import tools

# import tools.baseline as bt
# import tools.data as data_tools
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from gensim.models import Word2Vec, KeyedVectors
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, Bidirectional, Dropout
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.optimizers import Adam
from collections import defaultdict
import json

from keras.layers import *


Using TensorFlow backend.


In [3]:
def read_corpus(corpus_file):
    """read input document and return the textual articles
    and either the bias or hyperpartisan label"""

    with open(corpus_file) as json_file:
        data = json.load(json_file)

    data = pd.DataFrame(data)
    documents = data.sentences
    labels_bin = data.hyperp
    labels_mult = data.bias


    return documents, labels_bin, labels_mult

## Import Data

In [4]:
X, Y_bin, Y_mult = read_corpus('tokenised_full.json')

In [5]:
Xtrain, Xtest, Ytrain_bin, Ytest_bin, Ytrain_mult, Ytest_mult = train_test_split(X, 
                                                Y_bin, 
                                                Y_mult, 
                                                test_size = 0.2,
                                                stratify = Y_bin)

In [6]:
toki = Tokenizer()
toki.fit_on_texts(Xtrain)

In [7]:
Xtrain_seq = toki.texts_to_sequences(Xtrain)
Xtest_seq = toki.texts_to_sequences(Xtest)

In [8]:
print(Xtrain_seq[:5])

[[19, 3864, 6460, 2197, 3402, 788, 1, 8987, 10, 1422, 8, 1, 161, 119, 6, 196, 1, 122, 23, 35, 5059, 5, 6461, 52, 1, 85, 253, 3, 12, 388, 2475, 2, 55, 7775, 124, 776, 10, 1422, 2, 684, 6, 555, 478, 6, 1626, 4, 354, 1, 1625, 388, 19, 51, 4582, 27071, 181, 776, 10, 955, 3408, 5, 478, 11, 2305, 2738, 64, 23, 5094, 2, 10, 2241, 2, 9, 552, 220, 190, 3, 59, 183, 15, 7, 3003, 107, 36, 1334, 7, 376, 280, 5, 69, 42, 1, 8332, 10, 55, 11, 2249, 4, 1744, 261, 1410, 107, 102, 1153, 23, 916, 6, 655, 8, 1, 334, 373, 66, 45, 655, 8, 926, 42, 79, 127388, 1268, 5, 1, 114, 2, 6, 116, 310, 11, 3715, 7, 950, 2, 45, 115, 26, 11626, 20, 1, 804, 5, 1, 3655, 3219, 14, 11, 1, 164, 1179, 5, 214, 9, 11, 776, 1, 1147, 10, 7, 4617, 887, 45, 1382, 1, 1462, 124, 1341, 52, 1, 7084, 6189, 8, 600, 1828, 8, 20065, 50, 11, 387, 13, 7, 4617, 3, 1, 114, 108, 6, 5282, 8, 1, 114, 259, 2, 4617, 745, 11, 7, 127389, 2441, 33, 3, 5, 1, 51, 4123, 125, 9, 659, 114, 4617, 478, 6, 745, 3004, 2, 4515, 94, 540, 24, 4617, 242, 2, 3328, 9

In [9]:
word2index = toki.word_index
word2index['PAD'] = 0

In [10]:
index2word = toki.index_word
index2word[0] = 'PAD'
print(index2word[0])

PAD


In [11]:
#get max length of words

lens = [len(text) for text in Xtrain]

max_len = max(lens)
mean_len = np.mean(lens)

print(max_len, mean_len)

max_len = int(mean_len)

3209 606.142367111134


In [12]:
#pre-pad the tweets with value 0
# that is prepadding (Louis so you understand as well)
Xtrain_pad = pad_sequences(Xtrain_seq, maxlen = max_len)
Xtest_pad = pad_sequences(Xtest_seq, maxlen = max_len)

### Labels

In [13]:
Ytrain_bin[:3]

130349     True
31451     False
43541     False
Name: hyperp, dtype: bool

In [14]:
Ytrain_bin = np.asarray([0. if l == False else 1. for l in Ytrain_bin])
Ytest_bin = np.asarray([0. if l == False else 1. for l in Ytest_bin])

In [15]:
# Ytrain_bin = to_categorical(Ytrain_bin)
# Ytest_bin = to_categorical(Ytest_bin)

In [16]:
Ytest_bin[:3]

array([0., 0., 1.])

In [17]:
mult_labels = set(Ytrain_mult)

In [18]:
label_dict = defaultdict()
for i, l in enumerate(mult_labels):
    label_dict[l] = i
    
print(label_dict)

defaultdict(None, {'left-center': 0, 'right': 1, 'least': 2, 'right-center': 3, 'left': 4})


In [19]:
Ytrain_mult = [label_dict[label] for label in Ytrain_mult]
Ytest_mult = [label_dict[label] for label in Ytest_mult] 

# this is now a list of numbers

In [20]:
no_cls = len(label_dict)

Ytrain_mult_cat = np.asarray([to_categorical(label, num_classes = no_cls) for label in Ytrain_mult])
Ytest_mult_cat = np.asarray([to_categorical(label, num_classes = no_cls) for label in Ytest_mult])

In [21]:
print('Shape of data tensor:', Xtrain_pad.shape)
print('Shape of binary label tensor:', Ytrain_bin.shape)
print('Shape of multilabel tensor:', Ytrain_mult_cat.shape)

Shape of data tensor: (38773, 606)
Shape of binary label tensor: (38773,)
Shape of multilabel tensor: (38773, 5)


### Prepare the Embedding Layer

In [22]:
w2v_path = 'data/GoogleNews-vectors-negative300.bin'
embeddings = Word2Vec.load('model_all.bin')
embed_len = 100


#embed_w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [23]:
#w = filter(lambda x: x in model.vocab, list(model.wv.vocab))
#print model.most_similar(positive=w)

def load_embeddings(model, i2w, embed_len):
    
    index2embed = dict()
    w = filter(lambda x: x in word2index.keys(), list(embeddings.wv.vocab))

    for i, w in i2w.items():
        try:
            embed = model[w]
        except KeyError:
            embed = np.zeros(embed_len)
            #embed = embeddings.most_similar(positive=w)
        index2embed[i] = embed
    
    return index2embed

def load_w2v(model,i2w):
    index2emb = dict()
    
    for i, w in i2w.items():
        try:
            embed = model[w]
        except KeyError:
            embed = model['UNK']
            #embed = embeddings.most_similar(positive=w)
        index2embed[i] = embed
    
    return index2embed


In [24]:
#index2embed = loadw2v(embed_w2v, index2word)
index2embed = load_embeddings(embeddings, index2word, embed_len)

  # This is added back by InteractiveShellApp.init_path()


In [25]:
#compute embedding matrix

embedding_matrix = np.zeros((len(word2index) + 1, embed_len))
for word, i in word2index.items():
    embedding_vector = index2embed[i]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [26]:
#load embedding matrix into embedding layer


embedding_layer = Embedding(len(word2index) + 1,
                            embed_len,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

### Building the classifier - FFNN

In [27]:
lr = 0.001
batch = 1024
#embed_len = 100
activation = 'relu'
activation_output = 'softmax'
loss_function = 'categorical_crossentropy'
loss_bin = 'binary_crossentropy'

from keras.layers import Conv1D, GlobalMaxPool1D

In [28]:
filters = 250
kernel_size = 3

In [29]:
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

drop1 = Dropout(0.2)(embedded_sequences)

conv1 = Conv1D(filters,
              kernel_size,
              padding = 'valid',
              activation = 'relu',
              strides = 1)(drop1)

pool = GlobalMaxPool1D()(conv1)

dense1 = Dense(250, activation='relu')(pool)
drop2 = Dropout(0.2)(dense1)

output = Dense(1, activation = 'sigmoid')(drop2)


#flat = Flatten()(embedded_sequences)
#output_1 = Dense(200, activation='relu')(flat)
#drop = Dropout(0.4)(output_1)
#output_2 = Dense(64, activation='relu')(output_1)
#predictions = Dense(2, activation=activation_output)(output_2)

model = Model(inputs=sequence_input, outputs=output)

optimizer = Adam(lr = lr)

model.compile(loss=loss_bin, optimizer=optimizer, metrics=['accuracy'])

model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 606)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 606, 100)          24449300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 606, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 604, 250)          75250     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_______________________________

In [None]:
epochs = 20
history = model.fit(Xtrain_pad, Ytrain_bin, batch_size=batch, epochs=epochs, verbose=1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [32]:
predictions = model.predict(Xtest_pad)

In [36]:
predictions[:5]

array([[0.00331014],
       [0.03838509],
       [0.06277174],
       [0.9436791 ],
       [0.6695579 ]], dtype=float32)

In [39]:
pred_converted = [0 if p < 0.5 else 1 for p in predictions]
pred_converted[:5]

[0, 0, 0, 1, 1]

In [41]:
pred = np.argmax(predictions, axis=1)
#Ytest_converted = np.argmax(Ytest_bin, axis=1)

print(classification_report(Ytest_bin, pred_converted))

              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88      4813
         1.0       0.87      0.91      0.89      4881

    accuracy                           0.89      9694
   macro avg       0.89      0.89      0.89      9694
weighted avg       0.89      0.89      0.89      9694

