## Import

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that files on the
# drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# importlib is part of the Python 3 standard library, so there is nothing to
# install here. The PyPI project that is also called "importlib" is an obsolete
# backport for very old interpreters and should not be installed on top of it.


In [None]:
import importlib.util
import sys
from importlib.machinery import SourceFileLoader


def _load_source_module(name, path):
    """Import the Python file at ``path`` and return it as a module named ``name``.

    ``SourceFileLoader.load_module()`` is deprecated, so the module is created
    from a spec and run with ``exec_module()`` instead. As ``load_module()``
    did, the module is registered in ``sys.modules`` under ``name``.

    Args:
        name: module name to register the loaded file under.
        path: file system path of the ``.py`` file to execute.

    Returns:
        The executed module object.
    """
    loader = SourceFileLoader(name, path)
    spec = importlib.util.spec_from_loader(name, loader)
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module registered after a failure.
        sys.modules.pop(name, None)
        raise
    return module


# The project helpers live on the shared drive rather than in an installed package.
bt = _load_source_module('baseline', "/content/drive/Shared drives/Shared Task SentiMix/tools/baseline.py")
data_tools = _load_source_module('data_tools', "/content/drive/Shared drives/Shared Task SentiMix/tools/data.py")

In [2]:
import os 

# tool_path = "/content/drive/Shared drives/Shared Task SentiMix/tools"

# import tools

# import tools.baseline as bt
# import tools.data as data_tools
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, Bidirectional, Dropout
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.optimizers import Adam
from collections import defaultdict
import json

from keras.layers import *


Using TensorFlow backend.


In [3]:
def read_corpus(corpus_file, use_binary):
    """Load the JSON corpus and return its documents with one label column.

    Args:
        corpus_file: path to a JSON file whose content ``pandas.DataFrame``
            can turn into a table with a ``sentences`` column and the
            requested label column (``hyperp`` or ``bias``).
        use_binary: label selector. ``0`` returns the ``hyperp`` column; any
            other value returns the ``bias`` column.

    Returns:
        A ``(documents, labels)`` tuple of pandas Series that share one index.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding, which differs between systems.
    with open(corpus_file, encoding='utf-8') as json_file:
        data = pd.DataFrame(json.load(json_file))

    label_column = 'hyperp' if use_binary == 0 else 'bias'
    return data['sentences'], data[label_column]

## Import Data

In [4]:
# Load the tokenised articles together with their labels; use_binary=0 makes
# read_corpus return the `hyperp` column.
X, Y = read_corpus(corpus_file='tokenised_full.json', use_binary=0)

In [5]:
# Preview the loaded documents; each entry is a list of tokens.
print(X)

27990                                                    []
80593     [the, las, cruces, sun-news, reported, that, t...
93952     [this, post, first, appeared, at, the, america...
88847     [pasquale, ?, pat, ?, d, ?, arco, passed, away...
91146     [jan, 25, (, ), -, wolong, real, estate, group...
                                ...                        
210143    [june, 6, is, a, very, special, day, in, ameri...
206861    [british, police, say, their, investigation, i...
184344    [msnbc, ?, s, joe, scarborough, told, his, aud...
187353    [a, north, carolina, man, is, suing, philadelp...
125208    [states, remained, stingy, in, funding, public...
Name: sentences, Length: 48467, dtype: object


In [6]:
# Hold out 20% of the articles for evaluation, keeping the label proportions
# equal in both parts (stratify). random_state pins the shuffle so that a
# "Restart & Run All" reproduces the same split and therefore comparable scores.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [7]:
# Build the word -> integer index from the training articles only, so the
# vocabulary is not influenced by the test split.
toki = Tokenizer()
toki.fit_on_texts(Xtrain)

In [8]:
# Encode both splits as lists of vocabulary indices with the fitted tokenizer.
Xtrain_seq, Xtest_seq = (
    toki.texts_to_sequences(split) for split in (Xtrain, Xtest)
)

In [9]:
# Sanity check: the first five training articles as lists of vocabulary indices.
print(Xtrain_seq[:5])

[[1847, 1069, 39, 38, 209, 30780, 40650, 34, 583, 58366, 1363, 127255, 1332, 698, 533, 1158, 90, 6, 321, 632, 583, 19817, 127256, 2899, 533, 1158, 90, 2, 383, 268, 603, 21, 193, 17, 703, 1014, 603, 583, 14618, 127257, 7684, 30780, 127258, 3, 44782, 15, 533, 1158, 90, 2, 372, 2157, 2, 2003, 2, 350, 1954, 6, 1673, 81, 487, 34, 1, 626, 478, 616, 39, 38, 209, 71, 1610, 634, 321, 7705, 9217, 143, 90, 386, 86, 94, 161, 9, 25, 11, 32, 7, 988, 5, 686, 2759, 1213, 3626, 3, 12, 353, 628, 2, 117, 4, 7, 429, 1759, 16, 1, 3323, 71, 1610, 634, 321, 7705, 9217, 2693, 20, 7, 136, 661, 16, 79, 179, 1092, 225, 20, 1, 514, 222, 4, 3753, 1478, 7706, 1189, 16, 4634, 1113, 3216, 3862, 204, 2, 8, 214, 2, 71, 2, 333, 1287, 2, 532, 13221, 13633, 70, 1, 473, 663, 2307, 16, 9217, 2, 86, 143, 3901, 9, 25, 18, 32, 8259, 4, 1647, 1, 11966, 5, 736, 324, 121, 25, 11, 32, 1, 988, 5, 3626, 3, 12, 3323, 1, 2307, 18, 100, 241, 19, 3475, 1, 514, 222, 143, 14, 215, 32, 754, 13, 3644, 16, 1, 90, 1, 221, 158, 127, 32, 1513, 

In [10]:
# Work on a copy: `toki.word_index` is the tokenizer's own dict, and adding the
# padding entry to it directly would silently change the tokenizer's state.
word2index = dict(toki.word_index)
word2index['PAD'] = 0  # index 0 is reserved for padding

In [11]:
# Copy before editing so the tokenizer's own `index_word` dict stays untouched.
index2word = dict(toki.index_word)
index2word[0] = 'PAD'  # reverse entry for the padding index
print(index2word[0])

PAD


In [12]:
# Article lengths, counted in tokens, over the training split.
lens = [len(text) for text in Xtrain]

longest_len = max(lens)
mean_len = np.mean(lens)

print(longest_len, mean_len)

# Sequences are padded/truncated to the *mean* article length (rounded down),
# not to the longest article. The maximum is kept under its own name so that
# `max_len` only ever means "the length fed to the model".
max_len = int(mean_len)

3336 605.8469037732443


In [13]:
# Bring every sequence to exactly `max_len` indices. Shorter articles are
# padded with 0 at the front; longer ones are cut at the front, so their last
# `max_len` tokens are kept. These are pad_sequences' defaults, spelled out.
Xtrain_pad = pad_sequences(Xtrain_seq, maxlen=max_len, padding='pre', truncating='pre')
Xtest_pad = pad_sequences(Xtest_seq, maxlen=max_len, padding='pre', truncating='pre')

In [14]:
# Peek at the first training labels before they are converted to class ids.
Ytrain[:5]

36486     False
189129     True
95226     False
195710     True
32290     False
Name: hyperp, dtype: bool

In [15]:
# Lookup from the string form of a boolean label to its integer class id.
label_dict_bin = {'True': 1, 'False': 0}

In [16]:
def _encode_labels(labels):
    """Return ``labels`` as a list of integer class ids from ``label_dict_bin``.

    A label whose string form is a key of ``label_dict_bin`` is translated.
    A value that already is a class id is kept as it is, so running this cell
    a second time does not fail with ``KeyError: '1'``.

    Raises:
        KeyError: for a label that is neither a known key nor a class id.
    """
    class_ids = set(label_dict_bin.values())
    encoded = []
    for label in labels:
        key = str(label)
        if key in label_dict_bin:
            encoded.append(label_dict_bin[key])
        elif label in class_ids:
            # Already encoded by an earlier run of this cell.
            encoded.append(label)
        else:
            raise KeyError(key)
    return encoded


Ytrain = _encode_labels(Ytrain)
Ytest = _encode_labels(Ytest)

# Both are now plain lists of integer class ids (False -> 0, True -> 1).

In [17]:
# One-hot encode the class ids. to_categorical accepts the whole label list at
# once and returns an (n_samples, 2) array, so no per-label Python loop is needed.
Ytrain_cat = to_categorical(Ytrain, num_classes=2)
Ytest_cat = to_categorical(Ytest, num_classes=2)

In [18]:
# Check the one-hot layout: column 0 marks class 0, column 1 marks class 1.
Ytrain_cat[:5]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [19]:
# Final check of the model inputs: both arrays must have one row per training article.
print('Shape of data tensor:', Xtrain_pad.shape)
print('Shape of label tensor:', Ytrain_cat.shape)

Shape of data tensor: (38773, 605)
Shape of label tensor: (38773, 2)


### Prepare the Embedding Layer

In [20]:
# Pre-trained Word2Vec model, loaded from a path relative to the working directory.
embeddings = Word2Vec.load('model_all.bin')
# Take the vector width from the model itself instead of hard-coding it, so the
# embedding matrix can never disagree with the loaded vectors.
embed_len = embeddings.wv.vector_size

In [21]:
# Map every vocabulary index to its Word2Vec vector.
# TODO: change to word embeddings from Spanish and English

index2emb = dict()

for i, w in index2word.items():
    try:
        # Look the word up through `.wv`: indexing the model object itself is
        # deprecated in gensim and emits a DeprecationWarning.
        embed = embeddings.wv[w]
    except KeyError:
        # Word (or the PAD entry) unknown to the embedding model: zero vector.
        embed = np.zeros(embed_len)
    index2emb[i] = embed



  if __name__ == '__main__':


In [22]:
# Assemble the weight matrix for the Embedding layer: row i holds the vector of
# the word whose index is i.
vocab_rows = len(word2index) + 1
embedding_matrix = np.zeros((vocab_rows, embed_len))
for i in word2index.values():
    embedding_vector = index2emb[i]
    if embedding_vector is None:
        continue
    # Rows that are never assigned stay all-zeros.
    embedding_matrix[i] = embedding_vector

In [23]:
# Wrap the pre-computed matrix in a Keras Embedding layer. The weights are
# frozen (trainable=False), so training does not update the word vectors.
embedding_layer = Embedding(
    input_dim=len(word2index) + 1,
    output_dim=embed_len,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,
)

### Building the classifier - FFNN

In [24]:
# Training hyperparameters for the feed-forward classifier.
epochs = 20    # passes over the training data
lr = 0.001     # learning rate handed to the Adam optimizer
batch = 512    # samples per gradient update
# embed_len is not set here; it comes from the embedding section.
#embed_len = 100
# NOTE(review): `activation` is not read by the model cell visible in this
# notebook, which writes its hidden-layer activations as literals.
activation = 'relu'
activation_output = 'softmax'                # activation of the 2-unit output layer
loss_function = 'categorical_crossentropy'   # loss passed to model.compile

In [26]:
# Feed-forward network on top of the fixed word embeddings.

# Input: one padded article, i.e. `max_len` vocabulary indices.
sequence_input = Input(shape=(max_len,), dtype='int32')
# Replace every index by its embedding vector.
embedded_sequences = embedding_layer(sequence_input)

# Flatten the per-token vectors into one long feature vector per article.
flat = Flatten()(embedded_sequences)

# Two fully connected hidden layers followed by the 2-class output layer.
output_1 = Dense(200, activation='tanh')(flat)
# Dropout between the hidden layers is currently switched off.
#drop = Dropout(0.4)(output_1)
output_2 = Dense(64, activation='relu')(output_1)
# NOTE(review): the name `predictions` is bound again later to the array
# returned by model.predict.
predictions = Dense(2, activation=activation_output)(output_2)

model = Model(inputs=sequence_input, outputs=predictions)

optimizer = Adam(lr = lr)

model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])

# Print the layer table with output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 605)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 605, 100)          24530600  
_________________________________________________________________
flatten_2 (Flatten)          (None, 60500)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 200)               12100200  
_________________________________________________________________
dense_5 (Dense)              (None, 64)                12864     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 130       
Total params: 36,643,794
Trainable params: 12,113,194
Non-trainable params: 24,530,600
______________________________________

In [27]:
# Train on the whole training split. No validation data is passed, so the
# per-epoch log only reports metrics measured on the training data.
model.fit(Xtrain_pad, Ytrain_cat, batch_size = batch, epochs=epochs, verbose = 1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f4a1ed87be0>

In [28]:
# Model outputs for the held-out articles, one row of two class scores each.
# NOTE(review): this rebinds `predictions`, which named the model's output
# tensor in the model-building cell.
predictions = model.predict(Xtest_pad)

In [29]:
# Reduce the score rows and the one-hot rows to class ids, then report
# per-class precision, recall and F1 on the test split.
pred = predictions.argmax(axis=1)
Ytest_converted = Ytest_cat.argmax(axis=1)

print(classification_report(Ytest_converted, pred))

              precision    recall  f1-score   support

           0       0.76      0.71      0.74      4813
           1       0.73      0.78      0.76      4881

    accuracy                           0.75      9694
   macro avg       0.75      0.75      0.75      9694
weighted avg       0.75      0.75      0.75      9694

