In [None]:
from IPython.display import clear_output

import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers, Model

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from load import load_glove_embedding, load_vocabulary, load_tweets
from FeaturesBuilder import FeaturesBuilder
from neural_net_utils import keras_compile, to_tensor

## Load data files

In [None]:
# Sequence length handed to the features builder as `target_length`;
# the CNN section further down reuses it as its input length.
SEQ_LENGTH = 50

# Pre-trained GloVe word vectors
word_vect = load_glove_embedding('glove_embeddings.npy')

# Vocabulary
vocab = load_vocabulary('vocab.pkl')

# Tweets (`full=True` is forwarded to the loader -- presumably selects the
# full dataset rather than a sample; confirm in load.py)
tweets_df = load_tweets('../twitter-datasets', full=True)

# Features builder shared by every section below. Methods used in this notebook:
#   - build_avg_tweet_embedding
#   - build_word_embedding_sequences
#   - build_vocab_idx_sequences
features_builder = FeaturesBuilder(
    tweets_df,
    vocab,
    word_vect,
    target_length=SEQ_LENGTH,
)

## Classic ML on GloVe tweet embedding average

Here, the word embeddings are averaged over each tweet to build feature vectors of shape (embedding_dim = 20). We try the following methods to fit a classifier over that space.

In [None]:
# Averaged-embedding features and their labels, split into train / held-out sets
x, y = features_builder.build_avg_tweet_embedding()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

def score(model, features=None, labels=None, threshold=0.5):
    """Print the accuracy of `model` on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict(features)``
    features, labels : optional
        Evaluation data. When omitted, the notebook-level ``x_test`` /
        ``y_test`` are used, looked up at call time. Later sections rebind
        those names to differently shaped data, so pass them explicitly when
        scoring a model outside the cell group that produced its split.
    threshold : float, default 0.5
        Predictions strictly greater than this value count as positive.
    """
    if features is None:
        features = x_test
    if labels is None:
        labels = y_test
    y_pred = model.predict(features) > threshold
    # accuracy_score's signature is (y_true, y_pred)
    print('accuracy:', accuracy_score(labels, y_pred))

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit directly on the labels; `score` turns the continuous
# output into a class decision by thresholding it.
model = LinearRegression().fit(x_train, y_train)
score(model)

A linear separation of the GloVe embedding space provides a baseline accuracy of 60%. We will compare the following results against this baseline.

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split) so that the reported
# accuracy is reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=120, random_state=42)
model.fit(x_train, y_train)
score(model)

A random forest classifier better captures features that are not linearly correlated and thus improves the score significantly over linear regression.
We are now learning a non-linear separation of the 20-dimensional space where tweets live.

### Perceptron

We try to learn a continuous separation using a densely connected perceptron. Dropout is used to reduce overfitting and thus better generalize.

In [None]:
# define model
# Hidden layers use ReLU. Without a non-linearity between them, the stacked
# Dense layers compose into a single affine map, so the network can only
# learn a linear decision boundary (logistic regression on the input).
model = keras.Sequential(
    [
        layers.Dropout(0.5),
        layers.Dense(50, activation="relu"),

        layers.Dropout(0.5),
        layers.Dense(100, activation="relu"),

        layers.Dropout(0.5),
        layers.Dense(50, activation="relu"),

        layers.Dropout(0.1),

        # single sigmoid unit for the binary classification output
        layers.Dense(1, activation="sigmoid", name="out"),
    ]
)

keras_compile(model)

# train, reporting metrics on the held-out split after every epoch
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

After trying various network architectures, altering width and depth, we found that none of them was able to outperform the linear classifier. This probably means that the embedding space has to be separated using higher-order functions, which makes it more difficult for the perceptron to converge.

## Train on glove with word sequences

Here, we build sequences of word embeddings. Those sequences are front-padded to provide a fixed tensor size to the neural network. Features now have a shape of (sequence_length = 50, embedding_dim = 20). We gain information about word order and provide a pre-trained representation of each word.

In [None]:
# Per-tweet sequences of GloVe vectors, split with the same ratio and seed as
# the averaged-embedding experiments, then converted by `to_tensor`.
x, y = features_builder.build_word_embedding_sequences()
x_train, x_test, y_train, y_test = to_tensor(
    *train_test_split(x, y, test_size=0.33, random_state=42)
)

In [None]:
# Three stacked bidirectional LSTMs. The first two return the full sequence so
# the next recurrent layer can consume it; the last one returns only its final
# output, which feeds the sigmoid classifier.
model = keras.Sequential()
model.add(layers.Bidirectional(layers.LSTM(100, dropout=0.4, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(200, dropout=0.4, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(100, dropout=0.4)))
model.add(layers.Dense(1, activation="sigmoid", name="out"))

keras_compile(model)

# Fit, evaluating on the held-out split after each epoch
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=8,
    validation_data=(x_test, y_test),
)

## Train with embedding layer

We switch to a trainable embedding, where word representations are learned with respect to the classification task and not from co-occurrences as in GloVe.

With each tweet, backpropagation from the last layers will update the weights associated with each word.

Here we pass a sequence of vocabulary word indices, of shape (sequence_length = 50).

This means that the embedding layer is not provided with a pre-defined meaning but will infer it from the set of indices we pass as input.

In [None]:
# Per-tweet sequences of vocabulary indices (input for the Embedding layer),
# split with the same ratio and seed as the previous sections.
x, y = features_builder.build_vocab_idx_sequences()
x_train, x_test, y_train, y_test = to_tensor(
    *train_test_split(x, y, test_size=0.33, random_state=42)
)

In [None]:
# Trainable embedding: one row per vocabulary entry plus one extra index,
# each mapped to a 100-dimensional vector.
model = keras.Sequential([layers.Embedding(len(vocab)+1, 100)])

# Sequence-returning bidirectional LSTMs, stacked in order
for units in (100, 200, 100):
    model.add(layers.Bidirectional(layers.LSTM(units, dropout=0.4, return_sequences=True)))

# The last recurrent layer emits only its final output, which feeds the
# sigmoid classifier
model.add(layers.Bidirectional(layers.LSTM(50, dropout=0.4)))
model.add(layers.Dense(1, activation="sigmoid", name="out"))

keras_compile(model)

# Fit, evaluating on the held-out split after each epoch
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=8,
    validation_data=(x_test, y_test),
)

## CNN

Keeping the index sequences used to train the LSTM network, we now train a convolutional network. Five parallel channels with convolutional layers of various kernel sizes are concatenated and fed to two densely connected layers.

In [None]:
# Same vocabulary-index sequences as in the embedding + LSTM section.
# NOTE(review): this split holds out 50% of the data, whereas every other
# section uses test_size=0.33 -- keep that in mind when comparing scores.
x, y = features_builder.build_vocab_idx_sequences()
x_train, x_test, y_train, y_test = to_tensor(
    *train_test_split(x, y, test_size=0.5, random_state=42)
)

In [None]:
# define model
embedding_dim = 100
seq_len = SEQ_LENGTH

sequence_input = keras.Input(shape=(seq_len,), dtype='int32')

# The Input tensor above already fixes the sequence length, so the Embedding
# layer does not need the (deprecated) `input_length` argument.
embedding_layer = layers.Embedding(len(vocab)+1,
                                   embedding_dim,
                                   trainable=True)

embedded_sequences = embedding_layer(sequence_input)

# One convolutional branch per kernel size:
#   Conv1D -> MaxPooling1D -> Conv1D -> GlobalMaxPooling1D
# In a functional graph the layers take their input shape from the tensor
# they are called on, so no `input_shape` is passed to Conv1D.
# With the default 'valid' padding, seq_len must be long enough for the second
# convolution of the widest kernel (seq_len = 50 leaves 2 steps for kernel 6).
convs = []
filter_sizes = [2,3,4,5,6]
for filter_size in filter_sizes:
    l_conv = layers.Conv1D(filters=200,
                           kernel_size=filter_size,
                           activation='relu')(embedded_sequences)
    l_pool = layers.MaxPooling1D(filter_size)(l_conv)
    l_conv = layers.Conv1D(filters=100,
                           kernel_size=filter_size,
                           activation='relu')(l_pool)
    l_pool = layers.GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)
l_merge = layers.Concatenate(axis=1)(convs)

# Dense layers
# Intermediate tensors use their own name so that `x` keeps referring to the
# dataset features built in the previous cell.
dense = layers.Dropout(0.1)(l_merge)
dense = layers.Dense(128, activation='relu')(dense)
dense = layers.Dropout(0.4)(dense)
dense = layers.Dense(64, activation='relu')(dense)
dense = layers.Dropout(0.2)(dense)
preds = layers.Dense(1, activation='sigmoid')(dense)

model = Model(sequence_input, preds)

keras_compile(model)

# train, reporting metrics on the held-out split after every epoch
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(x_test, y_test),
)

The scores obtained with the CNN architecture are very close to the ones attained with the LSTM network. We find that these two approaches, while different in nature, are both able to learn the classification task.