In [1]:
%matplotlib inline
%load_ext autoreload
%load_ext tensorboard
%autoreload 2
import tensorflow as tf
import numpy as np
import pickle
import random
import os.path

import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

#from src.blstm_tf import BiLSTM
from src.implementations import batch_iter
from src.blstm_pt import BiLSTM
from scripts.tools import *

# Data input and output paths
# Raw tweet text files: positive / negative training sets and the test set to predict.
POS_TRAIN_PATH = 'data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = 'data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = 'data/twitter-datasets/test_data.txt'
# Destination passed to create_csv_submission in the submission cell.
OUTPUT_PATH = 'predictions_out.csv'
# Generated artifacts under saved_gen_files/; only FULL_TRAIN_TWEET_VECTORS is
# loaded by an active cell in this notebook (TOKENS_PATH appears only in a
# commented-out line).
TOKENS_PATH = "saved_gen_files/all_tokens.npy"
W2V_MODEL_PATH = "saved_gen_files/w2v.model"
FastText_MODEL_PATH = "saved_gen_files/fasttext.model"
FULL_TRAIN_TWEET_VECTORS = "saved_gen_files/all_tweets_vectors.npy"
FULL_TRAIN_TWEET_VECTORS_200 = "saved_gen_files/all_tweets_vectors_200.npy"
# Cache files for the averaged-embedding feature matrix and its labels
# (read or written by the "Import data" cell).
TRAINING_DATA_PATH_X = 'data/training_data.npy'
TRAINING_DATA_PATH_Y = 'data/data_y.npy'
# NOTE(review): the name says "training" but the value points at *test*
# embeddings, and the prediction cell loads "test_embeddings.npy" (without the
# data/ prefix) directly instead of using this constant - confirm which is intended.
TRAINING_EMBEDDINGS = 'data/test_embeddings.npy'

### Import data

In [None]:
# Build (or load from cache) the averaged-embedding feature matrix `train_data`
# and the matching label vector `train_y` (-1 = negative tweet, +1 = positive).
if os.path.isfile(TRAINING_DATA_PATH_X):
    # Cache hit: reuse the arrays saved by a previous run of this cell.
    train_data = np.load(TRAINING_DATA_PATH_X)
    train_y = np.load(TRAINING_DATA_PATH_Y)
else:
    embeddings = np.load('saved_gen_files/embeddings.npy')

    # Read the raw tweets; context managers make sure the handles are closed.
    with open(NEG_TRAIN_PATH, 'r') as neg_file:
        train_text_neg = neg_file.readlines()
    with open(POS_TRAIN_PATH, 'r') as pos_file:
        train_text_pos = pos_file.readlines()

    # Construct the two arrays (negatives first, then positives).
    train_text = np.array(train_text_neg + train_text_pos)
    train_y = np.concatenate([-np.ones(len(train_text_neg)), np.ones(len(train_text_pos))])

    # NOTE(review): pickle.load can execute arbitrary code; only load a vocab
    # file that this project generated itself.
    with open('saved_gen_files/vocab.pkl', 'rb') as f:
        voc = pickle.load(f)

    def toAvgVec(t):
        """Return the average embedding vector of the tweet string `t`.

        The tweet is split on whitespace. Words missing from `voc` add nothing
        to the sum but still count in the denominator. A tweet with no words
        yields the zero vector instead of dividing by zero.
        """
        _, K = embeddings.shape
        sum_vec = np.zeros((K))
        words = t.split()
        if not words:
            return sum_vec
        for word in words:
            index = voc.get(word)
            if index is not None:
                sum_vec += embeddings[index]

        return sum_vec/len(words)

    # Create numerical feature matrix of tweets; the width follows the
    # embedding dimension rather than a hard-coded constant.
    train_data = np.zeros((len(train_text), embeddings.shape[1]))
    for i in range(len(train_text)):
        train_data[i] = toAvgVec(train_text[i])

    np.save(TRAINING_DATA_PATH_X, train_data)
    np.save(TRAINING_DATA_PATH_Y, train_y)

# Random train / held-out split over the rows of train_data.
# np.random.permutation shuffles in C rather than swapping numpy elements one
# by one through Python's random.shuffle.
indices = np.random.permutation(train_data.shape[0])

n_train = 2400000  # rows used for training; the remainder is held out

X_train = train_data[indices[:n_train]]
y_train = train_y[indices[:n_train]]

# Index through the SAME permutation: slicing the unshuffled arrays here would
# overlap with the training rows and would not be a random sample.
X_test = train_data[indices[n_train:]]
y_test = train_y[indices[n_train:]]

In [2]:
# Load the raw positive and negative training tweets and build one dataset
# (positives first, then negatives) with labels +1 / -1 in the same order.
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
# The negative block must be sized by the NEGATIVE tweets; using the positive
# count only works when both files happen to have the same number of rows.
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(neg_text_train))), axis=None)

In [3]:
# Precomputed tweet vectors, one row per tweet.
# presumably in the same order as full_labels - verify against the generator script.
all_tweets_vectors = np.load(FULL_TRAIN_TWEET_VECTORS)

# 80/20 split; a fixed random_state makes the split (and every score derived
# from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    all_tweets_vectors, full_labels, test_size=.2, random_state=42)
# Full-data views used for the final fit; y_full is the same array object as
# full_labels (no copy is made).
X_full = all_tweets_vectors
y_full = full_labels

In [4]:
# Recode the negative class from -1 to 0, in place, on every label vector.
for labels in (y_train, y_test, y_full):
    labels[labels == -1] = 0
# Sanity check: count of remaining negative labels per vector (expected 0 0 0).
print(*(np.sum(labels < 0) for labels in (y_train, y_test, y_full)))

0 0 0


In [5]:
# Keep only the leading `ratio` fraction of each split for fast experiments.
ratio = .1
train_size, test_size = (int(split.shape[0] * ratio) for split in (X_train, X_test))

X_train_reduced, y_train_reduced = X_train[:train_size], y_train[:train_size]
X_test_reduced, y_test_reduced = X_test[:test_size], y_test[:test_size]

print(X_train_reduced.shape, X_test_reduced.shape, sep="\n")

(200000, 50)
(50000, 50)


In [6]:
# Insert a length-1 middle axis: (n_samples, n_features) -> (n_samples, 1, n_features).
# presumably so the recurrent layers see each tweet vector as a one-step sequence.
X_train_reshape = X_train_reduced[:, np.newaxis, :]
X_test_reshape = X_test_reduced[:, np.newaxis, :]
X_full_reshape = X_full[:, np.newaxis, :]

for reshaped in (X_train_reshape, X_test_reshape, X_full_reshape):
    print(reshaped.shape)

(200000, 1, 50)
(50000, 1, 50)
(2500000, 1, 50)


## Using Tensorflow

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import layers

from sklearn.model_selection import KFold

In [11]:
# Keras callback that writes TensorBoard event files under `logdir`
# (histogram_freq=1 is passed through to the callback unchanged).
logdir = 'logs'
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=logdir,
)

In [16]:
# Larger BiLSTM binary classifier: BiLSTM(254) -> Dense(128) -> Dense(64) -> Dense(1).
# NOTE(review): softmax as a hidden activation is unusual - confirm it is intended.
# NOTE(review): `model1` is rebound by the next model-definition cell, so on a
# top-to-bottom run this architecture is discarded before any training happens.
model1 = Sequential([
    Bidirectional(LSTM(254)),
    Dense(128),
    Dropout(0.5),
    Activation('relu'),
    Dense(64),
    Activation('softmax'),
    Dense(1),
    Activation('sigmoid'),
])
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Compact BiLSTM binary classifier: BiLSTM(128) -> Dense(64) -> Dense(1) with a
# sigmoid output, trained with binary cross-entropy and Adam.
model1 = Sequential([
    Bidirectional(LSTM(128)),
    Dense(64),
    Dropout(0.5),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Quick experiment on the reduced subsample: 2 epochs, validated on the reduced
# held-out split, with training curves logged through tensorboard_callback.
# %time reports the CPU and wall time of the call.
%time model1.fit(X_train_reshape, y_train_reduced, batch_size=512, epochs=2, validation_data=(X_test_reshape, y_test_reduced), verbose=1, callbacks=[tensorboard_callback])

Train on 200000 samples, validate on 50000 samples
Epoch 1/2
Epoch 2/2
CPU times: user 1min 20s, sys: 2.89 s, total: 1min 23s
Wall time: 35.1 s


<tensorflow.python.keras.callbacks.History at 0x7f6893cb3590>

In [17]:
%time model1.fit(X_full_reshape, y_full, batch_size=512, epochs=1, validation_split=.1, verbose=1)

Train on 2250000 samples, validate on 250000 samples
CPU times: user 6min 47s, sys: 16.8 s, total: 7min 4s
Wall time: 2min 53s


<tensorflow.python.keras.callbacks.History at 0x7f4f7c6a8450>

In [14]:
# Print the layer-by-layer structure and parameter counts of the current model1.
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection multiple                  183296    
_________________________________________________________________
dense_2 (Dense)              multiple                  16448     
_________________________________________________________________
dropout_1 (Dropout)          multiple                  0         
_________________________________________________________________
activation_2 (Activation)    multiple                  0         
_________________________________________________________________
dense_3 (Dense)              multiple                  65        
_________________________________________________________________
activation_3 (Activation)    multiple                  0         
Total params: 199,809
Trainable params: 199,809
Non-trainable params: 0
________________________________________________

## Visualize using Tensorboard

In [15]:
# Open TensorBoard inline on the 'logs' directory, i.e. the log_dir that
# tensorboard_callback writes to during training.
%tensorboard --logdir=logs

## Threshold tuning and submission (K-Fold cross validation: TODO)

In [None]:
# Sigmoid outputs of model1 on the reduced held-out split; consumed by the
# threshold search in the following cell.
pred_test_y = model1.predict(
    [X_test_reshape],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Grid-search the decision threshold over 0.10 .. 0.50 (step 0.01) and keep the
# one with the highest F1 score on the reduced held-out split.
opt_prob = None
# Start below any attainable F1 so the first threshold is always accepted;
# otherwise opt_prob would stay None when every F1 is 0 and the submission
# cell would receive no threshold.
f1_max = -1.0

# Flatten once: the predictions do not change between thresholds.
flat_pred = pred_test_y.flatten()

for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    f1 = sklearn.metrics.f1_score(y_test_reduced, (flat_pred > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1))

    # Strict '>' keeps the lowest threshold among ties.
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh

print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

In [None]:
# Load the data to predict: ids and raw text of the test tweets
# (has_ID=True is passed because, unlike the training files, this file is read with ids).
# NOTE(review): test_x is not referenced again in the visible cells; predictions
# are made from precomputed test embeddings loaded separately.
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

In [None]:
# Load the precomputed test-tweet vectors and add a length-1 middle axis:
# (n_samples, n_features) -> (n_samples, 1, n_features), as for the training inputs.
# NOTE(review): the path is hard-coded and differs from the TRAINING_EMBEDDINGS
# constant ('data/test_embeddings.npy') - confirm which file is intended.
test_vectors_2d = np.load("test_embeddings.npy")
tshape = test_vectors_2d.shape
test_tweets_vectors = test_vectors_2d[:, np.newaxis, :]

In [None]:
# Predict on the test tweets and flatten the model output to a 1-D vector.
pred_submission_y = model1.predict(
    [test_tweets_vectors],
    batch_size=1024,
    verbose=1,
).flatten()

In [None]:
# Save predictions: turn the model outputs into labels with the tuned
# threshold (opt_prob), then write them with their ids to OUTPUT_PATH.
submission_labels = predict_labels(pred_submission_y, opt_prob)
create_csv_submission(test_ids, submission_labels, OUTPUT_PATH)

## Using Pytorch

In [None]:
# PyTorch setup: BiLSTM model (from src.blstm_pt), negative log-likelihood
# loss, and an Adam optimizer over the model's parameters.
hidden_dim = 2
loss_function = nn.NLLLoss()
model = BiLSTM(hidden_dim)
optimizer = optim.Adam(
    model.parameters(),
    lr=0.1,
)

## Using scikit-learn

In [None]:
# Use this when we do not want to train on the whole dataset right away.
# NOTE(review): this rebinds X_train / y_train / X_test / y_test in place, so
# re-running the cell shrinks the data again by the same ratio each time.
ratio = 0.01
train_size, test_size = (int(split.shape[0] * ratio) for split in (X_train, X_test))

X_train, y_train = X_train[:train_size], y_train[:train_size]
X_test, y_test = X_test[:test_size], y_test[:test_size]

print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Small MLP baseline: hidden layers of 5 and 2 units, L-BFGS solver,
# L2 penalty alpha=1e-5, fixed random_state.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)

In [None]:
# Fit the MLP on the current X_train / y_train (subsampled by the cell above
# when that cell has been run).
clf.fit(X_train, y_train)

In [None]:
predict_labels = clf.predict(X_test)

In [None]:
acc = np.mean(y_test == predict_labels)
print(acc)

## Using scikit-learn

In [None]:
# NOTE(review): identical to the earlier MLPClassifier cell; running it rebinds
# clf to a fresh, unfitted estimator and discards the fitted one.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)