In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import numpy as np
import pickle
import random
import os.path

import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

#from src.blstm_tf import BiLSTM
from src.implementations import batch_iter
from src.blstm_pt import BiLSTM
from scripts.tools import *

# Data input and output paths
POS_TRAIN_PATH = 'data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = 'data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = 'data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'
TOKENS_PATH = "saved_gen_files/all_tokens.npy"
W2V_MODEL_PATH = "saved_gen_files/w2v.model"
FastText_MODEL_PATH = "saved_gen_files/fasttext.model"
FULL_TRAIN_TWEET_VECTORS = "saved_gen_files/all_tweets_vectors.npy"
TRAINING_DATA_PATH_X = 'data/training_data.npy'
TRAINING_DATA_PATH_Y = 'data/data_y.npy'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import data

In [2]:
if os.path.isfile(TRAINING_DATA_PATH_X):
    train_data = np.load(TRAINING_DATA_PATH_X)
    train_y = np.load(TRAINING_DATA_PATH_Y)
else:
    embeddings = np.load('saved_gen_files/embeddings.npy')

    train_text_neg = open(NEG_TRAIN_PATH, 'r').readlines()
    train_text_pos = open(POS_TRAIN_PATH, 'r').readlines()
    # Construct the two arrays 
    train_text = np.array(train_text_neg + train_text_pos)
    train_y = np.concatenate([np.array([-1 for _ in range(len(train_text_neg))]), np.ones(len(train_text_pos))])

    with open('saved_gen_files/vocab.pkl', 'rb') as f:
        voc = pickle.load(f)

    def toAvgVec(t):

        _, K = embeddings.shape
        sum_vec = np.zeros((K))
        words = t.split()
        for word in words:
            index = voc.get(word)
            if index is not None:
                sum_vec += embeddings[index]

        return sum_vec/len(words)
    # Create numerical feature matrix of tweets
    train_data = np.zeros(len(train_text)*embeddings.shape[1]).reshape(len(train_text), 20)
    for i in range(len(train_text)):
        train_data[i] = toAvgVec(train_text[i])
    
    np.save(TRAINING_DATA_PATH_X, train_data)
    np.save(TRAINING_DATA_PATH_Y, train_y)

indices = np.arange(train_data.shape[0])
random.shuffle(indices)

indices
X_train = train_data[indices[:2400000]]
y_train = train_y[indices[:2400000]]

X_test = train_data[2400000:]
y_test = train_y[2400000:]

KeyboardInterrupt: 

In [4]:
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(pos_text_train))), axis=None)

[autoreload of dateutil.rrule failed: Traceback (most recent call last):
  File "/home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 317, in update_class
    update_instances(old, new)
  File "/home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 280, in update_instances
    ref.__class__ = new
TypeError: __class__ assignment: 'weekday' object layout differs from 'weekday'
]


In [11]:
all_tweets_vectors = np.load(FULL_TRAIN_TWEET_VECTORS)
#all_tokens = np.load(TOKENS_PATH)

X_train, X_test, y_train, y_test = train_test_split(all_tweets_vectors, full_labels)


In [12]:
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0
print(np.sum(y_train < 0), np.sum(y_test < 0))

0 0


In [13]:
ratio = 1
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train_reduced = X_train[:train_size]
y_train_reduced = y_train[:train_size]
X_test_reduced = X_test[:test_size]
y_test_reduced = y_test[:test_size]

print(X_train_reduced.shape)
print(X_test_reduced.shape)

(1875000, 50)
(625000, 50)


In [14]:
X_train_reshape = np.reshape(X_train_reduced, (X_train_reduced.shape[0], 1, X_train_reduced.shape[1]))
X_test_reshape = np.reshape(X_test_reduced, (X_test_reduced.shape[0], 1, X_test_reduced.shape[1]))

print(X_train_reshape.shape)
print(X_test_reshape.shape)

(1875000, 1, 50)
(625000, 1, 50)


## Using Tensorflow

In [15]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import layers

In [24]:
model1 = Sequential()
model1.add(Bidirectional(LSTM(128, return_sequences=True)))
model1.add(Dense(64))
model1.add(Dense(1))
model1.add(Activation('sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#model1.summary()

In [26]:
model1 = Sequential()
model1.add(Bidirectional(LSTM(128)))
model1.add(Dense(64))
model1.add(Dropout(0.5))
model1.add(Activation('relu'))
model1.add(Dense(1))
model1.add(Activation('sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
%time model1.fit(X_train_reshape, y_train_reduced, batch_size=512, epochs=50, validation_data=(X_test_reshape, y_test_reduced), verbose = 1)

Train on 1875000 samples, validate on 625000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 2h 51min 42s, sys: 6min 19s, total: 2h 58min 1s
Wall time: 34min 55s


<tensorflow.python.keras.callbacks.History at 0x7fd7c00dcf90>

In [28]:
pred_test_y = model1.predict([X_test_reshape], batch_size=1024, verbose=1)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [29]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = sklearn.metrics.f1_score(y_test_reduced, (pred_test_y.flatten() > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1))
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

F1 score at threshold 0.1 is 0.787921188731107
F1 score at threshold 0.11 is 0.790393270824673
F1 score at threshold 0.12 is 0.7927165270577435
F1 score at threshold 0.13 is 0.7950159105278979
F1 score at threshold 0.14 is 0.7972194198012198
F1 score at threshold 0.15 is 0.7992691326247007
F1 score at threshold 0.16 is 0.8012368733334997
F1 score at threshold 0.17 is 0.8031721467976158
F1 score at threshold 0.18 is 0.8050091201538819
F1 score at threshold 0.19 is 0.8067156331591384
F1 score at threshold 0.2 is 0.8082804134314328
F1 score at threshold 0.21 is 0.809769630140875
F1 score at threshold 0.22 is 0.8112997893928621
F1 score at threshold 0.23 is 0.8127259835974281
F1 score at threshold 0.24 is 0.8141564657696156
F1 score at threshold 0.25 is 0.8153650621290458
F1 score at threshold 0.26 is 0.8165788088631208
F1 score at threshold 0.27 is 0.8176142074354239
F1 score at threshold 0.28 is 0.8186822895000815
F1 score at threshold 0.29 is 0.8196372531888869
F1 score at threshold 0.3

In [30]:
# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

In [31]:
test_tweets_vectors = np.load("test_embeddings.npy")
tshape = test_tweets_vectors.shape
test_tweets_vectors = np.reshape(test_tweets_vectors, (tshape[0], 1, tshape[1]))

FileNotFoundError: [Errno 2] No such file or directory: 'test_embeddings.npy'

In [None]:
# Predict
pred_submission_y = model1.predict([test_tweets_vectors], batch_size=1024, verbose=1)
pred_submission_y = pred_submission_y.flatten()

In [None]:
# Save predictions
create_csv_submission(test_ids, predict_labels(pred_submission_y, opt_prob), OUTPUT_PATH)

## Using Pytorch

In [17]:
hidden_dim = 2
model = BiLSTM(hidden_dim)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

TypeError: super(type, obj): obj must be an instance or subtype of type

## Using scikit-learn

In [4]:
# Si on veut pas utiliser direct tout le dataset
ratio = 0.01
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train = X_train[:train_size]
y_train = y_train[:train_size]
X_test = X_test[:test_size]
y_test = y_test[:test_size]

print(X_train.shape)
print(X_test.shape)

(24000, 20)
(1000, 20)


In [9]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [10]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
predict_labels = clf.predict(X_test)

In [12]:
acc = np.mean(y_test == predict_labels)
print(acc)

0.773


## Using scikit-learn

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)