In [28]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import numpy as np
import pickle
import random
import os.path

import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

#from src.blstm_tf import BiLSTM
from src.implementations import batch_iter
from src.blstm_pt import BiLSTM
from scripts.tools import *

# Data input and output paths
POS_TRAIN_PATH = 'data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = 'data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = 'data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'
TOKENS_PATH = "saved_gen_files/all_tokens.txt"
W2V_MODEL_PATH = "saved_gen_files/w2v.model"
FastText_MODEL_PATH = "saved_gen_files/fasttext.model"
FULL_TRAIN_TWEET_VECTORS = "saved_gen_files/train_tweet_vectors.txt"
TRAINING_DATA_PATH_X = 'data/training_data.npy'
TRAINING_DATA_PATH_Y = 'data/data_y.npy'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import data

In [2]:
if os.path.isfile(TRAINING_DATA_PATH_X):
    train_data = np.load(TRAINING_DATA_PATH_X)
    train_y = np.load(TRAINING_DATA_PATH_Y)
else:
    embeddings = np.load('saved_gen_files/embeddings.npy')

    train_text_neg = open(NEG_TRAIN_PATH, 'r').readlines()
    train_text_pos = open(POS_TRAIN_PATH, 'r').readlines()
    # Construct the two arrays 
    train_text = np.array(train_text_neg + train_text_pos)
    train_y = np.concatenate([np.array([-1 for _ in range(len(train_text_neg))]), np.ones(len(train_text_pos))])

    with open('saved_gen_files/vocab.pkl', 'rb') as f:
        voc = pickle.load(f)

    def toAvgVec(t):

        _, K = embeddings.shape
        sum_vec = np.zeros((K))
        words = t.split()
        for word in words:
            index = voc.get(word)
            if index is not None:
                sum_vec += embeddings[index]

        return sum_vec/len(words)
    # Create numerical feature matrix of tweets
    train_data = np.zeros(len(train_text)*embeddings.shape[1]).reshape(len(train_text), 20)
    for i in range(len(train_text)):
        train_data[i] = toAvgVec(train_text[i])
    
    np.save(TRAINING_DATA_PATH_X, train_data)
    np.save(TRAINING_DATA_PATH_Y, train_y)

indices = np.arange(train_data.shape[0])
random.shuffle(indices)

indices
X_train = train_data[indices[:2400000]]
y_train = train_y[indices[:2400000]]

X_test = train_data[2400000:]
y_test = train_y[2400000:]

KeyboardInterrupt: 

In [3]:
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(pos_text_train))), axis=None)

In [4]:
with open(FULL_TRAIN_TWEET_VECTORS, "rb") as fp:   # Unpickling
    all_tweets_vectors = pickle.load(fp)

X_train, X_test, y_train, y_test = train_test_split(all_tweets_vectors, full_labels)


In [5]:
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0
print(np.sum(y_train < 0), np.sum(y_test < 0))

0 0


In [6]:
ratio = 1
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train_reduced = X_train[:train_size]
y_train_reduced = y_train[:train_size]
X_test_reduced = X_test[:test_size]
y_test_reduced = y_test[:test_size]

print(X_train_reduced.shape)
print(X_test_reduced.shape)

(1875000, 50)
(625000, 50)


In [7]:
X_train_reshape = np.reshape(X_train_reduced, (X_train_reduced.shape[0], 1, X_train_reduced.shape[1]))
X_test_reshape = np.reshape(X_test_reduced, (X_test_reduced.shape[0], 1, X_test_reduced.shape[1]))

print(X_train_reshape.shape)
print(X_test_reshape.shape)

(1875000, 1, 50)
(625000, 1, 50)


## Using Tensorflow

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import layers

In [9]:
model1 = Sequential()
model1.add(Bidirectional(LSTM(128, return_sequences=True)))
model1.add(Dense(64))
model1.add(Dense(1))
model1.add(Activation('sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#model1.summary()

In [25]:
%time model1.fit(X_train_reshape, y_train_reduced, batch_size=512, epochs=4, validation_data=(X_test_reshape, y_test_reduced), verbose = 1)

Train on 1875000 samples, validate on 625000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 24min 15s, sys: 56 s, total: 25min 11s
Wall time: 8min 52s


<tensorflow.python.keras.callbacks.History at 0x7f3377b77c50>

In [33]:
pred_test_y = model1.predict([X_test_reshape], batch_size=1024, verbose=1)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = sklearn.metrics.f1_score(y_test_reduced, (pred_test_y.flatten() > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1))
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

ValueError: Found input variables with inconsistent numbers of samples: [625000, 6250000]

In [14]:
# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

In [20]:
test_tweets_vectors = np.load("test_embeddings.npy")
tshape = test_tweets_vectors.shape
test_tweets_vectors = np.reshape(test_tweets_vectors, (tshape[0], 1, tshape[1]))

In [None]:
# Predict
pred_submission_y = model1.predict([test_tweets_vectors], batch_size=1024, verbose=1)
pred_submission_y = pred_submission_y.flatten()

In [None]:
# Save predictions
create_csv_submission(test_ids, predict_labels(pred_submission_y, opt_prob), OUTPUT_PATH)

## Using Pytorch

In [17]:
hidden_dim = 2
model = BiLSTM(hidden_dim)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

TypeError: super(type, obj): obj must be an instance or subtype of type

## Using scikit-learn

In [4]:
# Si on veut pas utiliser direct tout le dataset
ratio = 0.01
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train = X_train[:train_size]
y_train = y_train[:train_size]
X_test = X_test[:test_size]
y_test = y_test[:test_size]

print(X_train.shape)
print(X_test.shape)

(24000, 20)
(1000, 20)


In [9]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [10]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
predict_labels = clf.predict(X_test)

In [12]:
acc = np.mean(y_test == predict_labels)
print(acc)

0.773


## Using scikit-learn

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)