In [14]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import numpy as np
import pickle
import random
import os.path

from sklearn.neural_network import MLPClassifier

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

#from src.blstm_tf import BiLSTM
from src.implementations import batch_iter
from src.blstm_pt import BiLSTM

# Data input and output paths
POS_TRAIN_PATH = 'data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = 'data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = 'data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'
TRAINING_DATA_PATH_X = 'data/training_data.npy'
TRAINING_DATA_PATH_Y = 'data/data_y.npy'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import data

In [3]:
if os.path.isfile(TRAINING_DATA_PATH_X):
    train_data = np.load(TRAINING_DATA_PATH_X)
    train_y = np.load(TRAINING_DATA_PATH_Y)
else:
    embeddings = np.load('saved_gen_files/embeddings.npy')

    train_text_neg = open(NEG_TRAIN_PATH, 'r').readlines()
    train_text_pos = open(POS_TRAIN_PATH, 'r').readlines()
    # Construct the two arrays 
    train_text = np.array(train_text_neg + train_text_pos)
    train_y = np.concatenate([np.array([-1 for _ in range(len(train_text_neg))]), np.ones(len(train_text_pos))])

    with open('saved_gen_files/vocab.pkl', 'rb') as f:
        voc = pickle.load(f)

    def toAvgVec(t):

        _, K = embeddings.shape
        sum_vec = np.zeros((K))
        words = t.split()
        for word in words:
            index = voc.get(word)
            if index is not None:
                sum_vec += embeddings[index]

        return sum_vec/len(words)
    # Create numerical feature matrix of tweets
    train_data = np.zeros(len(train_text)*embeddings.shape[1]).reshape(len(train_text), 20)
    for i in range(len(train_text)):
        train_data[i] = toAvgVec(train_text[i])
    
    np.save(TRAINING_DATA_PATH_X, train_data)
    np.save(TRAINING_DATA_PATH_Y, train_y)

indices = np.arange(train_data.shape[0])
random.shuffle(indices)

indices
X_train = train_data[indices[:2400000]]
y_train = train_y[indices[:2400000]]

X_test = train_data[2400000:]
y_test = train_y[2400000:]

## Using Pytorch

In [17]:
hidden_dim = 2
model = BiLSTM(hidden_dim)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

TypeError: super(type, obj): obj must be an instance or subtype of type

## Using scikit-learn

In [4]:
# Si on veut pas utiliser direct tout le dataset
ratio = 0.01
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train = X_train[:train_size]
y_train = y_train[:train_size]
X_test = X_test[:test_size]
y_test = y_test[:test_size]

print(X_train.shape)
print(X_test.shape)

(24000, 20)
(1000, 20)


In [9]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [10]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
predict_labels = clf.predict(X_test)

In [12]:
acc = np.mean(y_test == predict_labels)
print(acc)

0.773


## Using Tensorflow

In [None]:
##Parameters
num_layers = 10
num_hidden = 10
embedding_size = 10
vocabulary_size = 10
learning_rate = 0.0001
batch_size = 100
num_epoch = 100

In [None]:
sess = tf.Session()

In [None]:
model = BiLSTM(vocabulary_size, num_layers, num_hidden, embedding_size)

In [None]:
# Define training procedure
global_step = tf.Variable(0, trainable=False)
params = tf.trainable_variables()
gradients = tf.gradients(model.loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)

In [None]:
# Summary
loss_summary = tf.summary.scalar("loss", model.loss)
summary_op = tf.summary.merge([loss_summary])
#train_summary_writer = tf.summary.FileWriter("training", sess.graph)
#test_summary_writer = tf.summary.FileWriter("testing", sess.graph)

In [None]:
def train_step(batch_x, batch_y):
    feed_dict = {model.x: batch_x, model.keep_prob: 1.0}
    _, step, summaries, loss = sess.run([train_op, global_step, summary_op, model.loss], feed_dict=feed_dict)
    train_summary_writer.add_summary(summaries, step)

    if step % 100 == 1:
        print("step {0}: loss = {1}".format(step, loss))

In [None]:
##Initialize all variables
sess.run(tf.global_variables_initializer())

batches = batch_iter(y_train, X_train, batch_size = batch_size, num_batches = 100, shuffle=False)

for batch_y, batch_x in batches:
    train_step(batch_x, batch_y)
    step = tf.train.global_step(sess, global_step)


In [None]:
sess.close()

In [None]:
clf.fit(X_train, y_train)

## Using scikit-learn

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)