# Cross validation of different models

In [20]:
%matplotlib inline
%load_ext autoreload
%load_ext tensorboard
%autoreload 2

from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
import os.path

# Load library
from nltk.corpus import stopwords
from gensim import *
import pickle
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api
import re

# Data input and output paths
POS_TRAIN_PATH = '../data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = '../data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = '../data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'

TOKENS_PATH = "../saved_gen_files/all_tokens.txt"
FULL_TRAIN_TWEET_VECTORS = "../saved_gen_files/train_tweet_vectors.txt"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard module is not an IPython extension.


## Import data

In [2]:
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(pos_text_train))), axis=None)

## Tokenize

In [3]:
if os.path.isfile(TOKENS_PATH):
    with open(TOKENS_PATH, 'rb') as f:
        all_tokens = pickle.load(f)
else:    
    all_tokens = [tokenizeTweet(tweet, stop_word=False, 
                           smiley_tag = False, strip_handles=True, 
                           reduce_len=True, preserve_case=False) for tweet in full_dataset]
    
    with open(TOKENS_PATH, 'wb') as f:
        pickle.dump(all_tokens, f)

In [4]:
# Generate bigrams
all_tokens = computeBigrams(all_tokens)

In [5]:
# Force the negative sentiment to be clasified to 0 instead of 1 
labels = full_labels
labels[labels<0] = 0

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(all_tokens, labels, test_size=0.1)

## Word Embedding

Choose one of the embedding algo and the dimensions of the vectors



In [6]:
DIM = 100

### Word2Vec

In [28]:
wv = getWord2VecDict(all_tokens, size=DIM, window=10, min_count=2, workers=10, iters=10, train=True)

You haven't train Word2Vec already with those parameters, it can take some time...


### Glove

In [7]:
wv = api.load("glove-twitter-" + str(DIM)).wv

  """Entry point for launching an IPython kernel.


### Fastext

In [None]:
# Train a word2vec model to generate embedding
wv = getFasttextDict(all_tokens, size=DIM, window=10, min_count=2, workers=10, iters=10, train=False)

## Model Selection

### Linear Regression

In [None]:
# Convert tweet in features with previous embedding system
all_tweets_vectors = generateTweetsFeatures(all_tokens, wv)

In [None]:
# Train and test the model
clf = linear_model.Ridge(alpha=0.1)
#clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predict_labels(predictions)))

### SVM

In [None]:
from sklearn import svm

clf_svm = svm.SVC(gamma='scale')
clf_svm.fit(X_train, y_train)
predict_svm = clf_svm.predict(X_test)
print(classification_report(y_test, predict_labels(predict_svm)))

### LSTM

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

max_length = max([len(tweet_tokens) for tweet_tokens in X_train])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Transform each unique word in unique int identifier
sequences = tokenizer.texts_to_sequences(X_train)


# Pad the tweet to have all the same size
tweet_padded = pad_sequences(sequences, maxlen=max_length)

# Same for test data
test_sequences = tokenizer.texts_to_sequences(X_test)
test_tweet_pad = pad_sequences(test_sequences, maxlen=max_length)

In [None]:
%tensorboard --logdir logs

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, Dense, Dropout, Activation, GRU, LSTM, Bidirectional, Flatten, GlobalMaxPool1D

model_name = "BLSTM_100.h5"

model = Sequential()

# Add the embedding layer with our trained embedding matrix
embedding_layer = Embedding(input_dim=wv.syn0.shape[0], 
                            output_dim=wv.syn0.shape[1], 
                            weights=[wv.syn0], 
                            input_length=tweet_padded.shape[1])
model.add(embedding_layer)

# Add dropout to prevent overfitting
model.add(Dropout(0.4))

# Add BiLSTM 
model.add(Bidirectional(LSTM(64, return_sequences=True)))

#self.model.add(GlobalMaxPool1D()) #Or at the same place as Flatten()
model.add(Dense(32))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)

logdir = 'logs/blstm'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)
callbacks = [reduce_lr, tensorboard_callback]

model.summary()

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 85, 100)           119351400 
_________________________________________________________________
dropout (Dropout)            (None, 85, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 85, 128)           84480     
_________________________________________________________________
dense (Dense)                (None, 85, 32)            4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 85, 32)            0         
_________________________________________________________________
activation (Activation)      (None, 85, 32)            0         
_________________________________________________________________
flatten (Flatten)            (None, 2720)              0

In [None]:
#Or we can import the model
model = tf.keras.models.load_model(model_name)
model.summary()

In [11]:
%time model.fit(tweet_padded, y_train, batch_size=512, epochs=10, validation_data=(test_tweet_pad, y_test), shuffle=True, callbacks=callbacks)

Train on 2250000 samples, validate on 250000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 3d 17h 7min 4s, sys: 22h 10min 15s, total: 4d 15h 17min 19s
Wall time: 20h 8min 48s


<tensorflow.python.keras.callbacks.History at 0x7f7b9000f290>

In [12]:
#Save the model if needed
model.save(model_name)

In [13]:
# Predict
predictions = model.predict(x=test_tweet_pad)

In [18]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = sklearn.metrics.f1_score(y_test, (predictions > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1), end='\r')
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

Optimal probabilty threshold is 0.42 for maximum F1 score 0.8796874401501513


In [22]:
print(classification_report(y_test, predict_labels(predictions, opt_prob)))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.00      0.00      0.00    124879
         1.0       0.84      0.92      0.88    125121

    accuracy                           0.46    250000
   macro avg       0.28      0.31      0.29    250000
weighted avg       0.42      0.46      0.44    250000



## Final Prediction for LSTM

In [23]:
# Retrain the model on the entire dataset
#clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
#clf.fit(all_tweets_vectors, full_labels)

# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize it
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
test_tokens = [tknzr.tokenize(tweet) for tweet in test_x]

# Generate vector representation
#all_tweets_vectors = np.array([generateTweetVector(model.wv, words) for words in test_tokens])
test_sequences = tokenizer.texts_to_sequences(test_tokens)
test_tweet_pad = pad_sequences(test_sequences, maxlen=max_length)

# Predict
predictions = model.predict(test_tweet_pad)

# Save predictions
create_csv_submission(test_ids, predict_labels(predictions, opt_prob), OUTPUT_PATH)