# Models Evaluation

In [2]:
%matplotlib inline
%load_ext autoreload
%load_ext tensorboard
%autoreload 2

from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
import os.path
from tokenizer import *

# Load library
from nltk.corpus import stopwords
from gensim import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api
import re

# Data input and output paths
POS_TRAIN_PATH = '../data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = '../data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = '../data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'

TOKENS_PATH = "../saved_gen_files/all_tokens.txt"
FULL_TRAIN_TWEET_VECTORS = "../saved_gen_files/train_tweet_vectors.txt"

## Import data

In [111]:
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(pos_text_train))), axis=None)

## Tokenize

In [112]:
if os.path.isfile(TOKENS_PATH):
    with open(TOKENS_PATH, 'rb') as f:
        all_tokens = pickle.load(f)
else:    
    all_tokens = [tokenize(tweet) for tweet in full_dataset]
    
    with open(TOKENS_PATH, 'wb') as f:
        pickle.dump(all_tokens, f)

>Use Bigram and Trigram only with self trained Glove on Bigram and Trigram

In [113]:
# Generate bigrams
all_tokens = computeBigrams(all_tokens)

# Generate Trigrams
#all_tokens = computeBigrams(all_tokens)

## Word Embedding

Choose one of the embedding algo and the dimensions of the vectors



In [5]:
DIM = 50

### Word2Vec

In [None]:
wv = getWord2VecDict(all_tokens, size=DIM, window=10, min_count=2, workers=10, iters=10, train=True)

### Glove

#### Pre-trained

In [63]:
wv = api.load("glove-twitter-" + str(DIM)).wv

  """Entry point for launching an IPython kernel.


#### Self-trained

In [114]:
from gensim.test.utils import datapath
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = '../data/self_trained_gloves/vectors_d'+str(DIM)+'.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

wv = KeyedVectors.load_word2vec_format(tmp_file)

# Normalize 
wv.init_sims(replace=True)



### Fastext

In [None]:
# Train a word2vec model to generate embedding
wv = getFasttextDict(all_tokens, size=DIM, window=10, min_count=2, workers=10, iters=10, train=False)

## Model Selection

### Linear Regression

In [6]:
# Convert tweet in features with previous embedding system
all_tweets_vectors = generateTweetsFeatures(all_tokens, wv)

In [16]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_tweets_vectors, full_labels, test_size=0.1, train_size=0.1)



In [17]:
from sklearn import svm
# Train and test the model
clf = svm.SVC(kernel='rbf')
#clf = linear_model.Ridge(alpha=0.1)
#clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predict_labels(predictions)))
#72-82




              precision    recall  f1-score   support

        -1.0       0.77      0.72      0.75    124637
         1.0       0.74      0.79      0.77    125363

    accuracy                           0.76    250000
   macro avg       0.76      0.76      0.76    250000
weighted avg       0.76      0.76      0.76    250000



### SVM

In [13]:
from sklearn import svm

clf_svm = svm.SVC(gamma='scale')
clf_svm.fit(X_train, y_train)
predict_svm = clf_svm.predict(X_test)
print(classification_report(y_test, predict_labels(predict_svm)))

ValueError: setting an array element with a sequence.

### LSTM

In [7]:
all_tokens = [list(filter(lambda i: i in wv, tweet)) for tweet in all_tokens]


In [11]:
# Force the negative sentiment to be clasified to 0 instead of 1 
labels = full_labels
labels[labels<0] = 0
use_tensorboard = False
## Split the dataset
X_train, X_test, y_train, y_test = train_test_split(all_tokens, labels, test_size=0.1, train_size=0.01)

In [12]:
if use_tensorboard:
    %tensorboard --logdir logs

In [66]:
# Train the model
from lstm import *
model= LSTM_Model(all_tokens, use_gru=False, tensorboard=False)
model.train_model(X_train, y_train, wv, batch_size=128, epochs=10)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 128, 50)           59675700  
_________________________________________________________________
dropout_16 (Dropout)         (None, 128, 50)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 128)          58880     
_________________________________________________________________
dense_8 (Dense)              (None, 128, 32)           4128      
_____________________________

KeyboardInterrupt: 

In [34]:
# Test the model
predictions = model.predict(X_test)
predictions[predictions<0] = 0
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.82      0.84    125553
         1.0       0.83      0.86      0.84    124447

    accuracy                           0.84    250000
   macro avg       0.84      0.84      0.84    250000
weighted avg       0.84      0.84      0.84    250000



In [None]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(y_test, (predictions > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1), end='\r')
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

## Deep-wise Sep Convolutionnal NN

In [131]:
#all_tokens = [list(filter(lambda i: i in wv, tweet)) for tweet in all_tokens]
# Force the negative sentiment to be clasified to 0 instead of 1 
labels = full_labels
labels[labels<0] = 0
X_train, X_test, y_train, y_test = train_test_split(all_tokens, labels, test_size=0.1, train_size=0.1, random_state=1)


In [132]:
from tensorflow.keras.models import Sequential, load_model

from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers
import tensorflow as tf
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, Activation, GRU, LSTM, Bidirectional, Flatten, GlobalMaxPool1D

activation = 'sigmoid'
units = 1
embedding_vectors= wv
max_length = max([len(tweet_tokens) for tweet_tokens in all_tokens])
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(all_tokens)

dropout_rate=0.2
filters = 64
kernel_size= 7
pool_size=7
learning_rate=1e-3


# Transform each unique word in unique int identifier
sequences = tokenizer_obj.texts_to_sequences(X_train)


# Pad the tweet to have all the same size
tweet_padded = pad_sequences(sequences, maxlen=max_length)


# Construct our model with keras
model = Sequential()

# Add the embedding layer with our trained embedding matrix
embedding_layer = Embedding(input_dim=embedding_vectors.syn0.shape[0] , output_dim=embedding_vectors.syn0.shape[1], weights=[embedding_vectors.syn0], 
                        input_length=tweet_padded.shape[1])

model.add(embedding_layer)


model.add(Dropout(rate=dropout_rate))
model.add(SeparableConv1D(filters=filters,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              kernel_initializer='glorot_uniform',
                              padding='same'))
model.add(SeparableConv1D(filters=filters,
                          kernel_size=kernel_size,
                          activation='relu',
                          bias_initializer='random_uniform',
                          depthwise_initializer='random_uniform',
                          kernel_initializer='glorot_uniform',
                          padding='same'))

model.add(MaxPooling1D(pool_size=pool_size))


model.add(SeparableConv1D(filters=filters * 2,
                      kernel_size=kernel_size,
                      activation='relu',
                      bias_initializer='random_uniform',
                      depthwise_initializer='random_uniform',
                      padding='same'))
model.add(SeparableConv1D(filters=filters * 2,
                      kernel_size=kernel_size,
                      activation='relu',
                      bias_initializer='random_uniform',
                      depthwise_initializer='random_uniform',
                      padding='same'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(rate=dropout_rate))
model.add(Dense(1))
model.add(Activation('sigmoid'))

loss = 'binary_crossentropy'
optimizer = tf.keras.optimizers.Adam(lr=learning_rate)

model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

print (model.summary())
model.fit(tweet_padded, y_train, batch_size=128, epochs=5, validation_split=0.1, shuffle=True)



Model: "sequential_50"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 128, 50)           2736900   
_________________________________________________________________
dropout_61 (Dropout)         (None, 128, 50)           0         
_________________________________________________________________
separable_conv1d_126 (Separa (None, 128, 64)           3614      
_________________________________________________________________
separable_conv1d_127 (Separa (None, 128, 64)           4608      
_________________________________________________________________
max_pooling1d_36 (MaxPooling (None, 18, 64)            0         
_________________________________________________________________
separable_conv1d_128 (Separa (None, 18, 128)           8768      
_________________________________________________________________
separable_conv1d_129 (Separa (None, 18, 128)         

<tensorflow.python.keras.callbacks.History at 0x14ca6b6d0>

In [123]:
# Transform each unique word in unique int identifier
sequences = tokenizer_obj.texts_to_sequences(X_test)

# Pad the tweet to have all the same size
tweet_padded = pad_sequences(sequences, maxlen=max_length)

predictions = model.predict(tweet_padded)

#print(classification_report(y_test, predictions))


In [124]:
pred = np.array(predictions, copy=True)
threshold = 0.5

pred[pred<threshold] = 0
pred[pred>=threshold] = 1

print(classification_report(y_test, pred))
print(accuracy_score(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86    249914
         1.0       0.85      0.87      0.86    250086

    accuracy                           0.86    500000
   macro avg       0.86      0.86      0.86    500000
weighted avg       0.86      0.86      0.86    500000

0.858672


In [101]:
predictions

array([[1.],
       [1.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]], dtype=float32)

## Final Prediction

In [19]:
# Retrain the model on the entire dataset
#clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
#clf.fit(all_tweets_vectors, full_labels)

# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize it
test_tokens = [tokenize(tweet) for tweet in test_x]
test_tokens = [list(filter(lambda i: i in wv, tweet)) for tweet in test_tokens]

# Generate vector representation
#all_tweets_vectors = np.array([generateTweetVector(model.wv, words) for words in test_tokens])

# Predict
predictions = model.predict(test_tokens)

# Save predictions
create_csv_submission(test_ids, predict_labels(predictions, opt_prob), OUTPUT_PATH)