# Models Evaluation

In [1]:
%matplotlib inline
%load_ext autoreload
%load_ext tensorboard
%autoreload 2

from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
import os.path
from tokenizer import *

# Load library
from nltk.corpus import stopwords
from gensim import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api
import re

# Data input and output paths (all relative to the notebook's working directory)
# Training tweets that receive the positive label (+1) in the import cell
POS_TRAIN_PATH = '../data/twitter-datasets/train_pos_full.txt' 
# Training tweets that receive the negative label (-1) in the import cell
NEG_TRAIN_PATH = '../data/twitter-datasets/train_neg_full.txt' 
# Unlabelled tweets scored in the "Final Prediction" section
DATA_TEST_PATH = '../data/twitter-datasets/test_data.txt'
# CSV file the final submission is written to
OUTPUT_PATH = 'predictions_out.csv'

# Pickle cache of the tokenized training tweets (read and written by the Tokenize cell)
TOKENS_PATH = "../saved_gen_files/all_tokens.txt"
# NOTE(review): not referenced in the visible notebook — presumably a cache for tweet vectors; confirm or remove
FULL_TRAIN_TWEET_VECTORS = "../saved_gen_files/train_tweet_vectors.txt"

## Import data

In [2]:
# Load the positive and negative training tweets (the returned ids are not used below)
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)

# Flatten both sets into a single array: positives first, then negatives
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)

# One label per tweet, in the same order as full_dataset: +1 positive, -1 negative.
# The negative block is sized from the negative set itself so the labels stay
# aligned with the tweets even when the two files have different lengths.
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(neg_text_train))), axis=None)

# Guard against any tweet/label misalignment before it reaches training
assert len(full_labels) == len(full_dataset), "every tweet must have exactly one label"

## Tokenize

In [3]:
# Tokenizing the full corpus is slow, so the result is cached as a pickle:
# compute and store it on the first run, simply reload it afterwards.
if not os.path.isfile(TOKENS_PATH):
    all_tokens = list(map(tokenize, full_dataset))

    with open(TOKENS_PATH, 'wb') as cache_file:
        pickle.dump(all_tokens, cache_file)
else:
    with open(TOKENS_PATH, 'rb') as cache_file:
        all_tokens = pickle.load(cache_file)

In [None]:
# Generate bigrams
#all_tokens = computeBigrams(all_tokens)

# Generate Trigrams
#all_tokens = computeBigrams(all_tokens)

## Word Embedding

Choose one of the embedding algorithms below and the dimension of the word vectors.



In [5]:
# Dimension of the word vectors, shared by every embedding option below.
# The pre-trained GloVe option loads "glove-twitter-<DIM>" and the self-trained one
# reads "vectors_d<DIM>.txt", so DIM must match an available model/file.
DIM = 50

### Word2Vec

In [None]:
# Build the Word2Vec embedding from the tokenized tweets (train=True)
wv = getWord2VecDict(
    all_tokens,
    size=DIM,
    window=10,
    min_count=2,
    workers=10,
    iters=10,
    train=True,
)

### GloVe

#### Pre-trained

In [5]:
# Load the pre-trained GloVe Twitter vectors of dimension DIM through the gensim downloader.
# api.load() already returns the word vectors for this model, so the deprecated
# `.wv` indirection (removed in gensim 4) is not needed.
wv = api.load("glove-twitter-" + str(DIM))

  """Entry point for launching an IPython kernel.


#### Self-trained

In [6]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Self-trained GloVe vectors in GloVe text format, one file per embedding dimension
glove_file = '../data/self_trained_gloves/vectors_d'+str(DIM)+'.txt'
# Temporary file that receives the word2vec-format copy of the vectors
tmp_file = get_tmpfile("test_word2vec.txt")

# Convert the GloVe file to word2vec text format so KeyedVectors can read it
_ = glove2word2vec(glove_file, tmp_file)

wv = KeyedVectors.load_word2vec_format(tmp_file)

# Normalize the vectors in place (replace=True overwrites the raw vectors)
wv.init_sims(replace=True)



### FastText

In [None]:
# Build the FastText word embedding from the tokenized tweets.
# NOTE(review): train=False here, unlike the Word2Vec cell — presumably this loads a
# previously trained model instead of training one; confirm in getFasttextDict.
wv = getFasttextDict(all_tokens, size=DIM, window=10, min_count=2, workers=10, iters=10, train=False)

## Model Selection

### Linear Regression

In [26]:
# Convert every tokenized tweet into features using the embedding `wv` chosen above.
# NOTE(review): presumably yields one fixed-length numeric vector per tweet, since the
# result is fed directly to scikit-learn estimators — confirm in generateTweetsFeatures.
all_tweets_vectors = generateTweetsFeatures(all_tokens, wv)

In [27]:
# Split into training and testing sets.
# Only 10% of the tweets are used for training and another 10% for testing.
# random_state pins the shuffle so the reported scores can be reproduced on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    all_tweets_vectors,
    full_labels,
    test_size=0.1,
    train_size=0.1,
    random_state=42,
)

In [28]:
# Train and test the model.
# Ridge is a regressor, so predict() returns continuous scores; the project helper
# predict_labels() is applied to turn them into class labels before scoring
# (presumably by thresholding — confirm in tools.predict_labels).
clf = linear_model.Ridge(alpha=0.1)
# Alternative estimator kept for experimentation
#clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predict_labels(predictions)))
# NOTE(review): "72-82" is an unexplained note — presumably a range of observed scores; confirm or remove
#72-82


              precision    recall  f1-score   support

        -1.0       0.79      0.75      0.77    250135
         1.0       0.76      0.81      0.78    249865

    accuracy                           0.78    500000
   macro avg       0.78      0.78      0.77    500000
weighted avg       0.78      0.78      0.77    500000



### SVM

In [13]:
from sklearn import svm

# Fit an SVM classifier on the current X_train/y_train split and report its scores.
# NOTE(review): the ValueError recorded under this cell is consistent with X_train holding
# token lists instead of numeric vectors: this cell ran as In [13], after the LSTM split
# (In [11]) had rebound X_train/X_test to tokenized tweets. Re-run the vector split cell
# of the "Linear Regression" section right before this one.
clf_svm = svm.SVC(gamma='scale')
clf_svm.fit(X_train, y_train)
predict_svm = clf_svm.predict(X_test)
# NOTE(review): SVC.predict already returns class labels, so predict_labels() is presumably redundant here — confirm
print(classification_report(y_test, predict_labels(predict_svm)))

ValueError: setting an array element with a sequence.

### LSTM

In [7]:
# Keep only the tokens that are present in the embedding `wv`
all_tokens = [[token for token in tweet if token in wv] for tweet in all_tokens]


In [11]:
# Force the negative sentiment to be classified as 0 instead of -1.
# Work on a copy: `labels = full_labels` would only alias the array, and the in-place
# assignment below would then silently rewrite full_labels too, corrupting the -1/+1
# labels for any cell that is (re-)run afterwards.
labels = full_labels.copy()
labels[labels < 0] = 0
use_tensorboard = False
## Split the dataset
# 1% of the tweets for training, 10% for testing; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    all_tokens,
    labels,
    test_size=0.1,
    train_size=0.01,
    random_state=42,
)

In [12]:
# Open the TensorBoard UI inline (reading the `logs` directory) only when the
# use_tensorboard flag set in the split cell is enabled
if use_tensorboard:
    %tensorboard --logdir logs

In [None]:
# Build the LSTM model (use_gru=False) and fit it on the training split
from lstm import *

model = LSTM_Model(all_tokens, use_gru=False, tensorboard=use_tensorboard)
model.train_model(
    X_train,
    y_train,
    wv,
    batch_size=128,
    epochs=10,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 50)           2736900   
_________________________________________________________________
dropout (Dropout)            (No

In [34]:
# Test the model on the held-out split
predictions = model.predict(X_test)
# Map any negative prediction to 0 so the values line up with the 0/1 encoding of y_test
# NOTE(review): presumably model.predict can return -1 for the negative class — confirm in lstm.LSTM_Model
predictions[predictions<0] = 0
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.82      0.84    125553
         1.0       0.83      0.86      0.84    124447

    accuracy                           0.84    250000
   macro avg       0.84      0.84      0.84    250000
weighted avg       0.84      0.84      0.84    250000



In [None]:
# Search for the decision threshold that maximizes the F1 score on the held-out split.
# f1_score is imported explicitly here: none of the notebook's explicit imports binds
# the name `metrics`, so `metrics.f1_score` would depend on a star import to resolve.
from sklearn.metrics import f1_score

opt_prob = None  # best threshold found so far; stays None if no threshold reaches F1 > 0
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    # Round away the floating-point noise accumulated by arange
    thresh = np.round(thresh, 2)
    # Binarize the predictions at the candidate threshold and score them
    f1 = f1_score(y_test, (predictions > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1), end='\r')
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

## Final Prediction

In [19]:
# Optional retraining of a linear classifier on the entire dataset (disabled)
#clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
#clf.fit(all_tweets_vectors, full_labels)

# Read the tweets to predict together with their ids
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize each tweet and keep only the tokens present in the embedding `wv`
test_tokens = [
    [token for token in tokenize(tweet) if token in wv]
    for tweet in test_x
]

# Vector representation of the test tweets (disabled)
#all_tweets_vectors = np.array([generateTweetVector(model.wv, words) for words in test_tokens])

# Score the test tweets with the trained model
predictions = model.predict(test_tokens)

# Convert the scores to labels using the tuned threshold and write the submission file
create_csv_submission(test_ids, predict_labels(predictions, opt_prob), OUTPUT_PATH)