# Models Evaluation

In [1]:
from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
import os.path

# Load library
from nltk.corpus import stopwords
from gensim import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api
import re

# Raw tweet files (labelled training sets and the unlabelled test set)
POS_TRAIN_PATH = "../data/twitter-datasets/train_pos_full.txt"
NEG_TRAIN_PATH = "../data/twitter-datasets/train_neg_full.txt"
DATA_TEST_PATH = "../data/twitter-datasets/test_data.txt"

# File the final predictions are written to
OUTPUT_PATH = "predictions_out.csv"

# Cache files for intermediate results that are expensive to regenerate
TOKENS_PATH = '../saved_gen_files/all_tokens.txt'
FULL_TRAIN_TWEET_VECTORS = '../saved_gen_files/train_tweet_vectors.txt'

## Import data

In [2]:
# Load the positive and negative training tweets (the returned ids are not used below).
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)

# Stack positives first, then negatives, into one flat array of tweets.
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)

# Labels follow the same order: +1 per positive tweet, -1 per negative tweet.
# The negative block must be sized from neg_text_train (not pos_text_train),
# otherwise labels and tweets fall out of alignment when the files differ in length.
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(neg_text_train))), axis=None)
assert len(full_labels) == len(full_dataset), "labels and tweets must have the same length"

## Tokenize

In [3]:
# Tokenizing the full dataset is slow, so the result is cached on disk:
# tokenize and save on the first run, reload the pickle on later runs.
if not os.path.isfile(TOKENS_PATH):
    all_tokens = []
    for tweet in full_dataset:
        all_tokens.append(
            tokenizeTweet(tweet, stop_words=False,
                          smiley_tag=False, strip_handles=True,
                          reduce_len=True, preserve_case=False)
        )

    with open(TOKENS_PATH, 'wb') as cache_file:
        pickle.dump(all_tokens, cache_file)
else:
    # NOTE: pickle.load executes arbitrary code; only load cache files you generated yourself.
    with open(TOKENS_PATH, 'rb') as cache_file:
        all_tokens = pickle.load(cache_file)

## Word Embedding

Choose the dimensionality of the word vectors, then run exactly one of the embedding cells below (Word2Vec, GloVe or FastText).



In [4]:
# Dimensionality of the word vectors: passed as `size` to the Word2Vec and
# FastText helpers below and used to build the GloVe model name.
DIM = 50

### Word2Vec

In [5]:
# Build the Word2Vec embedding through the project helper.
# NOTE(review): train=False presumably reuses a previously trained model
# instead of training a new one -- confirm in getWord2VecDict.
wv = getWord2VecDict(
    all_tokens,
    size=DIM,
    window=10,
    min_count=2,
    workers=10,
    iters=10,
    train=False,
)

### GloVe

In [None]:
# Load the pre-trained GloVe Twitter vectors of the requested dimensionality.
# DIM is an int, so it must be converted before being joined to the model name
# (the original `"glove-twitter-" + DIM` raised TypeError).
# NOTE(review): `.wv` is kept from the original; on a KeyedVectors object it is a
# deprecated alias in gensim 3.x and is removed in gensim 4 -- drop it when upgrading.
wv = api.load("glove-twitter-" + str(DIM)).wv

### FastText

In [None]:
# Build the FastText embedding through the project helper.
# NOTE(review): train=False presumably reuses a previously trained model
# instead of training a new one -- confirm in getFasttextDict.
wv = getFasttextDict(
    all_tokens,
    size=DIM,
    window=10,
    min_count=2,
    workers=10,
    iters=10,
    train=False,
)

## Model Selection

### Linear Regression

In [None]:
# Convert every tokenized tweet into a feature vector using the embedding `wv`
# selected in the Word Embedding section above.
all_tweets_vectors = generateTweetsFeatures(all_tokens, wv)

In [None]:
# Split into training (80%) and testing (20%) sets.
# random_state is fixed so the split -- and therefore the reported scores --
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    all_tweets_vectors, full_labels,
    test_size=0.2, train_size=0.8, random_state=42)

In [None]:
# Fit a ridge regressor on the tweet vectors, then score it on the held-out split.
# Alternative classifier kept for reference:
#clf = linear_model.LogisticRegression()
clf = linear_model.Ridge(alpha=0.1).fit(X_train, y_train)

# The regressor outputs continuous scores; predict_labels turns them into class labels.
predictions = clf.predict(X_test)
report = classification_report(y_test, predict_labels(predictions))
print(report)

### LSTM

In [6]:
# Force the negative sentiment to be classified as 0 instead of -1.
# Work on a copy: `labels = full_labels` would only alias the array, so the
# in-place assignment below would also rewrite full_labels and silently change
# the labels seen by the linear-model cells on a re-run.
labels = full_labels.copy()
labels[labels < 0] = 0

# Split the dataset (10% for training, 10% for testing; the rest is left unused).
# random_state is fixed so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    all_tokens, labels, test_size=0.1, train_size=0.1, random_state=42)

In [8]:
# Build the LSTM model from the training tweets and fit it with the chosen embedding.
# NOTE(review): this star import sits mid-notebook and may shadow earlier names;
# consider moving an explicit `from lstm import LSTM_Model` to the import cell.
from lstm import *

model = LSTM_Model(X_train)
model.train_model(
    X_train,
    y_train,
    wv,
    batch_size=128,
    epochs=5,
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 82, 50)            11779500  
_________________________________________________________________
dropout_3 (Dropout)          (None, 82, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 450000 samples, validate on 50000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Evaluate the trained LSTM on the held-out tweets.
predictions = model.predict(X_test)

# Map any negative prediction to 0 so the values match the 0/1 labels in y_test.
negative_mask = predictions < 0
predictions[negative_mask] = 0

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.88      0.73      0.80    249783
         1.0       0.77      0.90      0.83    250217

    accuracy                           0.81    500000
   macro avg       0.82      0.81      0.81    500000
weighted avg       0.82      0.81      0.81    500000



## Final Prediction

In [None]:
# Predict the unlabelled test tweets with the LSTM trained in the section above.
# NOTE(review): the model is NOT retrained on the entire dataset here; it only
# saw the training split from the LSTM section.

# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize with the same helper and options used for the training tweets, so the
# test tokens go through exactly the same preprocessing as the training data.
test_tokens = [tokenizeTweet(tweet, stop_words=False,
                             smiley_tag=False, strip_handles=True,
                             reduce_len=True, preserve_case=False) for tweet in test_x]

# Predict. The model is given the token lists directly, as in the evaluation
# cell where model.predict receives X_test (a split of all_tokens). The previous
# version padded the sequences by hand using tokenizer_obj / max_length, names
# that are not defined in any visible cell of this notebook.
predictions = model.predict(test_tokens)

# Save predictions
create_csv_submission(test_ids, predict_labels(predictions, 0.5), OUTPUT_PATH)