# Models Evaluation

In [12]:
from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
# Load library
from nltk.corpus import stopwords
from gensim import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api
import re
# Data input and output paths
POS_TRAIN_PATH = '../data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = '../data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = '../data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'
TOKENS_PATH = "../saved_gen_files/all_tokens.txt"
W2V_MODEL_PATH = "../saved_gen_files/w2v.model"
FastText_MODEL_PATH = "../saved_gen_files/fasttext.model"

FULL_TRAIN_TWEET_VECTORS = "../saved_gen_files/train_tweet_vectors.txt"

## Import data

In [75]:
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(pos_text_train))), axis=None)


## Tokenization

In [43]:
# Top pos smiley used and not removed
pos_smiley = ["\(':", "\(':<", "\(';", "\(\*:", "\(\*;", "\(:", "\(;", "\(=", ":'\)", ":'\]", ":'\}", 
              ":\)", ":\*\)", ":\*:", ":-\]", ":-\}", ":\]", ":\}", ";'\)", ";'\]", ";\)", ";\*\)", ";-\}"
             , ";\]", ";\}", "=\)", "\(=", "<3", ":p", ":D", "xD", ":)"]


# Top neg smiley used and not removed
neg_smiley = ["\)':", "\)':<", "\)';", "\)=", "\)=<", "/':", "/';", "/-:", "/:", "/:<", "/;", 
              "/;<", "/=", ":'/", ":'@", ":'\[", ":'\\", ":'\{", ":'\|", ":\(", ":\*\(", ":\*\{", 
            ":\*\|", ":-/", ":-@", ":-\[", ":-\\", ":-\|", ":/", ":@", ":\[", ":\\", ":\{", ":\|"
             , ";'\(", ";'/", ";'\[", ";\*\{", ";-/", ";-\|", ";/", ";@", ";\[", ";\\", ";\{", ";\|"
             ,"=\(", "</3"]


# Top word without sentiment meaning nor negative form possible
stop_words= ["i", "you", "it", "she", "he", "we", "they", "a", "in", "to", "the", "and", "my", "me", "of", "for", "that", "this", "on", "so", "be", "just", "your", "at", "its", "im", ".", ",", ")", "'", "(", "or", "by", "am", "ve", "our", "\"", "<", ">", "&", "\\", ":", "-", ";", "/"]

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

def processTweet(tweet):
    # Tokenize
    tokens = tknzr.tokenize(tweet)
    
    for word in tokens:
        if word in pos_smiley:
            word = '<pos_smiley>'
        elif word in neg_smiley:
            word = '<neg_smiley>'
    
    tokens = [word for word in tokens if word not in stop_words and not word.isnumeric()]
    
    return tokens
    

all_tokens = [tknzr.tokenize(tweet) for tweet in full_dataset]

# Save 
with open(TOKENS_PATH, "wb") as fp:   #Pickling
    pickle.dump(all_tokens, fp)

In [15]:
with open(TOKENS_PATH, "rb") as fp:   # Unpickling
    all_tokens = pickle.load(fp)
tks = np.array(all_tokens)
flat_list = [item for sublist in tks for item in sublist]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
dict_words = get_top_n_words(all_tokens)


## Word Embedding
### Word2Vec

In [52]:
#with open(TOKENS_PATH, "rb") as fp:   # Unpickling
#    all_tokens = pickle.load(fp)

# Train a word2vec model to generate embedding
model = models.Word2Vec(
        all_tokens,
        size=200,
        window=10,
        min_count=2,
        workers=10,
        iter=10)
model.save(W2V_MODEL_PATH)

### Glove

In [None]:
model = api.load("glove-twitter-50")

### Fastext

In [None]:
with open(TOKENS_PATH, "rb") as fp:   # Unpickling
    all_tokens = pickle.load(fp)

# Train a word2vec model to generate embedding
model = models.FastText(
        all_tokens,
        size=50,
        window=10,
        min_count=2,
        workers=10,
        iter=10)

model.save(FastText_MODEL_PATH)

## Feature Engineering

In [53]:
def generateTweetVector(word_dic, words):
    num_words = len(words)
    if num_words < 1:
        num_words = 1
        
    vector = np.zeros(word_dic.vector_size)
    for word in words:
        if word in word_dic.vocab:
            vector += word_dic[word]
    vector /= num_words
    return vector

all_tweets_vectors = np.array([generateTweetVector(model.wv, words) for words in all_tokens])

# Save 
with open(FULL_TRAIN_TWEET_VECTORS, "wb") as fp:   #Pickling
    pickle.dump(all_tweets_vectors, fp)

## Model Selection

## Test

In [4]:
with open(FULL_TRAIN_TWEET_VECTORS, "rb") as fp:   # Unpickling
    all_tweets_vectors = pickle.load(fp)
    
X_train, X_test, y_train, y_test = train_test_split(all_tweets_vectors, full_labels)

In [55]:
clf = linear_model.Ridge(alpha=0.1)
#clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predict_labels(predictions)))

              precision    recall  f1-score   support

        -1.0       0.80      0.75      0.77    312802
         1.0       0.76      0.81      0.79    312198

    accuracy                           0.78    625000
   macro avg       0.78      0.78      0.78    625000
weighted avg       0.78      0.78      0.78    625000



In [76]:
w2v =  Word2Vec.load(W2V_MODEL_PATH)
with open(TOKENS_PATH, "rb") as fp:   # Unpickling
    all_tokens = pickle.load(fp)

full_labels[full_labels<0] = 0

In [77]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, GRU
from keras.layers import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec



max_length = max([len(tweet_tokens) for tweet_tokens in all_tokens])

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(all_tokens)
sequences = tokenizer_obj.texts_to_sequences(all_tokens)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens'% len(word_index))

tweet_pad = pad_sequences(sequences, maxlen=max_length)

num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))

for word, i in word_index.items():
    if i>num_words:
        continue
    if word in w2v.wv.vocab:
        embedding_matrix[i] = w2v.wv[word]
        
print(num_words)

Found 592851 unique tokens
592852


In [71]:
VALIDATION_SPLIT = 0.2
indices = np.arange(tweet_pad.shape[0])
np.random.shuffle(indices)
tweet_pad=tweet_pad[indices]
sentiment = full_labels[indices]
#num_validatiion_samples = int(VALIDATION_SPLIT * tweet_pad.shape[0])

#X_train = tweet_pad[:-num_validatiion_samples]
#y_train = sentiment[:-num_validatiion_samples]

#X_test = tweet_pad[-num_validatiion_samples:]
#y_test = sentiment[-num_validatiion_samples:]

#print("X_train: ", X_train.shape)
#print("y_train: ", y_train.shape)
#print("X_test: ", X_test.shape)
#print("y_test: ", y_test.shape)

X_train:  (2000000, 128)
y_train:  (2000000,)
X_test:  (500000, 128)
y_test:  (500000,)


In [79]:
from keras.initializers import Constant
from keras.layers import Dense, Dropout, Activation, GRU

model = Sequential()
model.add(Embedding(num_words, 200, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable= False))
model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(64))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)
print (model.summary())
model.fit(tweet_pad, full_labels, batch_size=128, epochs=5, validation_split=0.1, shuffle=True, callbacks=[reduce_lr])
'''

model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=[X_test, y_test],verbose=2)
'''


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 128, 200)          118570400 
_________________________________________________________________
dropout_7 (Dropout)          (None, 128, 200)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               168448    
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_7 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)               

"\n\nmodel.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))\nmodel.add(Dense(1, activation = 'sigmoid'))\nmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\nmodel.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=[X_test, y_test],verbose=2)\n"

In [73]:
with open(TOKENS_PATH, "rb") as fp:   # Unpickling
    all_tokens = pickle.load(fp)
test_sentences = all_tokens
test_sequences = tokenizer_obj.texts_to_sequences(test_sentences)
test_tweet_pad = pad_sequences(test_sequences, maxlen=max_length)

predictions = model.predict(x=test_tweet_pad)
print(classification_report(full_labels, predict_labels(predictions, 0.5)))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.00      0.00      0.00   1250000
         1.0       0.79      0.86      0.82   1250000

    accuracy                           0.43   2500000
   macro avg       0.26      0.29      0.27   2500000
weighted avg       0.40      0.43      0.41   2500000



In [55]:
full_labels

array([1., 1., 1., ..., 1., 1., 1.])

## Predict

In [80]:
# Retrain the model on the entire dataset
#clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
#clf.fit(all_tweets_vectors, full_labels)

# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize it
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
test_tokens = [tknzr.tokenize(tweet) for tweet in test_x]

# Generate vector representation
#all_tweets_vectors = np.array([generateTweetVector(model.wv, words) for words in test_tokens])
test_sequences = tokenizer_obj.texts_to_sequences(test_tokens)
test_tweet_pad = pad_sequences(test_sequences, maxlen=max_length)

# Predict
predictions = model.predict(test_tweet_pad)

# Save predictions
create_csv_submission(test_ids, predict_labels(predictions, 0.5), OUTPUT_PATH)

In [None]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
test_tokens = [tknzr.tokenize(tweet) for tweet in ["Great!! it is raining today!!", "I'm so sad", "I'm so happy","love"]]

test_sequences = tokenizer_obj.texts_to_sequences(["Great!! it is raining today!!", "I'm so sad", "I'm so happy","love"])
test_tweet_pad = pad_sequences(test_sequences, maxlen=max_length)

# Predict
preds = model.predict(tweet_pad)
preds_2 = predict_labels(preds, 0.5)
print(classification_report(full_labels, preds_2))

