In [4]:
import time
import json
import pickle
import re
import html
import sys
import gc

import gensim

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

import pandas as pd
from pandas import DataFrame

from scipy.stats import spearmanr, pearsonr
import numpy as np

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input
from keras.layers.wrappers import Bidirectional
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize.casual import TweetTokenizer

from IPython.display import display, HTML

In [5]:
def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))

In [11]:
glove_twitter_path = "/Users/COMMANDER/Desktop/MScBA_Consultantcy_Project/GloVe/glove.twitter.27B/"
glove_path_6B = "/Users/COMMANDER/Desktop/MScBA_Consultantcy_Project/GloVe/glove.6B/"
glove_path_42B = "/Users/COMMANDER/Desktop/MScBA_Consultantcy_Project/GloVe/glove.42B.300d/"
glove_path_840B = "/Users/COMMANDER/Desktop/MScBA_Consultantcy_Project/GloVe/glove.840B.300d/"
glove_path_twitter = "/Users/COMMANDER/Desktop/MScBA_Consultantcy_Project/GloVe/glove.twitter.27B/"
wassa_home = "/Users/COMMANDER/Downloads/"

## Word2Vec + GloVe

In [7]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r',encoding="utf8")
    model = {}
    num = 1
    for line in f:
        try:
            splitLine = line.split()
            word = splitLine[0]
            embedding = [float(val) for val in splitLine[1:]]
            model[word] = np.array(embedding)
            num += 1
        except Exception as e:
            print("Failed at line " + str(num))
    print("Done.",len(model)," words loaded!")
    return model

In [12]:
# Google news pretrained vectors
wv_model_path = "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

In [13]:
# Twitter pretrained vectors
wv_model_path_1 = "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

In [14]:
wv_model_path_2 = glove_path_twitter + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

Loading Glove Model
Done. 1193514  words loaded!


In [15]:
wv_model_path_3 = glove_path_6B + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)

Loading Glove Model
Done. 400000  words loaded!


In [28]:
wv_model_path_4 = glove_path_42B + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)

Loading Glove Model
Done. 1917494  words loaded!


In [44]:
wv_model_path_5 = glove_path_840B + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)

Loading Glove Model
Failed at line 52344
Failed at line 128261
Failed at line 151101
Failed at line 200666
Failed at line 209830
Failed at line 220775
Failed at line 253456
Failed at line 365739
Failed at line 532041
Failed at line 717294
Failed at line 994809
Failed at line 1123321
Failed at line 1148398
Failed at line 1352098
Failed at line 1499714
Failed at line 1533795
Failed at line 1899826
Failed at line 1921136
Failed at line 2058949
Failed at line 2165228
Done. 2195885  words loaded!


In [112]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])

print(w2v_dimensions, w2v_dimensions_1, w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4, w2v_dimensions_5)

300 400 200 300 300 300


In [113]:
def get_word2vec_embedding(word, model, dimensions):

    vec_rep = np.zeros(dimensions)
    if word in model:
        vec_rep = model[word]
    
    return vec_rep

In [56]:
wnl = WordNetLemmatizer()
tknzr = TweetTokenizer()

In [57]:
from negate import NEGATE as neg
import nltk
nltk_sw = nltk.corpus.stopwords.words('english') #Number of nltk stop words: 179
nltk_sw_neg = [x for x in nltk_sw if x not in neg] #Number of nltk stop words without negating words: 158

In [58]:
def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in nltk_sw_neg]
    
    
    return " ".join(split_string)

In [59]:
def clean_str(string):  
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    string = string.replace("_NEG", "")
    string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_(),!?\'\`]+", " ", string) # removing any twitter handle mentions
    string = re.sub(r"\d+", " ", string) # removing any words with numbers
    string = re.sub(r"_", " ", string)
    string = re.sub(r":", " ", string)
    string = re.sub(r"/", " ", string)
    string = re.sub(r"#", " ", string)
    string = re.sub(r"\.", " ", string)
    string = re.sub(r"\*", " ", string)
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"n\’t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\’re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\’d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r"\’ll", " will", string)
    string = re.sub(r"'", " ", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"-", " ", string)
    string = re.sub(r"<", " ", string)
    string = re.sub(r">", " ", string)
    string = re.sub(r";", " ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

In [51]:
text="Applê isn't looking at buying U.K. startup for $1 billion. They still 600,20 4.5 + 423 look around."

In [52]:
clean_str(text)

'applê not looking buying u k startup $ billion still + look around'

In [60]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [61]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path, encoding="utf8") as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], list(tknzr.tokenize(clean_str(array[1]))), 
                                    array[2], str(array[3])))
    return train_list
            
def read_training_data_verbatim(training_data_file_path):

    train_list = list()
    with open(training_data_file_path, encoding="utf8") as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], array[1], array[2], str(array[3])))
    return train_list
    
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path, encoding="utf8") as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [62]:
non_linear_factor = PolynomialFeatures(3)

In [63]:
emotion = "anger"

In [64]:
training_data_file_path = \
    wassa_home + "corpora/multigenre/" + \
    emotion + "-ratings-0to1.train.txt"
word_embeddings_path = wassa_home + "embeddings/" + emotion + "-word-embeddings.pkl"

Feature Extraction Snippets

Emoji Intensity (Thanks to Roger, it works!)¶

In [65]:
with open('/Users/COMMANDER/Downloads/lexicons/index_emoji.json', encoding="utf8") as emoji_file:
    emoji_list = json.load(emoji_file)
    
emoji_dict = dict()

for emoji in emoji_list:
    emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])

In [66]:
def get_emoji_intensity(word):
    
    score = 0.0
    if word in emoji_dict.keys():
        score = float(emoji_dict[word][1])
    
    vec_rep = np.array([score])
    
    
    return non_linear_factor.fit_transform([vec_rep])[0]

In [67]:
affect_intensity_file_path = \
    wassa_home + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"
    
def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [68]:
word_intensities = get_word_affect_intensity_dict(emotion)

In [69]:
def get_emo_int_vector(word):
    
    score = 0.0
    if word in word_intensities.keys():
        score = float(word_intensities[word])
        
    vec_rep = np.array([score])
    
    return non_linear_factor.fit_transform([vec_rep])[0]

## SentiWordNet 

In [70]:
def get_sentiwordnetscore(word):
    
    vec_rep = np.zeros(2)
    
    synsetlist = list(swn.senti_synsets(word))

    if synsetlist:
        vec_rep[0] = synsetlist[0].pos_score()
        vec_rep[1] = synsetlist[0].neg_score()

    return non_linear_factor.fit_transform([vec_rep])[0]

## Sentiment Emotion Presence Lexicon

In [71]:
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "Lexikon/NRC-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")


            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
    return word_list 

In [72]:
sentiment_emotion_lex_word_list = get_affect_presence_list(emotion)

In [73]:
def get_sentiment_emotion_feature(word):
    
    score = 0.0
    if word in sentiment_emotion_lex_word_list:
        score = 1.0
    vec_rep = np.array([score])
    
    return non_linear_factor.fit_transform([vec_rep])[0]

## Hashtag Emotion Intensity

In [74]:
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "Lexikon/NRC-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hastag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if (word_array[0] == emotion):
                hastag_intensities[word_array[1]] = float(word_array[2])
                
    return hastag_intensities

In [75]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

In [76]:
def get_hashtag_emotion_vector(word):
    
    score = 0.0
    
    if word in hashtag_emotion_intensities.keys():
        score = float(hashtag_emotion_intensities[word])
        
    vec_rep = np.array([score])
            
    return non_linear_factor.fit_transform([vec_rep])[0]

## Emoticon Sentiment Lexicon

In [77]:
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
    "lexicons/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
    "lexicons/Emoticon-bigrams.txt"
    
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()

def get_emoticon_lexicon_unigram_dict():
    with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_unigrams

def get_emoticon_lexicon_bigram_dict():
    with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_bigrams

In [78]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

In [80]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

In [81]:
def get_unigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_unigram_dict.keys():
        vec_rep = emoticon_lexicon_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_bigram_dict.keys():
        vec_rep = emoticon_lexicon_bigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

## Emoticon Sentiment Aff-Neg Lexicon

In [84]:
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/Emoticon-AFFLEX-NEGLEX-unigrams.txt"

emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/Emoticon-AFFLEX-NEGLEX-bigrams.txt"
    
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()

def get_emoticon_afflex_unigram_dict():
    with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_unigrams

def get_emoticon_afflex_bigram_dict():
    with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_bigrams

In [85]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

In [86]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

In [87]:
def get_unigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_unigram_dict.keys():
        vec_rep = emoticon_afflex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_bigram_dict.keys():
        vec_rep = emoticon_afflex_bigram_dict[word]
    
    return non_linear_factor.fit_transform([vec_rep])[0]

## Hashtag Sentiment Aff-Neg Lexicon

In [88]:
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/HS-AFFLEX-NEGLEX-bigrams.txt"
    
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()

def get_hashtag_affneglex_unigram_dict():
    with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hashtag_affneglex_unigrams

def get_hashtag_affneglex_bigram_dict():
    with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])

    return hashtag_affneglex_bigrams

In [89]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

In [90]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

In [91]:
def get_unigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_unigram_dict.keys():
        vec_rep = hashtag_affneglex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_bigram_dict.keys():
        vec_rep = hashtag_affneglex_bigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

## Hashtag Sentiment Lexicon

In [92]:
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
    "lexicons/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
    "lexicons/HS-bigrams.txt"

def get_hash_sent_lex_unigram_dict():
    
    hash_sent_lex_unigrams = dict()
    with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_unigrams

def get_hash_sent_lex_bigram_dict():

    hash_sent_lex_bigrams = dict()
    with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_bigrams

In [93]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

In [94]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

In [95]:
def get_unigram_sentiment_hash_sent_lex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hash_sent_lex_unigram_dict.keys():
        vec_rep = hash_sent_lex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]


def get_bigram_sentiment_hash_sent_lex_vector(word):

    vec_rep = np.zeros(3)
    if word in hash_sent_lex_bigram_dict.keys():
        vec_rep = hash_sent_lex_bigram_dict[word]
            
    return non_linear_factor.fit_transform([vec_rep])[0]

## Depeche Mood (There is DepecheMood V2, try it)

In [101]:
depeche_mood_file_path = \
    wassa_home + \
    "lexicons/DepecheMood_V1.0/DepecheMood_normfreq2.txt"

In [102]:
def get_depeche_vector_dict():
    depeche_vector_dict = dict()
    with open(depeche_mood_file_path) as depeche_mood_file:
        for line in depeche_mood_file:
            word_array = line.replace("\n", "").split("\t")
            depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
    
    return depeche_vector_dict

In [103]:
depeche_vector_dict = get_depeche_vector_dict()

In [104]:
def get_depeche_mood_vector(word):
    
    vec_rep = np.zeros(8)
    if word in depeche_vector_dict.keys():
        vec_rep = np.array(depeche_vector_dict[word])

    return non_linear_factor.fit_transform([vec_rep])[0]

# Reading & Vectorizing Data

In [144]:
def is_active_vector_method(string):
    return int(string)

def learn_unigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''Pre-trained Word embeddings'''
    index = 0
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model, w2v_dimensions)

    index = 1
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_1, w2v_dimensions_1)

    index = 2
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_2, w2v_dimensions_2)

    index = 3
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_3, w2v_dimensions_3)

    index = 4
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_4, w2v_dimensions_4)

    index = 5
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_5, w2v_dimensions_5)

    '''NRC Emotion Intensity Lexicon'''
    index = 6
    word_feature_embedding_dict[index] = get_emo_int_vector(word)

    '''WordNet'''
    index = 7
    word_feature_embedding_dict[index] = get_sentiwordnetscore(word)

    '''NRC Sentiment Lexica'''
    index = 8
    word_feature_embedding_dict[index] = get_sentiment_emotion_feature(word)

    index = 9
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_lexicon_vector(word)

    index = 10
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 11
    word_feature_embedding_dict[index] = get_hashtag_emotion_vector(word)

    index = 12
    word_feature_embedding_dict[index] = get_unigram_sentiment_hash_sent_lex_vector(word)

    index = 13
    word_feature_embedding_dict[index] = get_unigram_sentiment_hashtag_affneglex_vector(word)

    '''Emoji Polarities'''
    index = 14
    word_feature_embedding_dict[index] = get_emoji_intensity(word)
    
    '''Depeche Mood'''
    index = 15
    word_feature_embedding_dict[index] = get_depeche_mood_vector(word)

    return word_feature_embedding_dict


def learn_bigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''NRC Sentiment Lexica'''

    index = 0
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_lexicon_vector(word)

    index = 1
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 2
    word_feature_embedding_dict[index] = get_bigram_sentiment_hash_sent_lex_vector(word)

    index = 3
    word_feature_embedding_dict[index] = get_bigram_sentiment_hashtag_affneglex_vector(word)

    return word_feature_embedding_dict

In [145]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
    
    word_feature_embedding_dict = word_embedding_dict[word]
    final_embedding = np.array([])
    
    for i in range(16):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding

def get_bigram_embedding(bigram, word_embedding_dict, bin_string):
    
    word_feature_embedding_dict = word_embedding_dict[word]
    final_embedding = np.array([])
    
    for i in range(4):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding

In [146]:
unigram_feature_string = "1111111111111111"
bigram_feature_string = "1111"

In [None]:
"""
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    #score_train.append(float(tweet.intensity))

for tweet in dev_tweets:
    tweet_train.append(tweet.text)
    #score_train.append(float(tweet.intensity))    
print(len(score_train))
score_train = np.asarray(score_train)
"""

In [None]:
"""
raw_test_tweets = read_training_data_verbatim(test_data_file_path)
test_tweets = read_training_data(test_data_file_path)

tweet_test_raw = list()
tweet_test = list()
y_gold = list()

for tweet in raw_test_tweets:
    tweet_test_raw.append(tweet.text)

for tweet in test_tweets:
    tweet_test.append(tweet.text)
    y_gold.append(float(tweet.intensity))
    
print(len(y_gold))
"""

In [147]:
def build_word_embeddings(tweets):
    
    max_tweet_length = -1
    word_embedding_dict = dict()

    for tweet in tweets:
        if len(tweet) > max_tweet_length:
            max_tweet_length = len(tweet)

        for token in tweet:
            if token not in word_embedding_dict.keys():
                word_embedding_dict[token] = learn_unigram_word_embedding(token)
                
    return word_embedding_dict, max_tweet_length

In [161]:
from corpora_utils import CorporaHelper, CorporaProperties
corpora_helper = CorporaHelper("multigenre.csv")
corpora_helper.translate_contractions() # problem space before '
corpora_helper.translate_urls() # http;/sdasd  => URL
#corpora_helper.translate_emoticons()
#corpora_helper.translate_emojis()
#corpora_helper.translate_html_tags()
corpora_helper.translate_camel_case()
corpora_helper.translate_underscore()
corpora_helper.add_space_at_special_chars() 

corpora = list()
for corpus_id, corpus in corpora_helper.get_data().iterrows():
# tokenize the cleaned corpora
    corpora.append(list(tknzr.tokenize(corpus[CorporaProperties.CLEANED_CORPUS.value])))


In [149]:
all_tweets = corpora
embedding_info = build_word_embeddings(all_tweets)

In [157]:
pickle.dump(embedding_info, open("multigenre_embedding.pkl", "wb"))

In [150]:
all_tweets

[['The',
  'Rock',
  'is',
  'destined',
  'to',
  'be',
  'the',
  '21st',
  'Centurys',
  'new',
  '`',
  '`',
  'Conan',
  'and',
  'that',
  'he',
  'is',
  'going',
  'to',
  'make',
  'a',
  'splash',
  'even',
  'greater',
  'than',
  'Arnold',
  'Schwarzenegger',
  ',',
  'Jean-Claud',
  'Van',
  'Damme',
  'or',
  'Steven',
  'Segal',
  '.'],
 ['The',
  'gorgeously',
  'elaborate',
  'continuation',
  'of',
  '`',
  '`',
  'The',
  'Lord',
  'of',
  'the',
  'Rings',
  'trilogy',
  'is',
  'so',
  'huge',
  'that',
  'a',
  'column',
  'of',
  'words',
  'can',
  'not',
  'adequately',
  'describe',
  'co-writer',
  '/',
  'director',
  'Peter',
  'Jacksons',
  'expanded',
  'vision',
  'of',
  'J',
  '.',
  'R',
  '.',
  'R',
  '.',
  'Tolkiens',
  'Middle-earth',
  '.'],
 ['Effective', 'but', 'too-tepid', 'biopic'],
 ['If',
  'you',
  'sometimes',
  'like',
  'to',
  'go',
  'to',
  'the',
  'movies',
  'to',
  'have',
  'fun',
  ',',
  'Wasabi',
  'is',
  'a',
  'good',
  '

In [152]:
# Save vectors
with open(word_embeddings_path, 'wb') as word_embeddings_file:
     pickle.dump(embedding_info, word_embeddings_file)

In [153]:
browser_notify("Persisted to disk")

In [154]:
# Restore vectors
with open(word_embeddings_path, 'rb') as word_embeddings_file:
    embedding_info = pickle.load(word_embeddings_file)

In [130]:
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
MAX_NB_WORDS = 20000
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))
print(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

130 2071


In [135]:
word_indices = dict()
current_index = 1

In [136]:
def sequence_tweets(tweets):
    global current_index
    vectors = list()
    for tweet in tweets:        
        vector = list()
        for word in tweet:
            word_index = None
            
            if word in word_indices:
                word_index = word_indices[word]
            else:
                word_index = current_index
                current_index += 1
                word_indices[word] = word_index
            
            vector.append(word_index)
        
        vectors.append(vector)

    return vectors

In [137]:
len(word_indices)

0

In [138]:
display(tweet_train)

NameError: name 'tweet_train' is not defined

In [139]:
word_embedding_matrix = list()
word_embedding_matrix.append(np.zeros(EMBEDDING_DIM))

for word in sorted(word_indices, key=word_indices.get):
    embedding_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)    
    word_embedding_matrix.append(embedding_features)

word_embedding_matrix = np.asarray(word_embedding_matrix, dtype='f')

In [140]:
word_embedding_matrix.shape

(1, 2071)

In [141]:
word_embedding_matrix = scale(word_embedding_matrix)

In [142]:
browser_notify("Vectorization Done")

In [143]:
word_indices

{}

In [None]:
with open(wassa_home + 'word_indices_new.pickle', 'wb') as handle:
    pickle.dump(word_indices, handle)