## Import libraries

In [103]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from textblob import TextBlob
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.models import load_model
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.utils import plot_model
import string
import warnings
warnings.filterwarnings('ignore')

## Path to folders/files

In [3]:
PATH_TO_FOLDER = '../AI_data/'
PATH_TO_Glove_vectors = '../glove.twitter.27B/glove.twitter.27B.200d.txt'
PATH_TO_Word2Vec_vectors = '../glove.twitter.27B/glove.twitter.27B.200d.word2vec'

In [25]:
# np.unique(data_frame['emotion'].values)
emotion_labels = {'anger':0, 'happiness':1, 'love':2, 'neutral':3, 'sadness':4, 'worry': 5}

In [8]:
convert_glove_to_word2vec_vectors(PATH_TO_Glove_vectors, PATH_TO_Word2Vec_vectors)

In [32]:
model = read_word2vec_vectors(PATH_TO_Word2Vec_vectors)

## Read data

In [4]:
def read_data(filename, delimiter):
    data = pd.read_csv(filename, delimiter='\t')
    data = data.drop('Unnamed: 0', axis=1)
    return data

In [5]:
def convert_glove_to_word2vec_vectors(filepath, output_filepath):
    glove2word2vec(filepath, output_filepath)

In [6]:
def read_word2vec_vectors(filepath):
    model = KeyedVectors.load_word2vec_format(filepath, binary=False)
    return model

In [7]:
def preprocess(text):
    text = re.sub(r"http\S+", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    tokens = [word for word in stripped if word.isalpha()]
    stop_words = stopwords.words('english')
    token_list = []
    for word in tokens:
        if not word in stop_words:
            token_list.append(word)
    #token_list = [TextBlob(word).correct() for word in token_list]
    return token_list

In [8]:
def preprocess_df(data_frame):
    train_data = []
    for review in data_frame['review'].values:
        review = preprocess(review)
        train_data.append(review)
    return train_data

In [27]:
def convert_labels_into_categorical(data_frame):
    categorical_list = []
    for emotion in data_frame['emotion'].values:
        categorical_list.append(emotion_labels[emotion])
    categorical_list = np_utils.to_categorical(categorical_list)
    return categorical_list

In [87]:
def unique_words(data):
    unique_words = []
    for l in data:
        unique_words += l
    unique_words = list(set(unique_words))
    return unique_words

In [88]:
def convert_words_into_numbers(data, unique_words):
    data_in_numbers = []
    for l in data:
        data_in_numbers.append([unique_words.index(word) for word in l])
    return data_in_numbers

## Main function

In [89]:
filename_train_data = PATH_TO_FOLDER + 'emotion_trainingdataset.csv'
train_data = read_data(filename_train_data, '\t')

filename_test_data = PATH_TO_FOLDER + 'emotion_testdataset.csv'
test_data = read_data(filename_test_data, '\t')

In [90]:
X_train = preprocess_df(train_data)
X_test = preprocess_df(test_data)
unique_words = unique_words(X_train + X_test)
X_train = convert_words_into_numbers(X_train, unique_words)
X_test = convert_words_into_numbers(X_test, unique_words)

In [93]:
y_train = convert_labels_into_categorical(train_data)

In [104]:
number_of_words = len(unique_words)
max_length_of_input = 200

X_train = sequence.pad_sequences(X_train, max_length_of_input)
X_test = sequence.pad_sequences(X_test, max_length_of_input)

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(number_of_words, embedding_vector_length, input_length = max_length_of_input))
# Uncomment the below line if you want to use Dropout
# model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())
plot_model(model, to_file='model_LSTM.png', show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 32)           235776    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 289,077
Trainable params: 289,077
Non-trainable params: 0
_________________________________________________________________
None


ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [None]:
number_of_words = len(unique_words)
max_length_of_input = 200

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=number_of_words)
X_train = sequence.pad_sequences(X_train, max_length_of_input)
X_test = sequence.pad_sequences(X_test, max_length_of_input)

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(number_of_words, embedding_vector_length, input_length = max_length_of_input))
# Uncomment the below line if you want to use Dropout
# model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print model.summary()
plot_model(model, to_file='model_LSTM.png', show_shapes=True, show_layer_names=True)

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
score_train, accuracy_train = model.evaluate(X_train, y_train)
print score_train, accuracy_train
score_test, accuracy_test = model.evaluate(X_test, y_test)
print score_test, accuracy_test

model.save('model_LSTM.h5')

In [58]:
[2,3, 5] + [3,4]

[2, 3, 5, 3, 4]

In [71]:
[2,3,4].index(2)

0

In [86]:
list(set([1,1,1,2,3]))

[1, 2, 3]