## Import the required libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from keras import layers
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.layers import Input, Flatten, merge, Lambda, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.utils import np_utils, to_categorical
from keras.optimizers import Adam, RMSprop
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers.normalization import BatchNormalization

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedShuffleSplit,StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

import matplotlib.pyplot as plt
import itertools

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.utils import shuffle

import re

## Load the dataset

In [None]:
# Read the labelled training set from disk.
data = pd.read_csv("trainset.csv")

# 'Komenti' is the column later cleaned and tokenized as text;
# 'Sentimenti' is the column later label-encoded as the target.
comments, y = data['Komenti'], data['Sentimenti']

## Input parameters

In [None]:
# Length every token sequence is padded/truncated to (passed to pad_sequences).
MAX_SEQUENCE_LENGTH = 20
# Vocabulary size limit handed to the Tokenizer.
MAX_NB_WORDS = 6000
# Size of each word vector produced by the Embedding layer.
EMBEDDING_DIM = 300

## Data preprocessing

In [None]:
def preprocess_text(sen):

    sentence = sen
    # Remove everything except a-z, A-Z, + sign and chars like Ë, ë, Ç, ç
    sentence = re.sub('[^a-zA-Z0-9\+ËëÇç]', ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

### Fill list with preprocessed text

In [None]:
# Clean every raw comment up front so later steps only see normalised text.
docs = [preprocess_text(raw_comment) for raw_comment in comments]

## Encode class values as integers 

In [None]:
# Map every distinct label in y to an integer id 0..n_classes-1.
# Example: LabelEncoder fitted on [1, 1, 2, 6] has classes_ == [1, 2, 6] and
# transforms that input to [0, 0, 1, 2]; inverse_transform([0, 0, 1, 2])
# gives back [1, 1, 2, 6].
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)

# One-hot encode the integer ids (a binary class matrix with the class axis
# last), the target format expected by categorical_crossentropy.
# Example: to_categorical([0, 1, 2, 3]) ->
#   [[1. 0. 0. 0.]
#    [0. 1. 0. 0.]
#    [0. 0. 1. 0.]
#    [0. 0. 0. 1.]]
dummy_y = np_utils.to_categorical(encoded_y)

## Define plot_history function

In [None]:
def plot_history(history):
    """Plot the loss and accuracy curves stored in a training history.

    Metric names containing 'loss' are drawn on figure 1 and names containing
    'acc' on figure 2. Names that also contain 'val' are drawn in green as
    validation curves; the others in blue as training curves. Each legend
    entry shows the value of the last epoch with five decimals.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (such as the return value of ``fit``).

    Returns:
        None. Prints a message and returns early, without plotting, when no
        training-loss series is present.
    """
    records = history.history

    def select(fragment, validation):
        # Metric names containing `fragment`, split by presence of 'val'.
        return [name for name in records
                if fragment in name and ('val' in name) == validation]

    train_loss_names = select('loss', False)
    val_loss_names = select('loss', True)
    train_acc_names = select('acc', False)
    val_acc_names = select('acc', True)

    if not train_loss_names:
        print('Loss is missing in history')
        return

    # The training loss is always recorded, so it defines the epoch axis.
    epochs = range(1, len(records[train_loss_names[0]]) + 1)

    def draw(names, colour, caption):
        # One curve per metric, labelled with its final-epoch value.
        for name in names:
            series = records[name]
            plt.plot(epochs, series, colour,
                     label=caption + ' (' + format(series[-1], '.5f') + ')')

    # Figure 1: loss curves.
    plt.figure(1)
    draw(train_loss_names, 'b', 'Training loss')
    draw(val_loss_names, 'g', 'Validation loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Figure 2: accuracy curves.
    plt.figure(2)
    draw(train_acc_names, 'b', 'Training accuracy')
    draw(val_acc_names, 'g', 'Validation accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

## Define full_multiclass_report which prints classification report

In [None]:
## If binary (sigmoid output), set binary parameter to True
def full_multiclass_report(model,
                           x,
                           y_true,
                           classes,
                           batch_size = 16,
                           binary=False):
    """Print the accuracy and a classification report for a trained model.

    Args:
        model: Trained model exposing ``predict(x, batch_size=...)``.
        x: Input samples to score.
        y_true: Ground-truth labels; one-hot encoded rows unless ``binary``
            is True, in which case they are used as given.
        classes: Class labels. Currently unused by this function; kept so
            existing callers keep working.
        batch_size: Batch size forwarded to ``model.predict``.
        binary: Set to True for a single sigmoid output with non-one-hot
            ``y_true``.

    Returns:
        None. The results are printed.
    """
    # 1. Transform one-hot encoded y_true into their class number
    if not binary:
        y_true = np.argmax(y_true, axis=1)

    # 2. Predict class ids. Sequential.predict_classes was removed from
    #    Keras (TensorFlow 2.6), so reproduce what it did from the raw
    #    predictions: argmax over several outputs, 0.5 threshold for one.
    scores = np.asarray(model.predict(x, batch_size=batch_size))
    if scores.ndim > 1 and scores.shape[-1] > 1:
        y_pred = scores.argmax(axis=-1)
    else:
        # Flatten (n, 1) sigmoid outputs so they line up with 1-D y_true.
        y_pred = (scores > 0.5).astype('int32').ravel()

    # 3. Print accuracy score
    print("Accuracy : " + str(accuracy_score(y_true, y_pred)))

    print("")

    # 4. Print classification report
    print("Classification Report")
    print(classification_report(y_true, y_pred, digits=4))

## Create a tokenizer

In [None]:
# Build the vocabulary from the cleaned comments. `num_words` is the current
# name of the vocabulary-limit argument; the old `nb_words` spelling is only
# accepted as a deprecated alias.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)

tokenizer.fit_on_texts(docs)

# Replace every comment by its list of integer word ids.
sequences = tokenizer.texts_to_sequences(docs)

# word_index holds every word seen, regardless of the num_words limit.
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH so all rows of x have
# equal length.
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Training, testing and validation

In [None]:
# One fixed random_state for both splits so the partitions are repeatable.
seed = 20

# First split: 80% for model development, the remaining 20% held out as test.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, dummy_y, train_size=0.8, random_state=seed
)

# Second split: 70% of the development part for training, 30% for validation
# (roughly 56% / 24% of the full data set).
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, train_size=0.7, random_state=seed
)

## Fix the random seeds so repeated runs with the same parameters give the same results

In [None]:
# Seed NumPy's global random generator. Called through the `np` namespace:
# `from numpy.random import seed` would overwrite the integer `seed` used as
# random_state for the train/validation/test split.
np.random.seed(1)

import tensorflow as tf

# Seed TensorFlow's global generator (TF2 replacement for set_random_seed).
tf.random.set_seed(2)

## Build LSTM Model

In [None]:
# Number of embedding rows required. Word ids start at 1 (0 is the padding
# value) and the tokenizer only emits ids below MAX_NB_WORDS, so the largest
# id is min(MAX_NB_WORDS - 1, len(word_index)). Using len(word_index) alone
# is one row short whenever the vocabulary is smaller than MAX_NB_WORDS.
vocab_size = min(MAX_NB_WORDS, len(word_index) + 1)

# One output unit per one-hot column of the targets.
num_classes = y_train.shape[1]

LSTM_model = Sequential()

# Token ids -> dense EMBEDDING_DIM-sized vectors, learned during training.
LSTM_model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=x.shape[1]))

# Summarise each padded sequence into a single 32-unit vector.
LSTM_model.add(LSTM(32))

LSTM_model.add(Dense(32, activation='relu'))
# Softmax over the classes, matching the one-hot targets.
LSTM_model.add(Dense(num_classes, activation='softmax'))

LSTM_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
LSTM_model.summary()

## Train the model

In [None]:
# Train for 100 epochs in mini-batches of 16, reshuffling the training data
# each epoch and scoring the validation split after every epoch. The returned
# history object keeps the per-epoch metrics.
LSTMHistory = LSTM_model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=100,
    batch_size=16,
    shuffle=True,
    verbose=2,
)

## Print results

In [None]:
# Draw the loss and accuracy curves recorded while fitting the LSTM model.
plot_history(LSTMHistory)

In [None]:
# Print accuracy and per-class metrics for the validation split. The class
# labels are taken from the fitted encoder instead of a hard-coded count of 3.
# NOTE(review): x_test / y_test are not scored anywhere in the visible code;
# confirm whether the held-out test split should be reported as well.
full_multiclass_report(LSTM_model, x_val, y_val, encoder.classes_)