# Sentiment classification with word embeddings

Words are different from images or even molecules, in that the meaning of a word is not represented by the letters that make up the word (in the way that the meaning of an image is represented by the pixels that make up the image).  
Instead, <b>the meaning of words comes from how they are used in conjunction with other words.</b>  

### GloVe, Global Vectors for Word Representation

There are multiple versions of pre-trained GloVe word embeddings.  
They differ in the <i>corpus</i> used to train the embedding, and the <i>size</i> of the embeddings.

GloVe is a project of Stanford NLP: https://nlp.stanford.edu/projects/glove/

In [None]:
# THIS CELL IS USED TO CREATE A SUBSET OF THE WHOLE aclImdb DATASET
# SET THE VARIABLES IN THE MIDDLE OF THE CELL!!!
#
# Each run copies a random 10% of ONE (split, label) folder of aclImdb into the
# matching folder under aclImdb_subset. Run it once for every combination of
# train/test and pos/neg.

import os
import shutil
import random

ACTIVATE_CODE = False

if ACTIVATE_CODE:
    # SET THESE 2 VARIABLES
    train_or_test = 'test'
    pos_or_neg = 'neg'
    ############################

    in_folder = "../Data/aclImdb/" + train_or_test + "/" + pos_or_neg + "/"
    out_folder = "../Data/aclImdb_subset/" + train_or_test + "/" + pos_or_neg + "/"

    # Guard on the specific output folder rather than on the subset root:
    # guarding on the root would let only the first of the four runs proceed.
    if not os.path.isdir(out_folder):
        print(len(os.listdir("../Data/aclImdb/train/pos")))
        print(len(os.listdir("../Data/aclImdb/train/neg")))
        print(len(os.listdir("../Data/aclImdb/test/pos")))
        print(len(os.listdir("../Data/aclImdb/test/neg")))

        # Take the file list from in_folder itself so it can never get out of
        # sync with the two variables set above.
        file_names = os.listdir(in_folder)

        # shutil.copy() needs the destination directory to exist.
        os.makedirs(out_folder)

        # random.sample() draws distinct files. The previous
        # random.randint(0, len(...)) has an inclusive upper bound and could
        # produce an out-of-range index.
        for file_name in random.sample(file_names, len(file_names) // 10):
            shutil.copy(in_folder + file_name, out_folder)

In [None]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

import string
from tensorflow import keras
import os 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

# English stop words from NLTK; txt_preprocessing() drops every token found in this list.
nltk_stopw = stopwords.words('english')


### Read the Text Corpus 


In [None]:

# Root folder of the reduced IMDB dataset; getMovies() appends "<split>/<label>" to it.
data = "../Data/aclImdb_subset/"
# Maps the integer class index stored in Y back to its label (and folder) name.
labelToName = { 0 : 'neg', 1: 'pos' }
def getMovies(split):
    '''
    Read every review of one dataset split from disk.

    The folder data + split + "/neg" is read first (class 0), then
    data + split + "/pos" (class 1). Empty files are skipped.

    inputs:
    split: name of the sub-folder of `data` to read (e.g. 'train' or 'test')

    outputs:
    X_raw: list of review texts
    Y: array of targets; len(Y)=len(X_raw)
    '''
    reviews = []
    targets = []

    for label, class_dir in enumerate(('neg', 'pos')):
        folder = data + split + "/" + class_dir
        for file_name in os.listdir(folder):
            with open(folder + '/' + file_name, 'r', encoding='utf8') as handle:
                text = handle.read()
            # Keep only non-empty reviews, paired with their class index.
            if text:
                reviews.append(text)
                targets.append(label)

    return reviews, np.array(targets)

# Load the raw 'train' reviews; they are divided into train and validation sets further down.
X_raw, Y = getMovies(split='train')

# Load the raw 'test' reviews, kept apart for the final evaluation.
X_raw_test, Y_test = getMovies(split='test')




In [None]:
# Character-length statistics of the raw reviews in each split.
n_char_train = [len(x) for x in X_raw]
n_char_test = [len(x) for x in X_raw_test]
print('TRAIN: ', len(X_raw),' reviews; ','min length = ', min(n_char_train), ', max length = ',max(n_char_train), ', median',np.median(n_char_train), 'chars')
# Label fixed: this line used to print 'mac length' instead of 'max length'.
print('TEST: ', len(X_raw_test),' reviews; ','min length = ', min(n_char_test), ', max length = ',max(n_char_test), ', median',np.median(n_char_test), 'chars')

# Show the first training review together with its label name.
print('\n \n TEXT \n',X_raw[0],'\n LABEL =', labelToName[Y[0]])

### Text preprocessing
Lowercase, tokenize (punctuation is dropped by the tokenizer), remove stop words, lemmatize

In [None]:
def get_pos(pos):
    '''
    Translate a tag produced by nltk.pos_tag() into the single-letter
    part-of-speech code accepted by nltk.WordNetLemmatizer().

    Only the first character of the tag is considered:
    'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'N' -> 'n' (noun),
    'R' -> 'r' (adverb). Any other tag falls back to 'n'.
    '''
    first_letter_to_wordnet = {
        'J': 'a',  # or wordnet.ADJ
        'V': 'v',  # or wordnet.VERB
        'N': 'n',  # or wordnet.NOUN
        'R': 'r',  # or wordnet.ADV
    }
    # pos[:1] is '' for an empty tag, which takes the default like any unknown tag.
    return first_letter_to_wordnet.get(pos[:1], 'n')

def txt_preprocessing(X, printa=False):
    '''
    Turn raw texts into space-joined strings of lemmas.

    Steps, in order: lowercase; regex-tokenize (keeps tokens of 3 to 15
    characters that start with a letter, so punctuation is dropped); remove
    the stop words in nltk_stopw; POS-tag with nltk.pos_tag(); convert the
    tags with get_pos(); lemmatize with nltk.WordNetLemmatizer(); join the
    lemmas of each text back into a single string.

    inputs:
    X: list of raw text strings
    printa: if True, print the first text after every step

    outputs:
    list of preprocessed strings; len(output) == len(X)
    '''
    # Build the helpers once instead of once per document / per word.
    tokenizer = RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b')
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = set(nltk_stopw)  # set membership is O(1); the list is O(n) per token

    def show(step_output):
        # Print the first text of the current step; nothing to show for empty input.
        if printa and step_output: print(step_output[0],'\n')

    # lowercase
    X = [x.lower() for x in X]
    show(X)

    # tokenize
    X = [tokenizer.tokenize(x) for x in X]
    show(X)

    # remove stop words
    X = [[x_i for x_i in x if x_i not in stop_words] for x in X]
    show(X)

    # POS tagging, needed for the lemmatization
    X = [nltk.pos_tag(x) for x in X]
    show(X)

    # POS tags to match nltk.WordNetLemmatizer()
    X = [[(w, get_pos(tag)) for w, tag in x] for x in X]
    show(X)

    # lemmatize
    X = [[lemmatizer.lemmatize(w, p) for w, p in x] for x in X]
    show(X)

    # reshape as a list of sentences: [['this','is','string','1'], ['this','is','string','2']...] --> ['this is string 1','this is string 2'...]
    X = [" ".join(x) for x in X]
    if printa and X: print(X[0])

    return X

#a=['thIs Film was#@ the ?worst Ever', 'I sAw,  !very good Films recently!']    
#txt_preprocessing(a, printa=True)

In [None]:
# Show the output of every step of txt_preprocessing on the first training review
txt_preprocessing([X_raw[0]], printa=True)

In [None]:
# Preprocess both corpora (takes about 1 minute according to the original note)
X = txt_preprocessing(X_raw)
X_test = txt_preprocessing(X_raw_test)


In [None]:
# Compare the first raw review with its preprocessed version
print(X_raw[0],'\n\n',X[0])

In [None]:

# Train/validation split: 20% of the 'train' reviews are held out as the validation set
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=123)
# Fraction of positive labels in each part, to check that the classes stay balanced
print(Y_train.mean(), Y_val.mean())

### Embedding with Glove. 
If not present, browse to https://nlp.stanford.edu/projects/glove/ and download glove.6B.zip.
Unzip the files into a new directory "glove" inside "../Data" (the code below reads "../Data/glove/glove.6B.100d.txt").

In [None]:
from gensim.models import KeyedVectors

def read_glove_vecs(glove_file):
    '''
    Parse a GloVe text file in which every line is a word followed by the
    whitespace-separated components of its vector.

    inputs:
    glove_file: path of the GloVe .txt file (read as UTF-8)

    outputs:
    words: set of all the words found in the file
    word_to_vec_map: dict mapping each word to its vector (np.float64 array)
    '''
    words = set()
    word_to_vec_map = {}

    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines (e.g. a trailing newline) instead of failing on fields[0].
            if not fields:
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map


# Load the GloVe word embeddings file
# (the file name indicates the 100-dimensional vectors of the glove.6B download)
glove_file = "../Data/glove/glove.6B.100d.txt"
# glove_words: set of known words; glove_word2vec_map: word -> vector
glove_words, glove_word2vec_map = read_glove_vecs(glove_file)

In [None]:
def glove_vectorize_sentence(sentence):
    '''
    Replace every whitespace-separated word of `sentence` with its GloVe vector.

    Words that are not in glove_words get the vector stored under the key
    "unk" of glove_word2vec_map.

    outputs:
    list of vectors, one per word, in sentence order
    '''
    return [
        glove_word2vec_map[token if token in glove_words else "unk"]
        for token in sentence.split()
    ]

In [None]:
# Turn every preprocessed review into a list of GloVe vectors (one vector per word)
X_train_glove = [glove_vectorize_sentence(sentence) for sentence in X_train]
X_val_glove = [glove_vectorize_sentence(sentence) for sentence in X_val]
X_test_glove = [glove_vectorize_sentence(sentence) for sentence in X_test]

In [None]:
# pad to take all sequences to same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Number of word vectors in each training review
lengths = [len(vecs) for vecs in X_train_glove]
max_length = int(np.percentile(lengths, 70))  # 70th percentile of the training lengths
print("max_length is: ", max_length)

# Shorter reviews are padded at the end ('post'); longer ones lose their
# beginning ('pre'), so the last max_length words are kept.
# The length is derived from the training set only and reused for val/test.
X_train_glove_pad = pad_sequences(X_train_glove,
                                  maxlen=max_length, dtype='float32',
                                  padding='post', truncating='pre')
X_val_glove_pad = pad_sequences(X_val_glove,
                                maxlen=max_length,
                                dtype='float32', padding='post', truncating='pre')
X_test_glove_pad = pad_sequences(X_test_glove, 
                                 maxlen=max_length, dtype='float32',
                                 padding='post', truncating='pre')


In [None]:
# A Simple Model for LSTM
# Input: one padded review = max_length time steps, each a GloVe vector whose
# size is read from the last axis of X_train_glove_pad.
# Output: a single sigmoid unit (binary classification, neg = 0 / pos = 1).
model = keras.models.Sequential()
model.add(keras.layers.LSTM(units=128, 
                            input_shape=(max_length, X_train_glove_pad.shape[2]), 
                            dropout=0, 
                            recurrent_dropout=0, 
                            return_sequences=False))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

In [None]:
#fit the model
epoche=10   # maximum number of epochs
b_size=128  # batch size
verb=1      # verbosity passed to both the callback and fit()
# Stop training when val_accuracy has not improved for 5 consecutive epochs
es = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=verb, patience=5)
#es=callbacks.ModelCheckpoint(filepath='./nnet_for.hdf5', monitor='val_mean_squared_error', verbose=2, save_best_only=True) # with this one all the epochs are run but the best model is saved. The EarlyStopping above may stop before the last epoch
history=model.fit(X_train_glove_pad,Y_train,
					epochs=epoche,
					validation_data=(X_val_glove_pad,Y_val),
					batch_size=b_size,
					callbacks=[es],
					verbose=verb)

# evaluate() returns [loss, accuracy] for the metrics compiled above; index 1 is the accuracy
print('\n Test accuracy = ', model.evaluate(X_test_glove_pad,Y_test, verbose=0)[1])

## APPENDIX: alternative syntax - Embedding with Glove and Keras

In [None]:
# NOTE(review): this imports from the standalone `keras` package while the rest of the
# notebook uses `tensorflow.keras`; `one_hot` is not used in the visible code.
from keras.preprocessing.text import one_hot, Tokenizer
word_tokenizer = Tokenizer()
# Build the vocabulary on the training texts only, then encode every split with it
word_tokenizer.fit_on_texts(X_train)
X_train_tkn = word_tokenizer.texts_to_sequences(X_train)
X_val_tkn = word_tokenizer.texts_to_sequences(X_val)
X_test_tkn = word_tokenizer.texts_to_sequences(X_test)

# +1 because the Keras Tokenizer numbers the words starting from 1: index 0 is never
# assigned to a word (it is the value pad_sequences fills with by default)
vocab_length = len(word_tokenizer.word_index) + 1

# Padding all reviews to the same fixed length used for the GloVe-vector model (max_length)
maxlen = max_length
X_train_tkn_pad = pad_sequences(X_train_tkn, padding='post', truncating='pre', maxlen=maxlen)
X_val_tkn_pad = pad_sequences(X_val_tkn, padding='post', truncating='pre', maxlen=maxlen)
X_test_tkn_pad = pad_sequences(X_test_tkn, padding='post', truncating='pre', maxlen=maxlen)

In [None]:
# Parse the GloVe text file into a dict: word -> float32 vector.
embeddings_dictionary = dict()
# A `with` block closes the file even if parsing fails. The handle gets its own
# name so that the global `glove_file` (the path string set when the vectors
# were first loaded) is no longer overwritten with a closed file object.
with open('../Data/glove/glove.6B.100d.txt', encoding="utf8") as glove_handle:
    for line in glove_handle:
        records = line.split()
        # Skip blank lines instead of failing on records[0].
        if not records:
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# The matrix rows are indexed by the TOKENIZER indices, so its size must come
# from the tokenizer vocabulary (+1 because index 0 is never assigned to a word).
# Sizing it with len(glove_words) allocated one row per GloVe word instead, and
# would raise IndexError whenever the tokenizer vocabulary is the larger one.
vocab_length = len(word_tokenizer.word_index) + 1

# Row i holds the 100-dimensional GloVe vector of the word with tokenizer index i;
# words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Print Embedding Matrix shape
embedding_matrix.shape

In [None]:
# Import the layers from tensorflow.keras, the same package that provides the
# `keras` module used for Sequential below; mixing it with the standalone
# `keras` package can combine incompatible layer/model classes.
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Neural Network architecture
lstm_model = keras.models.Sequential()
# Frozen embedding layer initialised with the pretrained GloVe matrix:
# it maps every token index to its 100-dimensional vector.
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))

# Model compiling
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_model.summary()

# Model Training
# Validate on the held-out split prepared above (X_val_tkn_pad / Y_val), as the
# first model does, instead of carving another 20% out of the training data.
lstm_model_history = lstm_model.fit(X_train_tkn_pad, Y_train, batch_size=128, epochs=8, verbose=1, validation_data=(X_val_tkn_pad, Y_val))