# Sentiment classification with word embeddings

Words are different from images or even molecules, in that the meaning of a word is not represented by the letters that make up the word (the same way that the meaning of an image is represented by the pixels that make up the pixel).  
Instead, <b>the meaning of words comes from how they are used in conjunction with other words.</b>  

### GloVe, Global Vectors for Word Representation

There are multiple versions of pre-trained GloVe word embeddings.  
They differ in the <i>corpus</i> used to train the embedding, and the <i>size</i> of the embeddings.

GloVe is a project Stanford NLP: https://nlp.stanford.edu/projects/glove/

In [1]:
# THIS CELL IS USED TO CREATE A SUBSET OF THE WHOLE aclImdb DATASET
# If the data/aclImdb_subset directory exists this cell does nothing.
# SET THE VARIABLES IN THE MIDDLE OF THE CELL to create each subfolder.

import os
import shutil
import random

if not os.path.isdir("../../Data/aclImdb_subset/"):
    print(len(os.listdir("../../Data/aclImdb/train/pos")))
    print(len(os.listdir("../../Data/aclImdb/train/neg")))
    print(len(os.listdir("../../Data/aclImdb/test/pos")))
    print(len(os.listdir("../../Data/aclImdb/test/neg")))

    train_pos_files = os.listdir("../../Data/aclImdb/train/pos")
    train_neg_files = os.listdir("../../Data/aclImdb/train/neg")
    test_pos_files = os.listdir("../../Data/aclImdb/test/pos")
    test_neg_files = os.listdir("../../Data/aclImdb/test/neg")


    # SET THESE 3 VARIABLES
    train_or_test = 'test'
    pos_or_neg = 'neg'
    file_names = test_neg_files
    ############################

    in_folder = "../../Data/aclImdb/" + train_or_test + "/" + pos_or_neg + "/"
    out_folder = "../../Data/aclImdb_subset/" + train_or_test + "/" + pos_or_neg + "/"

    used_indexes = []
    for i in range(int(len(os.listdir(in_folder))/10)):
        index = random.randint(0, len(os.listdir(in_folder)))
        while index in used_indexes:
            index = random.randint(0, len(os.listdir(in_folder)))
        file_ = in_folder + file_names[index]
        shutil.copy(file_ , out_folder)
        used_indexes.append(index)

In [2]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

import string
from tensorflow import keras
import os 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

nltk_stopw = stopwords.words('english')


### Read the Text Corpus 


In [3]:

data = "../../Data/aclImdb_subset/"
labelToName = { 0 : 'neg', 1: 'pos' }
def getMovies(split):
    '''
    outputs:
    X_raw: lista di  recensioni
    Y: array di target; len(Y)=len(X_raw)
    '''
    X_raw, Y  = [], []

    for classIndex, directory in enumerate(['neg', 'pos']):
        dirName = data + split + "/" + directory
        for reviewFile in os.listdir(dirName):
            with open (dirName + '/' + reviewFile, 'r', encoding='utf8') as f:
                raw = f.read()
                if (len(raw) == 0):
                    continue
            X_raw.append(raw)
            Y.append(classIndex)
    return X_raw, np.array(Y)

# We will split later in train and val
X_raw, Y = getMovies(split='train')

X_raw_test, Y_test = getMovies(split='test')




In [4]:
n_char_train = [len(x) for x in X_raw]
n_char_test = [len(x) for x in X_raw_test]
print('TRAIN: ', len(X_raw),' reviews; ','min length = ', min(n_char_train), ', max length = ',max(n_char_train), ', median',np.median(n_char_train), 'chars')
print('TEST: ', len(X_raw_test),' reviews; ','min length = ', min(n_char_test), ', mac length = ',max(n_char_test), ', median',np.median(n_char_test), 'chars')

print('\n \n TEXT \n',X_raw[0],'\n LABEL =', labelToName[Y[0]])

TRAIN:  2494  reviews;  min length =  81 , max length =  8969 , median 975.0 chars
TEST:  2496  reviews;  min length =  32 , mac length =  12988 , median 961.0 chars

 
 TEXT 
 A young scientist is trying to carry on his dead father's work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10. 
 LABEL = neg


### Text preprocessing
lowcase, tokenize, remove punctuations, lemmatize

In [5]:
def get_pos(pos):
    '''
    Convert nltk.pos_tag() tags  so that they can be understood by pos tags by nltk.WordNetLemmatizer()
    '''
    if pos.startswith('J'):
        return 'a' # o wordnet.ADJ
    elif pos.startswith('V'):
        return 'v' # o wordnet.VERB
    elif pos.startswith('N'):
        return 'n' # o wordnet.NOUN
    elif pos.startswith('R'):
        return 'r' # o wordnet.ADV
    else:          
        return 'n' # default  

def txt_preprocessing(X, printa=False):
    i = 0 #text to print
    #lowcase
    X = [x.lower() for x in X]
    if printa: print(X[i],'\n')

    # tokenize
    X = [RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(x) for x in X] #or [re.findall(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b',x) for x in X]
    if printa: print(X[i],'\n')

    # remove stop words
    X = [(lambda x: [x_i for x_i in x if x_i not in nltk_stopw])(x) for x in X] # or list(map(lambda x: ([x_i for x_i in x if x_i not in nltk_stopw]),X))
    if printa: print(X[i],'\n')

    # lemmatization using POS
    X = [nltk.pos_tag(x) for x in X]
    if printa: print(X[i],'\n')

    # POS tags to match nltk.WordNetLemmatizer()
    X = [ (lambda x: [(x_i[0],get_pos(x_i[1])) for x_i in x])(x) for x in X]
    if printa: print(X[i],'\n')

    # lemmatizzo
    X = [(lambda x: [nltk.WordNetLemmatizer().lemmatize(w,p) for w,p in x])(x) for x in X]
    if printa: print(X[i],'\n')

    # reshape as a list of sentences: [['this','is','string','1'], ['this','is','string','2']...] --> ['this is string 1','this is string 2'...]
    X = [" ".join(x) for x in X]
    if printa: print(X[i])

    return X

#a=['thIs Film was#@ the ?worst Ever', 'I sAw,  !very good Films recently!']    
#txt_preprocessing(a, printa=True)

In [6]:
# see all passages in txt_processing
txt_preprocessing([X_raw[0]], printa=True)

a young scientist is trying to carry on his dead father's work on limb regeneration.his overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.a young doctor uses reptilian dna he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...admittedly the special effects in "severed ties" are pretty good and grotesque,but the rest of the film is awful.the severed arm is behaving like a snake and kills few people.big deal.the acting is mediocre and the climax is silly.3 out of 10. 

['young', 'scientist', 'trying', 'carry', 'his', 'dead', 'father', 'work', 'limb', 'regeneration', 'his', 'overbearing', 'mother', 'has', 'convinced', 'him', 'that', 'murdered', 'his', 'own', 'father', 'and', 'monitoring', 'his', 'progress', 'for', 'her', 'own', 'evil', 'purposes', 'young', 'doctor', 'uses', 'reptilian', 'dna', 'extracts'

['young scientist try carry dead father work limb regeneration overbear mother convince murder father monitoring progress evil purpose young doctor use reptilian dna extract large creature arm conveniently rip minute later injects formula grow new murderous arm admittedly special effect sever tie pretty good grotesque rest film awful sever arm behaving like snake kill people big deal act mediocre climax silly']

In [7]:
#1 minute run
X = txt_preprocessing(X_raw)
X_test = txt_preprocessing(X_raw_test)


In [8]:
print(X_raw[0],'\n\n',X[0])

A young scientist is trying to carry on his dead father's work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10. 

 young scientist try carry dead father work limb regeneration overbear mother convince murder father monitoring progress evil purpose young doctor use reptilian dna extract large creature arm conveniently rip minute later injects formula grow new murderous arm admittedly special effect sever tie pretty good grotesque rest film awful sever

In [9]:

# Test/Train Split
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=123)
print(Y_train.mean(), Y_val.mean())

0.4907268170426065 0.5390781563126252


### Embedding with Glove. 
If not present, browse to https://nlp.stanford.edu/projects/glove/ and download glove.6B.zip.
Unzip files and put in a new directory "glove".

In [13]:
from gensim.models import KeyedVectors

def read_glove_vecs(glove_file):
    with open(glove_file, 'r',encoding="utf8") as f:

        words = set()
        word_to_vec_map = {}

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map


# Load the GloVe word embeddings file
glove_file = "../../data/glove/glove.6B.100d.txt"
glove_words, glove_word2vec_map = read_glove_vecs(glove_file)

In [14]:
def glove_vectorize_sentence(sentence):
    lemmas = sentence.split()
    vectors = []
    for word in lemmas:
        if word in glove_words:
            vectors.append(glove_word2vec_map[word])
        else:
            vectors.append(glove_word2vec_map["unk"])
    return vectors

In [15]:
X_train_glove = [glove_vectorize_sentence(sentence) for sentence in X_train]
X_val_glove = [glove_vectorize_sentence(sentence) for sentence in X_val]
X_test_glove = [glove_vectorize_sentence(sentence) for sentence in X_test]

In [16]:
# pad to take all sequences to same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

lengths = [len(vecs) for vecs in X_train_glove]
max_length = int(np.percentile(lengths, 70))  # 80th percentile
print("max_length is: ", max_length)
    
X_train_glove_pad = pad_sequences(X_train_glove,
                                  maxlen=max_length, dtype='float32',
                                  padding='post', truncating='pre')
X_val_glove_pad = pad_sequences(X_val_glove,
                                maxlen=max_length,
                                dtype='float32', padding='post', truncating='pre')
X_test_glove_pad = pad_sequences(X_test_glove, 
                                 maxlen=max_length, dtype='float32',
                                 padding='post', truncating='pre')


max_length is:  131


In [17]:
# A Simple Model for LSTM
model = keras.models.Sequential()
model.add(keras.layers.LSTM(units=128, 
                            input_shape=(max_length, X_train_glove_pad.shape[2]), 
                            dropout=0, 
                            recurrent_dropout=0, 
                            return_sequences=False))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

  super().__init__(**kwargs)


None


In [18]:
#fit the model
epoche=10
b_size=128
verb=1
es = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=verb, patience=5)
#es=callbacks.ModelCheckpoint(filepath='./nnet_for.hdf5', monitor='val_mean_squared_error', verbose=2, save_best_only=True) # con questo fa tutte le epoche ma salva il migliore. SOpra può fermarsi prima di fine epoche
history=model.fit(X_train_glove_pad,Y_train,
					epochs=epoche,
					validation_data=(X_val_glove_pad,Y_val),
					batch_size=b_size,
					callbacks=[es],
					verbose=verb)

print('\n Test accuracy = ', model.evaluate(X_test_glove_pad,Y_test, verbose=0)[1])

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 200ms/step - accuracy: 0.5365 - loss: 0.6904 - val_accuracy: 0.5030 - val_loss: 0.6840
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.5922 - loss: 0.6562 - val_accuracy: 0.6333 - val_loss: 0.6699
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.6755 - loss: 0.6280 - val_accuracy: 0.6774 - val_loss: 0.6192
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 221ms/step - accuracy: 0.7253 - loss: 0.5711 - val_accuracy: 0.7595 - val_loss: 0.5116
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 192ms/step - accuracy: 0.7516 - loss: 0.5351 - val_accuracy: 0.7295 - val_loss: 0.5561
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 193ms/step - accuracy: 0.7750 - loss: 0.4929 - val_accuracy: 0.7635 - val_loss: 0.4882
Epoch 7/10
[1m16/16[0m [3

## APPENDIX: alternative syntax - Embedding with Glove and Keras

In [21]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train_tkn = word_tokenizer.texts_to_sequences(X_train)
X_val_tkn = word_tokenizer.texts_to_sequences(X_val)
X_test_tkn = word_tokenizer.texts_to_sequences(X_test)

# Adding 1 to store dimensions for words for which no pretrained word embeddings exist
vocab_length = len(word_tokenizer.word_index) + 1

# Padding all reviews to fixed length 100
maxlen = max_length
X_train_tkn_pad = pad_sequences(X_train_tkn, padding='post', truncating='pre', maxlen=maxlen)
X_val_tkn_pad = pad_sequences(X_val_tkn, padding='post', truncating='pre', maxlen=maxlen)
X_test_tkn_pad = pad_sequences(X_test_tkn, padding='post', truncating='pre', maxlen=maxlen)

In [22]:
embeddings_dictionary = dict()
glove_file = open('../../data/glove/glove.6B.100d.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [23]:
vocab_length = len(glove_words)

embedding_matrix = np.zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Print Embedding Matrix shape
embedding_matrix.shape

(400000, 100)

In [24]:
from keras.layers import Embedding, LSTM, Dense

# Neural Network architecture
lstm_model = keras.models.Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))

# Model compiling
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(lstm_model.summary())

# Model Training
lstm_model_history = lstm_model.fit(X_train_tkn_pad, Y_train, batch_size=128, epochs=8, verbose=1, validation_split=0.2)



None
Epoch 1/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 204ms/step - acc: 0.4885 - loss: 0.6934 - val_acc: 0.5439 - val_loss: 0.6870
Epoch 2/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - acc: 0.5429 - loss: 0.6768 - val_acc: 0.5489 - val_loss: 0.6771
Epoch 3/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208ms/step - acc: 0.5847 - loss: 0.6564 - val_acc: 0.5539 - val_loss: 0.6953
Epoch 4/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208ms/step - acc: 0.5951 - loss: 0.6574 - val_acc: 0.5464 - val_loss: 0.6821
Epoch 5/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 210ms/step - acc: 0.5997 - loss: 0.6504 - val_acc: 0.5714 - val_loss: 0.6729
Epoch 6/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208ms/step - acc: 0.6226 - loss: 0.6446 - val_acc: 0.6917 - val_loss: 0.6377
Epoch 7/8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 204ms/step - ac

In [26]:
print('\n Test accuracy = ', lstm_model.evaluate(X_test_tkn_pad,Y_test, verbose=0)[1])


 Test accuracy =  0.6875
