## Sentiment classification with Neural Network

Using a subset of the aclImdb dataset this notebook builds and trains a text classification engine.


In [1]:
# THIS CELL IS USED TO CREATE A SUBSET OF THE WHOLE aclImdb DATASET
# If the data/aclImdb_subset directory exists this cell does nothing.
# SET THE VARIABLES IN THE MIDDLE OF THE CELL to create each subfolder.

import os
import shutil
import random

if not os.path.isdir("../../Data/aclImdb_subset/"):
    print(len(os.listdir("../../Data/aclImdb/train/pos")))
    print(len(os.listdir("../../Data/aclImdb/train/neg")))
    print(len(os.listdir("../../Data/aclImdb/test/pos")))
    print(len(os.listdir("../../Data/aclImdb/test/neg")))

    train_pos_files = os.listdir("../../Data/aclImdb/train/pos")
    train_neg_files = os.listdir("../../Data/aclImdb/train/neg")
    test_pos_files = os.listdir("../../Data/aclImdb/test/pos")
    test_neg_files = os.listdir("../../Data/aclImdb/test/neg")


    # SET THESE 3 VARIABLES
    train_or_test = 'test'
    pos_or_neg = 'neg'
    file_names = test_neg_files
    ############################

    in_folder = "../../Data/aclImdb/" + train_or_test + "/" + pos_or_neg + "/"
    out_folder = "../../Data/aclImdb_subset/" + train_or_test + "/" + pos_or_neg + "/"

    used_indexes = []
    for i in range(int(len(os.listdir(in_folder))/10)):
        index = random.randint(0, len(os.listdir(in_folder)))
        while index in used_indexes:
            index = random.randint(0, len(os.listdir(in_folder)))
        file_ = in_folder + file_names[index]
        shutil.copy(file_ , out_folder)
        used_indexes.append(index)

In [2]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

import string
from tensorflow import keras
import os 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

nltk_stopw = stopwords.words('english')


### Read the Text Corpus 


In [3]:

data = "../../Data/aclImdb_subset/"
labelToName = { 0 : 'neg', 1: 'pos' }
def getMovies(split):
    '''
    outputs:
    X_raw: list of reviews
    Y: target array; len(Y)=len(X_raw)
    '''
    X_raw, Y  = [], []

    for classIndex, directory in enumerate(['neg', 'pos']):
        dirName = data + split + "/" + directory
        for reviewFile in os.listdir(dirName):
            with open (dirName + '/' + reviewFile, 'r', encoding='utf8') as f:
                raw = f.read()
                if (len(raw) == 0):
                    continue
            X_raw.append(raw)
            Y.append(classIndex)
    return X_raw, np.array(Y)

# We will split later in train and val
X_raw, Y = getMovies(split='train')

X_raw_test, Y_test = getMovies(split='test')




In [4]:
n_char_train = [len(x) for x in X_raw]
n_char_test = [len(x) for x in X_raw_test]
print('TRAIN: ', len(X_raw),' reviews; ','minimum length = ', min(n_char_train), ', max length = ',max(n_char_train), ', median',np.median(n_char_train), 'characters')
print('TEST: ', len(X_raw_test),' reviews; ','minimum length = ', min(n_char_test), ', max length = ',max(n_char_test), ', median',np.median(n_char_test), 'characters')

print('\n \n TEXT \n',X_raw[0],'\n LABEL =', labelToName[Y[0]])

TRAIN:  2494  reviews;  minimum length =  81 , max length =  8969 , median 975.0 characters
TEST:  2496  reviews;  minimum length =  32 , max length =  12988 , median 961.0 characters

 
 TEXT 
 A young scientist is trying to carry on his dead father's work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10. 
 LABEL = neg


### Text preprocessing
lowcase, tokenize, remove punctuations, lemmatize

In [5]:
def get_pos(pos):
    '''
    Convert nltk.pos_tag() tags  so that they can be understood by pos tags by nltk.WordNetLemmatizer()
    '''
    if pos.startswith('J'):
        return 'a' # o wordnet.ADJ
    elif pos.startswith('V'):
        return 'v' # o wordnet.VERB
    elif pos.startswith('N'):
        return 'n' # o wordnet.NOUN
    elif pos.startswith('R'):
        return 'r' # o wordnet.ADV
    else:          
        return 'n' # default 

def txt_preprocessing(X, printa=False):
    i = 0 #text to print
    #lowcase
    X = [x.lower() for x in X]
    if printa: print(X[i],'\n')

    # tokenize: token are made of strings or of alphanumerical strings; punctuaction and special chars are excluded.
    # token with <=2 or >14 chars are removed
    X = [RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(x) for x in X] # or [re.findall(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b',x) for x in X]
    if printa: print(X[i],'\n')

    #remove stop words
    X = [(lambda x: [x_i for x_i in x if x_i not in nltk_stopw])(x) for x in X] # alternatively list(map(lambda x: ([x_i for x_i in x if x_i not in nltk_stopw]),X))
    if printa: print(X[i],'\n')

    # lemmatization using POS
    X = [nltk.pos_tag(x) for x in X]
    if printa: print(X[i],'\n')

    # map POS tags to work with nltk.WordNetLemmatizer()
    X = [ (lambda x: [(x_i[0],get_pos(x_i[1])) for x_i in x])(x) for x in X]
    if printa: print(X[i],'\n')

    # lemmatize
    X = [(lambda x: [nltk.WordNetLemmatizer().lemmatize(w,p) for w,p in x])(x) for x in X]
    if printa: print(X[i],'\n')

    # reshape as a list of sentences: [['this','is','string','1'], ['this','is','string','2']...] --> ['this is string 1','this is string 2'...]
    X = [" ".join(x) for x in X]
    if printa: print(X[i])

    return X

In [6]:
# see all passages in txt_processing
txt_preprocessing([X_raw[0]], printa=True)

a young scientist is trying to carry on his dead father's work on limb regeneration.his overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.a young doctor uses reptilian dna he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...admittedly the special effects in "severed ties" are pretty good and grotesque,but the rest of the film is awful.the severed arm is behaving like a snake and kills few people.big deal.the acting is mediocre and the climax is silly.3 out of 10. 

['young', 'scientist', 'trying', 'carry', 'his', 'dead', 'father', 'work', 'limb', 'regeneration', 'his', 'overbearing', 'mother', 'has', 'convinced', 'him', 'that', 'murdered', 'his', 'own', 'father', 'and', 'monitoring', 'his', 'progress', 'for', 'her', 'own', 'evil', 'purposes', 'young', 'doctor', 'uses', 'reptilian', 'dna', 'extracts'

['young scientist try carry dead father work limb regeneration overbear mother convince murder father monitoring progress evil purpose young doctor use reptilian dna extract large creature arm conveniently rip minute later injects formula grow new murderous arm admittedly special effect sever tie pretty good grotesque rest film awful sever arm behaving like snake kill people big deal act mediocre climax silly']

In [7]:
# 1 minute run
X = txt_preprocessing(X_raw)
X_test = txt_preprocessing(X_raw_test)


In [8]:
print(X_raw[0],'\n\n',X[0])

A young scientist is trying to carry on his dead father's work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10. 

 young scientist try carry dead father work limb regeneration overbear mother convince murder father monitoring progress evil purpose young doctor use reptilian dna extract large creature arm conveniently rip minute later injects formula grow new murderous arm admittedly special effect sever tie pretty good grotesque rest film awful sever

In [9]:
# Test/Train Split
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=123)
print(Y_train.mean(), Y_val.mean())

0.4907268170426065 0.5390781563126252


Up until the cell above the code is in common for setions below.

## Classic model with TF-IDF

In [10]:
# Build tf-Idf vectors with sklearn

vectorizer = TfidfVectorizer().fit(X_train)
X_train_vect = vectorizer.transform(X_train).toarray()
X_val_vect = vectorizer.transform(X_val).toarray()
X_test_vect = vectorizer.transform(X_test).toarray()
print('X_train shape', X_train_vect.shape)
print('vocab length = ', len(vectorizer.vocabulary_))


X_train shape (1995, 20431)
vocab length =  20431


In [11]:
# define a Simple Model 
model = keras.models.Sequential()
model.add(keras.layers.Dense(16, activation='relu', input_shape=(X_train_vect.shape[1],)))
model.add(keras.layers.Dense(1, activation='sigmoid'))# oppure 2 e softmax, e sotto sparse_categorical_crossentropy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [12]:
# fit the model
epoche=10
b_size=32
verb=1
es = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=verb, patience=5)
#es=callbacks.ModelCheckpoint(filepath='./nnet_for.hdf5', monitor='val_mean_squared_error', verbose=2, save_best_only=True) # con questo fa tutte le epoche ma salva il migliore. SOpra può fermarsi prima di fine epoche
history=model.fit(X_train_vect,Y_train,
					epochs=epoche,
					validation_data=(X_val_vect,Y_val),
					batch_size=b_size,
					callbacks=[es],
					verbose=verb)

print('\n Test accuracy = ', model.evaluate(X_test_vect,Y_test, verbose=0)[1])

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5982 - loss: 0.6869 - val_accuracy: 0.8417 - val_loss: 0.6515
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9480 - loss: 0.6019 - val_accuracy: 0.8537 - val_loss: 0.5898
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9653 - loss: 0.4806 - val_accuracy: 0.8477 - val_loss: 0.5259
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9786 - loss: 0.3628 - val_accuracy: 0.8617 - val_loss: 0.4715
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9910 - loss: 0.2633 - val_accuracy: 0.8557 - val_loss: 0.4347
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9962 - loss: 0.1985 - val_accuracy: 0.8497 - val_loss: 0.4112
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━