## Sentiment classification

Using a subset of the aclImdb dataset, this notebook builds and trains a text classifier that labels each review as positive or negative.


In [None]:
# THIS CELL CREATES A RANDOM 10% SUBSET OF THE WHOLE aclImdb DATASET.
# Each of the four subfolders (train/test x pos/neg) of ../../Data/aclImdb_subset
# is built only when it does not exist yet, so once the subset is complete
# this cell does nothing (and the full dataset is no longer needed).
# NOTE: if a copy is interrupted, delete the partially filled subfolder before re-running.

import os
import shutil
import random

full_root = "../../Data/aclImdb/"
subset_root = "../../Data/aclImdb_subset/"

for train_or_test in ('train', 'test'):
    for pos_or_neg in ('pos', 'neg'):
        in_folder = full_root + train_or_test + "/" + pos_or_neg + "/"
        out_folder = subset_root + train_or_test + "/" + pos_or_neg + "/"

        if os.path.isdir(out_folder):
            continue  # this part of the subset was already created

        # sorted(): os.listdir returns entries in arbitrary order, which would
        # make the seeded sample below differ from machine to machine
        file_names = sorted(os.listdir(in_folder))
        print(in_folder, len(file_names))

        # the destination must exist before shutil.copy can copy into it
        os.makedirs(out_folder)

        # sample() draws distinct files, so no index can repeat or run past the
        # end of the list; a private seeded generator keeps the draw
        # reproducible without touching the global `random` state
        chosen = random.Random(123).sample(file_names, len(file_names) // 10)
        for file_name in chosen:
            shutil.copy(in_folder + file_name, out_folder)

In [None]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

import string
from tensorflow import keras
import os 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

# English stop words from NLTK's stopwords corpus; txt_preprocessing() drops these tokens.
# NOTE(review): the corpus must be available locally (e.g. via nltk.download('stopwords')) — confirm for your environment
nltk_stopw = stopwords.words('english')


### Read the Text Corpus 


In [None]:

# Root folder of the subset; expected layout is <data>/<split>/<neg|pos>/<review files>
data = "../../Data/aclImdb_subset/"
# Class index -> name of the folder holding the reviews of that class
labelToName = { 0 : 'neg', 1: 'pos' }
def getMovies(split):
    '''
    Read every review of one split ('train' or 'test') from the `data` folder.

    inputs:
    split: name of the sub-folder of `data` to read
    outputs:
    X_raw: list of reviews (empty files are skipped)
    Y: target array; len(Y)=len(X_raw); values are the keys of labelToName
    '''
    X_raw, Y  = [], []

    # labelToName is the single source of truth for the class index <-> folder mapping
    for classIndex, directory in labelToName.items():
        dirName = data + split + "/" + directory
        # sorted(): os.listdir returns entries in arbitrary order; a fixed order
        # keeps X_raw[0] and any later seeded shuffle/split reproducible
        for reviewFile in sorted(os.listdir(dirName)):
            filePath = dirName + '/' + reviewFile
            if not os.path.isfile(filePath):
                continue  # e.g. a .ipynb_checkpoints directory
            with open(filePath, 'r', encoding='utf8') as f:
                raw = f.read()
            if len(raw) == 0:
                continue
            X_raw.append(raw)
            Y.append(classIndex)
    return X_raw, np.array(Y)

# Load the labelled reviews of the 'train' folder; they are split into
# train and validation parts in a later cell
X_raw, Y = getMovies(split='train')

# Reviews of the 'test' folder, loaded the same way
X_raw_test, Y_test = getMovies(split='test')




In [None]:
# Length in characters of every review, per split
n_char_train = list(map(len, X_raw))
n_char_test = list(map(len, X_raw_test))

# One summary line per split: number of reviews and min / max / median length
for split_tag, split_reviews, split_lengths in (('TRAIN: ', X_raw, n_char_train),
                                                ('TEST: ', X_raw_test, n_char_test)):
    print(split_tag, len(split_reviews), ' reviews; ',
          'minimum length = ', min(split_lengths),
          ', max length = ', max(split_lengths),
          ', median', np.median(split_lengths), 'characters')

# Show the first training review together with its label name
first_label_name = labelToName[Y[0]]
print('\n \n TEXT \n', X_raw[0], '\n LABEL =', first_label_name)

### Text preprocessing
Lowercase, tokenize (punctuation is dropped), remove stop words, lemmatize.

In [None]:
def get_pos(pos):
    '''
    Translate an nltk.pos_tag() tag into the part-of-speech code accepted by nltk.WordNetLemmatizer().

    Only the first letter of the tag is looked at; any tag that does not start
    with J, V, N or R falls back to 'n'.
    '''
    first_letter_to_wordnet = {
        'J': 'a',  # or wordnet.ADJ
        'V': 'v',  # or wordnet.VERB
        'N': 'n',  # or wordnet.NOUN
        'R': 'r',  # or wordnet.ADV
    }
    # pos[:1] is '' for an empty tag, which simply misses the table -> default
    return first_letter_to_wordnet.get(pos[:1], 'n')

def txt_preprocessing(X, printa=False):
    '''
    Normalise raw texts: lowercase, tokenize, drop stop words, lemmatize, re-join.

    inputs:
    X: list of strings, one per document
    printa: if True, print document 0 after every step (nothing is printed when X is empty)
    outputs:
    list of strings, len = len(X); each string is the space-separated lemmas of one document
    '''
    i = 0 #text to print
    show = printa and len(X) > i  # X[i] does not exist for an empty input

    # Built once and reused for every document instead of once per document
    tokenizer = RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b')
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = set(nltk_stopw)  # set membership instead of scanning a list for every token

    #lowcase
    X = [x.lower() for x in X]
    if show: print(X[i],'\n')

    # tokenize: a token is a letter followed by 2 to 14 letters/digits, so only
    # tokens of 3 to 15 characters are kept; punctuation and special chars are excluded
    X = [tokenizer.tokenize(x) for x in X]
    if show: print(X[i],'\n')

    #remove stop words
    X = [[token for token in x if token not in stop_words] for x in X]
    if show: print(X[i],'\n')

    # POS-tag every token: the lemmatizer needs to know the part of speech
    X = [nltk.pos_tag(x) for x in X]
    if show: print(X[i],'\n')

    # map POS tags to work with nltk.WordNetLemmatizer()
    X = [[(word, get_pos(tag)) for word, tag in x] for x in X]
    if show: print(X[i],'\n')

    # lemmatize
    X = [[lemmatizer.lemmatize(word, p) for word, p in x] for x in X]
    if show: print(X[i],'\n')

    # reshape as a list of sentences: [['this','is','string','1'], ['this','is','string','2']...] --> ['this is string 1','this is string 2'...]
    X = [" ".join(x) for x in X]
    if show: print(X[i])

    return X

In [None]:
# Show the output of every step of txt_preprocessing on the first training review
txt_preprocessing([X_raw[0]], printa=True)

In [None]:
# Preprocess the train and test reviews (about 1 minute run)
X = txt_preprocessing(X_raw)
X_test = txt_preprocessing(X_raw_test)


In [None]:
# First training review before and after preprocessing
print(X_raw[0],'\n\n',X[0])

In [None]:
# Train/validation split: 20% of the preprocessed 'train' reviews are held out
# for validation (fixed random_state)
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=123)
# Labels are 0 (neg) / 1 (pos), so the mean is the share of positive reviews in each part
print(Y_train.mean(), Y_val.mean())

Up to the cell above, the code is shared by all the sections below.

## Recurrent model with sequential data

In [None]:
# Every review is encoded as exactly `sequence_length` token indexes
max_length = 100
sequence_length = max_length

# Text -> integer indexes; the vocabulary is learnt from the training reviews only
vectorize_layer = keras.layers.TextVectorization(output_mode="int", # every token is assigned an index
                                                 output_sequence_length=sequence_length)
vectorize_layer.adapt(X_train)

vocab_size = vectorize_layer.vocabulary_size()
print('distinct tokens (including 1 for UNK and 0 for padded tokens) = ', vocab_size)

# Encode the three parts with the same fitted layer
X_train_encoded, X_val_encoded, X_test_encoded = (
    vectorize_layer(texts) for texts in (X_train, X_val, X_test)
)

for array_name, encoded_array in (('X_train_encoded', X_train_encoded),
                                  ('X_val_encoded', X_val_encoded),
                                  ('X_test_encoded', X_test_encoded)):
    print(array_name + '.shape  = ', encoded_array.shape)

# First encoded training review
print('\n  = ',X_train_encoded[0])

In [None]:
# A Simple Model for LSTM: Embedding -> LSTM -> single sigmoid unit (binary output)
model = keras.models.Sequential()
# number of params of the embedding layer is vocab_size*output_dim
embedding = keras.layers.Embedding(  input_dim= vocab_size,
                                                output_dim=50, # 128
                                                input_length=sequence_length, trainable=True ,
                                                mask_zero=True) # mask_zero: index 0 is the special padding value and is masked out
model.add(embedding)
model.add(keras.layers.LSTM(units=150, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()

In [None]:
#fit the model (up to 10 min)
epoche=10   # maximum number of epochs
b_size=32   # batch size
verb=1      # verbosity of fit() and of the early-stopping callback
# Stop when the validation accuracy has not improved for 5 epochs.
# restore_best_weights=True puts back the weights of the best validation epoch,
# so the test accuracy below is not measured on the (possibly worse) last epoch.
# NOTE(review): some Keras releases restore the weights only when training is
# actually stopped early — check the behaviour of the installed version.
es = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=verb, patience=5,
                                   restore_best_weights=True)
# Alternative: keras.callbacks.ModelCheckpoint(filepath=..., monitor=..., save_best_only=True)
# runs all the epochs but saves only the best model, whereas EarlyStopping may stop before the last epoch.
history=model.fit(X_train_encoded,Y_train,
                  epochs=epoche,
                  validation_data=(X_val_encoded,Y_val),
                  batch_size=b_size,
                  callbacks=[es],
                  verbose=verb)

# evaluate() returns [loss, accuracy] for the metrics given to compile(); index 1 is the accuracy
print('\n Test accuracy = ', model.evaluate(X_test_encoded,Y_test, verbose=0)[1])