# Practical application: Sentiment analysis of movie reviews


## Table of Contents

##### I. <a href=#I>Preliminary steps</a>
##### II. <a href=#II>Prepare the data</a>
##### III. <a href=#V>Naive discriminative learning model</a>

## I. Preliminary steps <a ID="I"></a> 

### Import necessary libraries and set up the working directory

In [None]:
### Import necessary packages
import os
import csv
import re
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dropout, Dense
from keras.optimizers import Adam, Nadam, RMSprop, SGD
from keras.activations import relu, elu
from keras.losses import categorical_crossentropy
from keras import metrics
from keras.models import load_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import warnings

### Set working directory
# WD is the root folder of the tutorial. The data and result paths defined
# further down are built by prefixing WD to a relative path, so WD has to
# end with a slash.
# Alternative location (Windows drive), kept for reference:
#WD = 'F:/Adnane/Teaching/Tutorials_ooominds/DTM_tutorial/'
WD = '/media/adnane/HDD drive/Adnane/Teaching/Tutorials_ooominds/DTM_tutorial/'
# Make WD the current directory of the process.
# NOTE(review): presumably this is also what makes the local
# deep_text_modelling package importable below -- confirm.
os.chdir(WD)

### Import local packages
import deep_text_modelling.preprocessing as pr
import deep_text_modelling.modelling as md
import deep_text_modelling.evaluation as ev

# Display option for dataframes and matplotlib
pd.set_option('display.max_colwidth', 100) # Max width of columns when displaying a dataframe
# Remember the row limit that was active before it is overwritten below.
# NOTE(review): nothing in this part of the notebook restores it -- confirm
# whether it is used further down.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20 # Max number of rows when displaying a dataframe
# Both calls install an 'ignore' filter for every warning category
warnings.filterwarnings('ignore') # Hide warnings
warnings.simplefilter('ignore')
%matplotlib inline

### Define file paths

In [None]:
# Data folder (the trailing slash is part of the value)
DATA_DIR = f"{WD}Data/"

# IMDB reviews: complete set and its train / validation / test splits
IMDB_FULL_CSV = f"{WD}Data/IMDB_full.csv"
IMDB_TRAIN_CSV = f"{WD}Data/IMDB_train.csv"
IMDB_VALID_CSV = f"{WD}Data/IMDB_valid.csv"
IMDB_TEST_CSV = f"{WD}Data/IMDB_test.csv"

# Index systems for the cues and the outcomes
CUE_INDEX = f"{WD}Data/Cue_index.csv"
OUTCOME_INDEX = f"{WD}Data/Outcome_index.csv"

# Embedding files located inside the data folder
GLOVE_PATH = os.path.join(DATA_DIR, 'glove.6B.100d.txt')
WORD2VEC_PATH = os.path.join(DATA_DIR, 'GoogleNews-vectors-negative300.txt')

### Parameters to use

In [None]:
# Vocabulary limits
N_outcomes = 2   # how many of the most frequent outcomes are kept
N_cues = 2000    # how many of the most frequent words are kept

# Share of the data held out for validation and for testing (one eighth each)
prop_valid = prop_test = 1 / 8

## II. Prepare the data <a name="II"></a> 

### Load the full data set

In [None]:
# Read the complete review file into a dataframe, then preview its first rows
imdb_full = pd.read_csv(filepath_or_buffer=IMDB_FULL_CSV)
imdb_full.head(n=5)

In [None]:
# To speed up training, keep only N_reviews rows: the first N_reviews/2 rows
# and the last N_reviews/2 rows of the dataframe.
# NOTE(review): this gives 2000 reviews per category only if the file is
# ordered by sentiment -- confirm against the data.
N_reviews = 4000
n_half = N_reviews // 2
# Use the actual number of rows instead of a hard-coded 50000, so the cell
# works for any file size and can be re-run without an IndexError
n_total = len(imdb_full)
if N_reviews > n_total:
    raise ValueError(f'Cannot select {N_reviews} reviews from {n_total} rows')
ind_select = list(range(n_half)) + list(range(n_total - n_half, n_total))
# .copy() detaches the selection from the original dataframe, so the columns
# added in the following cells are written to an independent object
imdb_full = imdb_full.iloc[ind_select].copy()
print(f'Number of examples: {len(imdb_full)}')

### Remove special characters

In [None]:
# Alphabet of permitted characters: the 26 lower-case ASCII letters
# (upper-case letters could be added with ENGLISH + ENGLISH.upper())
ENGLISH = "abcdefghijklmnopqrstuvwxyz"
# Pattern that matches any single character outside of that alphabet
not_allowed_symbols = re.compile(f"[^{ENGLISH}]")

In [None]:
# Build the 'review_cleaned' column in three vectorised steps:
#   1. lower-case all letters
#   2. replace every character matched by not_allowed_symbols with a space
#   3. collapse runs of whitespace into a single space
# The whitespace pattern is a raw string: '\s' in a normal string literal is
# an invalid escape sequence and triggers a warning in recent Python versions.
multiple_spaces = re.compile(r'\s+')
imdb_full['review_cleaned'] = (
    imdb_full['review']
    .str.lower()
    .str.replace(not_allowed_symbols, ' ', regex=True)
    .str.replace(multiple_spaces, ' ', regex=True)
)
imdb_full.head(5)

### Create the cues and outcomes

In [None]:
# Cues: the whitespace-separated words of each cleaned review, joined with underscores
imdb_full['cues'] = ["_".join(text.split()) for text in imdb_full['review_cleaned']]
# Outcomes: a copy of the sentiment column
imdb_full['outcomes'] = imdb_full['sentiment']
imdb_full.head(5)

### Final data set

In [None]:
# Drop every column except the two used for modelling
kept_columns = ['cues', 'outcomes']
imdb_full = imdb_full[kept_columns]
imdb_full.head(5)

### Create index systems for the cues and outcomes

In [None]:
# Write the cue and outcome index-system files for the prepared dataframe
pr.create_index_systems_from_df(
    data=imdb_full,
    cue_index_path=CUE_INDEX,
    outcome_index_path=OUTCOME_INDEX,
)

In [None]:
# Load the cue index system, passing N_cues as the token limit
cue_to_index = pr.import_index_system(CUE_INDEX, N_tokens=N_cues)
# Show the entries selected by start=0 and end=5
pr.display_dictionary(cue_to_index, start=0, end=5)

In [None]:
# Import the outcome index system from the file at OUTCOME_INDEX.
# Unlike the cue index, no N_tokens limit is passed here.
outcome_to_index = pr.import_index_system(OUTCOME_INDEX)
# Bare expression: the notebook displays the mapping as the cell output
outcome_to_index

In [None]:
# Reverse the cue dictionary
# NOTE(review): presumably turns the cue -> index mapping into index -> cue -- confirm
index_to_cue = pr.reverse_dictionary(cue_to_index)
# Reverse the outcome dictionary (later cells look up outcome labels in it by integer index)
index_to_outcome = pr.reverse_dictionary(outcome_to_index)
# Bare expression: the notebook displays the reversed mapping as the cell output
index_to_outcome

### Split into training, validation and test sets

In [None]:
# Split the prepared data and write the train, validation and test files
pr.df_train_valid_test_split(
    data=imdb_full,
    train_data_path=IMDB_TRAIN_CSV,
    valid_data_path=IMDB_VALID_CSV,
    test_data_path=IMDB_TEST_CSV,
    p_valid=prop_valid,
    p_test=prop_test,
    seed=1,
)

In [None]:
# Load the train, valid and test sets.
# na_filter=False keeps every field as read instead of converting
# NA-like strings to NaN.
imdb_train = pd.read_csv(IMDB_TRAIN_CSV, sep=',', na_filter=False)
imdb_valid = pd.read_csv(IMDB_VALID_CSV, sep=',', na_filter=False)
# The test set must come from the test file: reading IMDB_VALID_CSV here
# would make the final evaluation run on the validation data
imdb_test = pd.read_csv(IMDB_TEST_CSV, sep=',', na_filter=False)

## III. Naive discriminative learning model <a ID="V"></a> 

### Build a simple NDL model

In [None]:
### Build a simple NDL
# Hyperparameters: 10 iterations over the full training set, learning rate 0.001
p = dict(epochs=10, lr=0.001)

# Fit the model; the call returns the training history and the fitted model
NDL_history_dict, NDL_model = md.train(
    model='NDL',
    data_train=imdb_train,
    data_valid=imdb_valid,
    cue_index=cue_to_index,
    outcome_index=outcome_to_index,
    num_threads=16,
    verbose=1,
    params=p,
    temp_dir=DATA_DIR,
)

In [None]:
# Plot the accuracy learning curve from the training history ('train_valid' setting)
ev.plot_learning_curve(
    history_dict=NDL_history_dict,
    metric='accuracy',
    set='train_valid',
)

### Tune the parameters to find a good model

In [None]:
# Reload the modelling module so that edits made to its source are picked up
# without restarting the kernel. importlib.reload replaces imp.reload: the
# imp module is deprecated and no longer available from Python 3.12 onwards.
import importlib
importlib.reload(md)

In [None]:
### Parameter tuning using grid search
# Candidate values: 8 learning rates x 4 epoch counts
learning_rates = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]
epoch_counts = [1, 2, 4, 6]  # number of iterations on the full set
p = {'lr': learning_rates, 'epochs': epoch_counts}

### Grid search
# File that receives the tuning results
TUNING_PATH = f"{WD}Results/grid_search_NDL_imdb.csv"
md.grid_search(
    model='NDL',
    data_train=imdb_train,
    data_valid=imdb_valid,
    cue_index=cue_to_index,
    outcome_index=outcome_to_index,
    params=p,
    prop_grid=0.2,
    shuffle_grid=True,
    tuning_output_file=TUNING_PATH,
    temp_dir=DATA_DIR,
    num_threads=16,
)

### Assessing the grid search

In [None]:
# Read the grid-search output file for analysis
gs_results = pd.read_csv(TUNING_PATH, index_col=False)

# One row per processed parameter combination: show how many there are
gs_results.shape[0]

In [None]:
# Preview the first five rows of the tuning results
gs_results.head(5)

In [None]:
# Best value reached on each metric over all processed combinations
best_val_acc = gs_results['val_acc'].max()
best_f1score = gs_results['f1score'].max()
print(f"- Highest validation accuracy: {best_val_acc}")
print(f"- Highest validation f1-score: {best_f1score}")

In [None]:
# Get the best parameters: the row with the highest validation accuracy
val_acc_column = gs_results['val_acc']
i_best = val_acc_column.argmax()
gs_results.iloc[i_best]

### Retraining with the best parameters

In [None]:
### Hyperparameters to use
# 3 iterations over the full training set, learning rate 0.001
p = dict(epochs=3, lr=0.001)

# Fit the model; the call returns the training history and the fitted model
NDL_hist, NDL_model = md.train(
    model='NDL',
    data_train=imdb_train,
    data_valid=imdb_valid,
    cue_index=cue_to_index,
    outcome_index=outcome_to_index,
    num_threads=16,
    verbose=1,
    params=p,
)

In [None]:
# Destination files for the model (HDF5) and for the training history
MODEL_PATH = f"{WD}Results/NDL_imdb.h5"
HISTORY_PATH = f"{WD}Results/NDL_history_dict_imdb"

# Export both objects
md.export_model(model=NDL_model, path=MODEL_PATH)
md.export_history(history_dict=NDL_hist, path=HISTORY_PATH)

# Remove the in-memory model and history dictionary
del NDL_model
del NDL_hist

In [None]:
# Locations of the saved model and training history
MODEL_PATH = f"{WD}Results/NDL_imdb.h5"
HISTORY_PATH = f"{WD}Results/NDL_history_dict_imdb"

# Read both back from disk
NDL_model = md.import_model(MODEL_PATH)
NDL_history_dict = md.import_history(path=HISTORY_PATH)

### Evaluate the final model

In [None]:
# Predict the outcome probabilities for one cue sequence
# (cues are words joined with underscores)
cue1_seq = 'it_is_the_worst_film_ever'
outcome1_prob_pred = ev.predict_proba_oneevent_NDL(
    model=NDL_model,
    cue_seq=cue1_seq,
)
print(outcome1_prob_pred) # vector of predicted probabilities
# Pair each probability with its outcome label (outcome indices start at 1)
print({index_to_outcome[rank]: prob
       for rank, prob in enumerate(outcome1_prob_pred, start=1)})

The predicted probability of neg seems low (the model is not very confident) because we trained it on a small dataset (3000 examples). One way to increase the confidence of the model is to add a temperature parameter to the softmax function and set it to a low value.

In [None]:
# Predict the outcome probabilities for the same cue sequence, this time
# passing a temperature of T = 0.1
cue1_seq = 'it_is_the_worst_film_ever'
outcome1_prob_pred = ev.predict_proba_oneevent_NDL(
    model=NDL_model,
    cue_seq=cue1_seq,
    T=0.1,
)
print(outcome1_prob_pred) # vector of predicted probabilities
# Pair each probability with its outcome label (outcome indices start at 1)
print({index_to_outcome[rank]: prob
       for rank, prob in enumerate(outcome1_prob_pred, start=1)})

In [None]:
### Evaluate the model on the test set
# Reference labels taken from the 'outcomes' column of the test dataframe
y_test = imdb_test['outcomes'].tolist()

# Labels predicted by the model for the same dataframe
y_pred = ev.predict_outcomes_NDL(
    model=NDL_model,
    data_test=imdb_test,
    num_threads=16,
    temp_dir=DATA_DIR,
    remove_temp_dir=False,
)

# Share of correctly predicted examples over the whole test dataframe
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

In [None]:
# Test accuracy per class
# Confusion matrix with rows/columns ordered like the keys of outcome_to_index
outcome_labels = list(outcome_to_index)
cmat = confusion_matrix(y_test, y_pred, labels=outcome_labels)
# Correct predictions of each class divided by the number of true examples of that class
cmat_diag = cmat.diagonal() / cmat.sum(axis=1)
print({index_to_outcome[rank]: class_acc
       for rank, class_acc in enumerate(cmat_diag, start=1)})