In [590]:
import numpy as np
import nltk
import os
import torch
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
import os
import os.path
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag

from nltk.cluster.kmeans import KMeansClusterer
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import re
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, TensorDataset

nltk.download('wordnet')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christianpollitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christianpollitt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/christianpollitt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [591]:
class Functions:
    def __init__(self):
        self.path = "./product_reviews/"
         
    def read_data_raw(self, path: str) -> list:         
        """
        Read files from a directory and then append the data of each file into a list.
        """
        corpus_root = path
        corpora = PlaintextCorpusReader(corpus_root, '.*')
        
        return corpora
    
    def process_review(self, review, stem=False, lem=False) -> list:
        
        # Tokenize
        tokens = word_tokenize(review)
        # Case fold and Remove Stop Words
        stopwords = nltk.corpus.stopwords.words('english')
        processed_list = [w for w in tokens if w.lower() not in stopwords and w.isalpha() and (len(w) > 1) and  w.lower()]
        
        if (stem):
            stemmer = PorterStemmer()
            processed_list = [stemmer.stem(x) for x in processed_list]
        
        if (lem):
            lemmatizer = WordNetLemmatizer()
            # lemmatization
            processed_list = [lemmatizer.lemmatize(x) for x in processed_list]
        

        return processed_list
    
    def extract_label(self, review: str) -> str:  
        score = 0
        # print(review)
        ratings = re.findall(r'\[.[0-9]\]', review)
        
        # If no rating given, set to NA
        label = "NA"
        
        
        
        # Collate a total score for reviews, apply a class label
        for rate in ratings:
            rate = str(rate).replace('[','')
            rate = str(rate).replace(']','')
            pos_sign = '+'
            neg_sign = '-'
            if (pos_sign in rate):
                rate = rate.replace('+',"")
                rate = int(rate)
            elif (neg_sign in rate):
                rate = rate.replace('-',"")
                rate = int(rate)
                rate = -1*rate
                
            score += int(rate)
        
        if (score > 0):
            label = 1
        elif (score < 0):
            label = 0
        elif (score == 0):
            label = -1
            
        return label
    
    def prepare_reviews_and_labels(self, review_files):
        # For each file in the folder
        review_bank = []
        processed_review_bank = []
        review_labels = []
        for file in review_files:
            with open(("product_reviews/"+file), "r+", encoding='utf-8') as f:
                
                # Put all the file data into separate lines
                lines = f.readlines()
                n_lines = len(lines)
                n_reviews = 0
                review_data = []
        
                # For each line in the file, check if it contains a start of the review tag
                for i in range(n_lines):
                    if "[t]" in lines[i]:
                        # If there is sentences from the last tag
                        if (len(review_data) != 0):
                            review_bank.append(review_data)
                            # Restart the compilation of sentences
                        review_data = []    
                        n_reviews += 1
                        # Create a place to store the upcoming sentences
                    else:
                        # Get the sentence and append it to to the last documented review
                        sentence = lines[i].split("##")
                        review_data.append(sentence)
                        
                # For each Review, Process and Save review
                for review in review_data:
                    # Process Each Review, get the label (by processing it as a string for scores)
                    review_string = ''.join(review)
                    label = self.extract_label(review_string)
                    
                    # Get a list of terms in the review
                    processed_review = self.process_review(review_string, True, True)
                    processed_review_bank.append(processed_review)
                    review_labels.append(label)

        return processed_review_bank, review_labels
    
    def prepare_train_test_data(self, features_positive, features_negative):
        """
        Split data into train validation and testing
        """
        ratio_traintest = 0.8
        # Get the data for train and test
        df_positive_train, df_positive_test = train_test_split(features_positive, train_size = ratio_traintest, random_state = 1)
        df_negative_train, df_negative_test = train_test_split(features_negative, train_size = ratio_traintest, random_state = 1)
        
        df_train = pd.concat([df_positive_train, df_negative_train], ignore_index=True, sort=False)
        df_test = pd.concat([df_positive_test, df_negative_test], ignore_index=True, sort=False)
        
        # Split the test into validation / test data
        df_test, df_validation =  train_test_split(df_test, train_size = 0.5, random_state = 1)

        return df_train, df_test, df_validation
    
    
    def prepare_train_test_data_kfold(self, features_test):
        """
        Split data into train validation and testing
        """
        
        
        # Split the test into validation / test data
        df_test, df_validation =  train_test_split(features_test, train_size = 0.5)
        
        return df_test, df_validation
    
    def prepare_for_pytorch(self, train_x, train_y, validation_x, validation_y, test_x, test_y):
        """
        Split data into train validation and testing
        """
        batch_size = 16
        
        # Format data to correct dimensions for Data Loader
        train_x = np.vstack(train_x).astype(np.int64)
        train_y = np.vstack(train_y).astype(np.int64)
        validation_x = np.vstack(validation_x).astype(np.int64)
        validation_y = np.vstack(validation_y).astype(np.int64)
        test_x = np.vstack(test_x).astype(np.int64)
        test_y = np.vstack(test_y).astype(np.int64)
        
        train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
        validation_data = TensorDataset(torch.from_numpy(validation_x), torch.from_numpy(validation_y))
        test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

        train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        validation_loader = DataLoader(validation_data, shuffle=True, batch_size=batch_size)
        test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
        
        return train_loader, validation_loader, test_loader

    
    def prepare_review_vocab(self, review_bank):
        """
        Word embedding, by creating a numbered dictionary to later convert to one-hot vectors
        """
        all_words = []
        
        # Combine all the data to make a Vocabulary
        for review in review_bank:
            review_words = []
            for i, word in enumerate(review):
                review_words.append(review[i])
            all_words.extend(review_words)
            
        # Create vocab
        vocab = sorted(set(all_words))
        vocab_size = len(vocab)
             
        # Create Integers for each word
        vocab_int_dict = {word: i+1 for i, word in enumerate(vocab)}
        
        # Convert Reviews using One-Hot representation
        reviews_oneh_indexs = []
        for review in review_bank:
            review_index_form = []
            for word in review:
                # Find the index in the dictionary 
                index = vocab_int_dict[word]
                review_index_form.append(index)
                
            reviews_oneh_indexs.append(review_index_form)   
            
        return vocab_int_dict, reviews_oneh_indexs
    
    def prepare_padded_reviews(self, reviews_as_index):
        """
        Padding so that reviews of varied length can be processed
        """
        # print(reviews_as_index)
        # Fine the largest Review
        max_length = 0
        for single_review_int in reviews_as_index:
        # Establish the max length
            if (len(single_review_int) > max_length):
                max_length = len(single_review_int)
                
        # Convert to One-Hot form
        one_hot_vec = np.zeros((len(reviews_as_index), max_length), dtype=int)
                
        for i, review in enumerate(reviews_as_index):
            review_length = len(review)
            # If we need to do any padding at all
            if review_length <= max_length:
                extension_length = max_length-review_length
                padding = list(np.zeros(extension_length))
                padded_review = padding+review
        
            one_hot_vec[i, :] = np.array(padded_review)

        return one_hot_vec, max_length


      

In [592]:
import torch.nn as nn

class SentimentLSTM(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        # dropout layer, a hyperparameter which I shall modify lots
        self.dropout = nn.Dropout(0.2)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
      
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden
 


In [593]:
def train_evaluate_lstm(lstm, train_loader, evaluation_loader, max_length):
    
    lr = 0.001
    epochs = 7
    clip = 5
    print_every = 200
    batch_size = 16
    criterion = nn.BCELoss()
    optimizer = torch.optim.AdamW(lstm.parameters(), lr=lr)
    CUDA_VISIBLE_DEVICES=""
    counter = 0
    lstm.train()

    # Train for the allocated epochs
    for e in range(epochs):
        # Initialize hidden states
        h = lstm.init_hidden(batch_size)

        # batch loop
        for inputs, labels in train_loader:
            if((inputs.shape[0], inputs.shape[1]) != (batch_size, max_length)):
                #print('Training - Input Shape Issue:', inputs.shape)
                continue

            counter += 1

            # Create the hidden states
            h = tuple([each.data for each in h])

            lstm.zero_grad()

             # Make predictions // get the output from the model
            inputs = inputs.type(torch.LongTensor)
            output, h = lstm(inputs, h)
            
            # Calculate loss
            loss = criterion(output.unsqueeze(1), labels.float())
            loss.backward()

            nn.utils.clip_grad_norm_(lstm.parameters(), clip)
            optimizer.step()

            # Compute loss statistics
            if counter % print_every == 0:

                # Get validation loss
                val_h = lstm.init_hidden(batch_size)
                val_losses = []
                lstm.eval()
                for inputs, labels in evaluation_loader:
                    if((inputs.shape[0], inputs.shape[1]) != (batch_size, max_length)):
                        # print('Validation - Input Shape Issue:', inputs.shape)
                        continue

                    # Reinitialize hidden states
                    val_h = tuple([each.data for each in val_h])

                    inputs = inputs.type(torch.LongTensor)
                    output, val_h = lstm(inputs, val_h)
                    val_loss = criterion(output.unsqueeze(1), labels.float())

                    val_losses.append(val_loss.item())

                lstm.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

    return lstm, criterion

In [594]:
def test_lstm_model(lstm, criterion, test_loader, max_length):
    """
    Test the performance of the model, using test data
    """
    test_losses = []
    num_correct = 0
    batch_size = 16
    
    # Create the hidden states
    h = lstm.init_hidden(batch_size)

    lstm.eval()
    for inputs, labels in test_loader:

        if((inputs.shape[0], inputs.shape[1]) != (batch_size, max_length)):
            # print('Testing - Input Shape Issue:', inputs.shape)
            continue

        # Reinitialize hidden layer
        h = tuple([each.data for each in h])

        # Make predictions // get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output, h = lstm(inputs, h)

         # Calculate the loss and then perform backpropagation
        test_loss = criterion(output.unsqueeze(1), labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to Positive-1 or Negative-0 Review label
        pred = torch.round(output.squeeze())

        # Checking predictions againts true review labels
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.numpy())
        num_correct += np.sum(correct)

    print("Test loss: {:.2f}".format(np.mean(test_losses)))
    test_acc = num_correct/len(test_loader.dataset)
    print("Test accuracy: {:.2f}".format(test_acc))
    
    return test_acc

In [595]:
lstm_net = None

def main():
    def iteration() -> int:
        """Main Call / Demonstration -----------------------------------------"""
        
        index = Functions() # initilaise the index
        
        # 1. Data Extraction
        # Get the working Directory and change to it
        cwd = os.getcwd()
        os.chdir(cwd)
        
        # Get the list of Reviews
        all_review_files = os.listdir("./product_reviews/")
        # Remove Noisey file (incorrectly labelled and also the README.txt)
        all_review_files.remove('README.txt')
        all_review_files.remove('Canon_PowerShot_SD500.txt')
        review_bank, review_labels = index.prepare_reviews_and_labels(all_review_files)
        
        # Get the Reviews as Word Embeddings in Int form
        vocab_dict, reviews_as_index = index.prepare_review_vocab(review_bank)
        
        # Pad the Reviews so that they are all the same length and get in One-Hot form 
        one_hot_features, max_length = index.prepare_padded_reviews(reviews_as_index)
            
        # 2. Data Preprocessing
        # Get the reviews using indexs of positive/negative
        neutral_review_indexs = [i for i,x in enumerate(review_labels) if x==-1]
        # Separate Positive / Negative Reviews
        pos_review_indexs = [i for i,x in enumerate(review_labels) if x==1]
        neg_review_indexs = [i for i,x in enumerate(review_labels) if x==0]
        
        # Get the One-hot features for given indexs
        list_positive = [one_hot_features[index] for index in pos_review_indexs]
        list_negative = [one_hot_features[index] for index in neg_review_indexs]

        # Make dataframes for positive data -> 'review' | 'y'
        pos_labels = []
        for i in range(len(list_positive)):
            pos_labels.append(1)
            
        neg_labels = []
        for i in range(len(list_negative)):
            neg_labels.append(0)
            
        df_positive  = pd.DataFrame(list(zip(list_positive, pos_labels)), columns=['review', 'y'])
        df_negative= pd.DataFrame(list(zip(list_negative, neg_labels)), columns=['review', 'y'])
        df_all  = df_positive.append(df_negative)
        
        # Make dataframes for Negative data -> 'review' | 'y'
        
        print("Performing LSTM Analysis using 80/20 Train/Test split")

        # Separate trainin/test data
        training_data, testing_data, validation_data = index.prepare_train_test_data(df_positive, df_negative)
        
        # 3. Prepare for LSTM Single Run
        train_x = training_data['review'].to_numpy()
        train_y = training_data['y'].to_numpy()
        validation_x = validation_data['review'].to_numpy()
        validation_y = validation_data['y'].to_numpy()
        test_x = testing_data['review'].to_numpy()
        test_y = testing_data['y'].to_numpy()
        
        train_loader, valid_loader, test_loader  = index.prepare_for_pytorch(train_x, train_y, validation_x, validation_y, test_x, test_y)  
        
        lstm = SentimentLSTM(len(vocab_dict), 1, 100, 512, 3)
        lstm, criterion = train_evaluate_lstm(lstm, train_loader, valid_loader, max_length)
        acc_single = test_lstm_model(lstm, criterion, test_loader, max_length)
        
        print("Random Selection of 80/20 Train/Test split |  Test Accuracy: " + str(acc_single))
        print("")
        print("")
        
        
        print("Performing LSTM Analysis using 5 Fold CV")
        
        
        
        
        """5 Fold CV -----------------------------------------"""
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=5, shuffle=True)
        
        X = df_all['review'].to_numpy()
        y = df_all['y'].to_numpy()
        
        cv_accs = []
        
        for train_index, test_index in kf.split(df_all):
            # print("TRAIN:", train_index, "TEST:", test_index)
            print("Performing Fold")
            
            # Separate the train/test data
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # print(X_test)
            
            test_dataset = pd.DataFrame()
            test_dataset['review'] = X_test
            test_dataset['y'] = y_test

            # Create training data/test/validation
            testing_data, validation_data = index.prepare_train_test_data_kfold(test_dataset)
            
            # print(testing_data)
            train_x = X_train
            train_y = y_train
            validation_x = validation_data['review'].to_numpy()
            validation_y = validation_data['y'].to_numpy()
            test_x = testing_data['review'].to_numpy()
            test_y = testing_data['y'].to_numpy()

            train_loader, valid_loader, test_loader  = index.prepare_for_pytorch(train_x, train_y, validation_x, validation_y, test_x, test_y) 
            
            lstm = SentimentLSTM(len(vocab_dict), 1, 1000, 512, 3)
    
            lstm, criterion = train_evaluate_lstm(lstm, train_loader, valid_loader, max_length)
            acc = test_lstm_model(lstm, criterion, test_loader, max_length) 
            
            cv_accs.append(acc)
        
        mean_acc = np.mean(cv_accs)
        print("5 Fold Cross Validation | (Mean) Test Accuracy: " + str(mean_acc))
    lstm_net = iteration()

    
    
extraction = main()

Performing LSTM Analysis using 80/20 Train/Test split
Test loss: 0.77
Test accuracy: 0.52
Random Selection of 80/20 Train/Test split |  Test Accuracy: 0.5238095238095238


Performing LSTM Analysis using 5 Fold CV
Performing Fold
Test loss: 0.86
Test accuracy: 0.67
Performing Fold
Test loss: 3.25
Test accuracy: 0.45
Performing Fold
Test loss: 1.50
Test accuracy: 0.60
Performing Fold
Test loss: 0.03
Test accuracy: 0.80
Performing Fold
Test loss: 0.49
Test accuracy: 0.65
5 Fold Cross Validation | (Mean) Test Accuracy: 0.6333333333333333
