### Word embedding for wine reviews and sentiment analysis

In [0]:
# development method either "local" or "remote"
# "remote" makes the path cell below mount Google Drive through google.colab
development = "remote"

In [2]:
# project root used for local runs
root_path = "."

if development == "remote":

    # imported here so that local runs never touch the Colab package
    from google.colab import drive

    # make Google Drive available under /content/drive
    drive.mount('/content/drive')

    # project root inside the mounted drive
    root_path = '/content/drive/My Drive/word_embedding'

# directory one level above the project root, added to sys.path further down
module_path = f"{root_path}/.."

# directory the model checkpoints are written to and read from
model_path = f"{root_path}/model"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import sys

# put module_path first on the import search path
# (presumably where the `helper` module imported below lives — confirm)
sys.path.insert(0, module_path)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import helper as hlp
import nltk
import re

from abc import ABC, abstractmethod
from string import punctuation

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.ensemble import RandomForestRegressor as RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressor
from nltk.corpus import stopwords
from collections import Counter

from torch.utils.data import TensorDataset, DataLoader

from pandas.api.types import is_categorical_dtype
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# fetch the NLTK 'punkt' and 'stopwords' packages
# (the English stop-word list is used by the TF-IDF vectorizer below)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# read the wine reviews and drop the "Unnamed: 0" column, which is not needed
# (presumably a leftover index column from an earlier export — confirm)
raw = (
    pd.read_csv(f"{root_path}/data/wines/wine_reviews.csv", low_memory = False)
    .drop(columns = ["Unnamed: 0"])
)

In [0]:
def transform_text(data, column, punctuation):
    ''' utility function for text transformation for the machine to interpret

    Returns a copy of `data` whose `column` is lower-cased and stripped of every
    character that is neither a word character nor whitespace.

    Parameters:
        data: pandas DataFrame holding the text column; it is not modified
        column: name of the string column to transform
        punctuation: unused, the characters to remove are fixed by the regular
            expression below; kept so existing callers keep working

    Returns:
        the transformed copy of `data`
    '''

    # make dataframe copy
    data_copy = data.copy()

    # lowercase, then filter out punctuation; regex = True is spelled out because
    # pandas >= 2.0 treats the pattern as a literal string by default, which would
    # silently remove nothing
    data_copy[column] = (
        data_copy[column]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex = True)
    )

    return data_copy

In [0]:
# lower-case every review and strip its punctuation; `raw` itself is left untouched
raw_data = transform_text(data = raw, column = "description", punctuation = punctuation)

In [0]:
# transform non-numerical data to categorical
# NOTE(review): the return value is discarded, so this presumably converts raw_data in place;
# the exact role of `labels` is defined in helper.py — confirm
hlp.trans_categorical(raw_data, labels = ["description"])

# transform/normalize numeric data
# NOTE(review): `suffle_data_frame` is the keyword exactly as this call spells it; presumably it
# shuffles the rows — check helper.py before "correcting" the spelling
raw_numeric_data = hlp.transform_to_numeric(raw_data, suffle_data_frame = True)

# training and validation set
# NOTE(review): threshold = 1 / 8 is presumably the validation share — confirm in helper.py
train_set, valid_set = hlp.split_data(raw_numeric_data, threshold = 1 / 8)

# targets; scores for each product and features
# the cells below read index 1 of each result as the descriptions and index 2 as the points
train_dataset, valid_dataset = hlp.split_features(train_set, valid_set, columns = ["description", "points"])

In [0]:
# TF-IDF vectorizer that ignores NLTK's English stop words
transformer = TfidfVectorizer(stop_words = stopwords.words('english'))

In [0]:
# learning vocabulary and get the term-document matrix
# (the vectorizer is fitted on the training descriptions only)
train_term_document = transformer.fit_transform(train_dataset[1])

# based on learned vocabulary transform validation set
valid_term_document = transformer.transform(valid_dataset[1])

### Logistic Regression

In [0]:
# create our model
# NOTE(review): LogisticRegression is a classifier, so every distinct target value is handled as a class;
# the name `model` is rebound to the neural network in the "Embedding Layer" part further down
model = LogisticRegression()

# fit data
# (TF-IDF matrix of the training descriptions against train_dataset[2], presumably the points — confirm)
model.fit(train_term_document, train_dataset[2])

In [0]:
# R² score of the predictions for the validation descriptions against valid_dataset[2]
print(r2_score(valid_dataset[2], model.predict(valid_term_document)))

In [0]:
def validation_verbose(description, model, transformer, expected_score = None):
    ''' print the score predicted for a single description next to the expected one

    description: review text to score
    model: fitted estimator, only its `predict` method is called
    transformer: fitted vectorizer, only its `transform` method is called
    expected_score: reference value, printed as given (None when unknown)
    '''

    # vectorize the single description and keep the only prediction
    predicted = model.predict(transformer.transform([description]))[0]

    # the description is padded / cut to exactly 100 characters before printing
    print(f"{description:100.100}..."
          f"          \n\t output {predicted}, expected {expected_score}\n")

In [0]:
def validation(data, model, transformer, count = 5):
    ''' print the predictions for the first `count` entries of `data`

    Parameters:
        data: indexable pair; data[0]["description"] holds the review texts and
            data[1] the expected scores, both looked up with the same index
        model: fitted estimator handed on to validation_verbose
        transformer: fitted vectorizer handed on to validation_verbose
        count: number of entries to print
    '''

    for index in range(count):

        # current description
        # NOTE(review): on pandas objects `[index]` is a label lookup, so this assumes the
        # labels 0 .. count - 1 exist — confirm for shuffled data
        description = data[0]["description"][index]

        # expected output, read from the argument instead of a notebook global so that the
        # description and its score always come from the same dataset
        score = data[1][index]

        validation_verbose(description, model, transformer, expected_score = score)

In [0]:
# some input from validation set
# NOTE(review): `validation_set`, `lr` and `tfidf` are not assigned in any cell shown in this notebook
# (the fitted objects above are named `model` and `transformer`) — define them before running this cell
validation(validation_set, lr, tfidf, count = 5)

In [0]:
# some custom input
# `model` / `transformer` are the logistic regression and the TF-IDF vectorizer fitted above;
# the previously used names `lr` / `tfidf` are not defined anywhere in this notebook

# average score
validation_verbose("Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?", model, transformer)

# good score
validation_verbose("Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.", model, transformer)

### Word Embedding Algorithms:
    
1. Embedding Layer
2. Word2Vec
    1. CBOW
    2. Skip-Gram
3. GloVe

In [0]:
# batch size
batch_size = 16

# training and validation set
# NOTE(review): batch_trim presumably trims each split to a multiple of the batch size — confirm in helper.py
train_set, valid_set = hlp.split_data(raw_data, threshold = 1 / 8, batch_trim = batch_size)

# targets; scores for each product and features
# (this rebinds train_dataset / valid_dataset, which the TF-IDF cells above also used)
train_dataset, valid_dataset = hlp.split_features(train_set, valid_set, columns = ["description", "points"])

In [0]:
class Chainer(ABC):
    ''' base class of every step in a chain of text transformations '''

    @abstractmethod
    def process(self, data, chronicle):
        ''' transform data while learning from it; every concrete step implements this '''

    def apply(self, data, chronicle):
        ''' transform data with what is already known; delegates to process unless overridden '''

        outcome = self.process(data, chronicle)

        return outcome

    
class Chronicle():
    ''' record of the values the transformation steps learned from the data '''

    # attributes handled by the constructor and carried over by copy
    _fields = ("sequence_max", "word_to_int", "int_to_word", "vocabulary_size", "mean", "std")

    def __init__(self, sequence_max = None, word_to_int = None, int_to_word = None, vocabulary_size = None,\
                mean = None, std = None):
        ''' every value is optional and stays None until a step fills it in '''

        # token related values
        self.sequence_max, self.vocabulary_size = sequence_max, vocabulary_size
        self.word_to_int, self.int_to_word = word_to_int, int_to_word

        # normalization statistics
        self.mean, self.std = mean, std

    def copy(self):
        ''' new Chronicle carrying the same six values; the mappings are shared, not duplicated '''

        return Chronicle(**{ name : getattr(self, name) for name in self._fields })
    
    
class Tokenize(Chainer):
    ''' splits every text into a list of whitespace separated tokens '''

    def process(self, data, chronicle):
        ''' tokenize the texts and record the length of the longest token list

        data is used through its `.str` accessor (a pandas Series in this notebook);
        returns the tokenized data together with the updated chronicle
        '''

        sequences = data.str.split()

        # number of tokens per text, the largest count is kept in the chronicle
        lengths = [ len(sequence) for sequence in sequences ]
        chronicle.sequence_max = max(lengths)

        return (sequences, chronicle)

    
class Vocabulary(Chainer):
    ''' learns the vocabulary of tokenized texts '''

    def process(self, data, chronicle):
        ''' build the token counts and both word <-> integer mappings on the chronicle '''

        # how often every token occurs over all sequences
        frequencies = Counter()
        for sequence in data:
            frequencies.update(sequence)
        chronicle.tokens = frequencies

        # one slot more than there are tokens, index 0 is reserved for padding
        chronicle.vocabulary_size = 1 + len(frequencies)

        # tokens are numbered from 1 in order of first appearance
        chronicle.word_to_int = { word : number for number, word in enumerate(frequencies, start = 1) }

        # the same numbering in the opposite direction
        chronicle.int_to_word = dict(enumerate(frequencies, start = 1))

        return (data, chronicle)

    def apply(self, data, chronicle):
        ''' an already learned vocabulary is left as it is; data passes through unchanged '''

        return (data, chronicle)

    
class NumericToToken(Chainer):
    ''' maps sequences of integers back to their words '''

    def process(self, data, chronicle):
        ''' replace every known integer by its word using chronicle.int_to_word '''

        assert(hasattr(chronicle, 'int_to_word'))

        lookup = chronicle.int_to_word

        # integers without an entry in the mapping are left out of the result
        words = []
        for sequence in data:
            words.append([ lookup[token] for token in sequence if token in lookup ])

        return (words, chronicle)
    
    
class TokenToNumeric(Chainer):
    ''' maps sequences of words to sequences of integers '''

    def process(self, data, chronicle):
        ''' replace every token by its integer from chronicle.word_to_int '''

        assert(hasattr(chronicle, 'word_to_int'))

        mapping = chronicle.word_to_int

        # tokens missing from the mapping are encoded as 0
        encoded = []
        for sequence in data:
            encoded.append([ mapping[token] if token in mapping else 0 for token in sequence ])

        return (encoded, chronicle)
    

class Filler(Chainer):
    ''' right-pads integer sequences with zeros to a common length '''

    def process(self, data, chronicle):
        ''' pad every sequence to chronicle.sequence_max and record the last real position

        returns a "longlong" array with one row per sequence; chronicle.sizes receives
        len(sequence) - 1 for every sequence
        '''

        # only integer tokens can be padded
        only_integers = all(isinstance(token, int) for sequence in data for token in sequence)
        assert(only_integers)
        assert(hasattr(chronicle, 'sequence_max'))

        width = chronicle.sequence_max

        # index of the last token of every sequence before padding
        chronicle.sizes = np.array([ len(sequence) - 1 for sequence in data ], dtype = "longlong")

        # append as many zeros as are missing to reach the common width
        padded = [ sequence + [0] * (width - len(sequence)) for sequence in data ]

        return (np.array(padded, dtype = "longlong"), chronicle)

    
class Normalization(Chainer):
    ''' standardizes numeric data: subtract the mean, divide by the standard deviation '''

    def process(self, data, chronicle):
        ''' learn mean and standard deviation from data, store them and normalize data

        returns the normalized float32 array together with the updated chronicle
        '''

        # compute dispersion
        chronicle.std = np.std(data)

        # compute mean
        chronicle.mean = np.mean(data)

        return (self._normalize(data, chronicle), chronicle)

    def apply(self, data, chronicle):
        ''' normalize data with the statistics already stored in the chronicle

        The stored mean / std are used as they are. Delegating to process here would
        recompute them from `data`, so new data would no longer be put on the same
        scale as the data the statistics were learned from.
        '''

        assert(chronicle.std is not None and chronicle.mean is not None)

        return (self._normalize(data, chronicle), chronicle)

    def scale(self, data, chronicle):
        ''' undo the normalization using the mean / std stored in the chronicle '''

        data = np.array(data * chronicle.std + chronicle.mean, dtype = "float32")

        return (data, chronicle)

    @staticmethod
    def _normalize(data, chronicle):
        ''' (data - mean) / std as a float32 array '''

        return np.array((data - chronicle.mean) / chronicle.std, dtype = "float32")

    
class Composer(Chainer):
    ''' runs a list of Chainer steps one after another on the same data '''

    def __init__(self, transforms):
        ''' keep the steps, reject anything that is not a Chainer, start an empty chronicle '''

        # current transformations
        self.transforms = transforms

        # every item has to be a Chainer step
        if not all(isinstance(transform, Chainer) for transform in self.transforms):
            raise Exception("Illegal parameter, provide contiguous set of Chainer(s)")

        # chronicle filled by process and copied by apply
        self.chronicle = Chronicle()

    def _chain(self, raw, chronicle, step):
        ''' call the method named `step` of every transformation in series on a copy of raw '''

        data = raw.copy()

        for transform in self.transforms:
            data, _ = getattr(transform, step)(data, chronicle)

        return (data, chronicle)

    def process(self, raw):
        ''' run every step's process; what is learned goes into the composer's own chronicle '''

        return self._chain(raw, self.chronicle, "process")

    def apply(self, raw):
        ''' run every step's apply on a copy of the chronicle, which is returned with the data '''

        return self._chain(raw, self.chronicle.copy(), "apply")

### Apply transformations to features

In [0]:
# create composer for features
# steps run in this order: split into tokens, learn the vocabulary, map tokens to integers, pad with zeros
feature_composer = Composer([
    Tokenize(),
    Vocabulary(),
    TokenToNumeric(),
    Filler()
])

# apply transformations and learn the vocabulary of our train dataset
features_train, features_train_chronicle = feature_composer.process(train_dataset[1])

# apply transformations given existing learned vocabulary
# (apply works on a copy of the chronicle, which is returned as features_valid_chronicle)
features_valid, features_valid_chronicle = feature_composer.apply(valid_dataset[1])

### Apply transformations to targets

In [0]:
# create composer for targets
target_composer = Composer([
    Normalization()
])

# apply transformations and learn the characteristics 
# (mean and standard deviation of the training targets end up in targets_train_chronicle)
targets_train, targets_train_chronicle = target_composer.process(train_dataset[2].values)

# apply transformations given existing learned characteristics
targets_valid, _ = target_composer.apply(valid_dataset[2].values)

In [0]:
# tensor datasets; every sample is (padded token ids, normalized target, index of the last token)
train_tensor_dataset = TensorDataset(
    torch.from_numpy(features_train),
    torch.from_numpy(targets_train),
    torch.from_numpy(features_train_chronicle.sizes)
)
valid_tensor_dataset = TensorDataset(
    torch.from_numpy(features_valid),
    torch.from_numpy(targets_valid),
    torch.from_numpy(features_valid_chronicle.sizes)
)

# shuffled loaders over both datasets
train_loader = DataLoader(dataset = train_tensor_dataset, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(dataset = valid_tensor_dataset, batch_size = batch_size, shuffle = True)

### Embedding Layer

In [0]:
class Model(nn.Module):
    ''' embedding layer -> LSTM -> two linear layers, one prediction per sequence

    Parameters:
        num_embeddings: number of rows of the embedding table
        embedding_dim: size of every embedding vector
        hidden_size: size of the LSTM hidden state
        num_layers: number of stacked LSTM layers
        output_size: number of features produced by the last linear layer
        bidirectional: whether the LSTM reads the sequence in both directions
    '''
    
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, output_size, bidirectional = False):
        super(Model, self).__init__()
        
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.bidirectional = bidirectional
        
        # sparse(embedding) layer
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        # recurrent neural network layer(lstm)
        self.rnn = nn.LSTM(self.embedding_dim, self.hidden_size, self.num_layers, 
                               bidirectional = self.bidirectional, batch_first = True)
        
        # a bidirectional LSTM concatenates both directions, which doubles its output width;
        # with bidirectional = False this is hidden_size, so existing checkpoints still load
        rnn_features = self.hidden_size * (2 if self.bidirectional else 1)
        
        # fully connected layer(linear) + dropout
        self.sec1 = nn.Sequential(
            nn.Linear(rnn_features, 128),
            nn.Dropout(0.15)
        )
        
        # fully connected layer(linear) + dropout
        # NOTE(review): in training mode this dropout also zeroes the final prediction — confirm intended
        self.sec2 = nn.Sequential(
            nn.Linear(128, self.output_size),
            nn.Dropout(0.15)
        )
    
    def forward(self, x, indices):
        ''' predict one value per sequence

        x: integer tensor of token ids, one row per sequence (batch first)
        indices: per sequence, the position whose output is returned
        returns: tensor with one entry per sequence; the flattening below assumes
            output_size == 1
        '''

        # embed words into dense representation
        x = self.embedding(x)
        
        # recurrent neural network; pass forward
        x, _ = self.rnn(x)
        
        # fully connected layers; dropout is only active in training mode
        x = self.sec1(x)
        x = self.sec2(x)
        
        # flatten to (batch, rest) using the batch size of the input itself, not a notebook
        # global, so batches of any size work and the model is usable outside the notebook
        batch = x.size(0)
        x = x.contiguous().view(batch, -1)
        
        # pick, for every row, the prediction at the requested position
        rows = torch.arange(batch, device = x.device)
        columns = torch.as_tensor(indices, device = x.device)
        
        return x[rows, columns]
    
    def save_model(self, checkpoint):
        ''' save model to dictionary '''
        
        assert(isinstance(checkpoint, dict))
        
        # constructor arguments, needed to rebuild the model in load_model
        checkpoint["num_embeddings"] = self.num_embeddings
        checkpoint["embedding_dim"] = self.embedding_dim
        checkpoint["hidden_size"] = self.hidden_size
        checkpoint["num_layers"] = self.num_layers
        checkpoint["output_size"] = self.output_size
        checkpoint["bidirectional"] = self.bidirectional
        
        # save model internal weights
        checkpoint["state_dict"] = self.state_dict()
        
        return checkpoint
        
    @staticmethod
    def load_model(checkpoint):
        ''' load model from dictionary '''
        
        assert(isinstance(checkpoint, dict))
        
        # create an instance of Model
        model = Model(
            checkpoint["num_embeddings"],
            checkpoint["embedding_dim"],
            checkpoint["hidden_size"],
            checkpoint["num_layers"],
            checkpoint["output_size"],
            checkpoint["bidirectional"]
        )
        
        # load weights
        model.load_state_dict(checkpoint["state_dict"])
        
        return model

### Defining hyperparameters

In [0]:
class Context():
    ''' bundles a Model with its device, loss function, optimizer and best validation loss '''
    
    def __init__(self, model, learning_rate, verbose = True):
        ''' move the model to the available device and create loss function and optimizer

        model: Model instance to train
        learning_rate: learning rate handed to the Adam optimizer
        verbose: print the device the model is moved to
        '''
        
        assert(isinstance(model, Model))
        
        # device to be used
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        if verbose:
            
            print(f"Model is moved to {self.device}")
        
        # model to be trained
        self.model = model.to(self.device)
        
        # learning rate
        self.learning_rate = learning_rate

        # loss function (mean squared error loss)
        self.criterion = nn.MSELoss(reduction = 'mean')

        # Adam optimizer over the model parameters
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.learning_rate)
        
        # current validation min
        self.valid_loss_min = np.inf
    
    def create_scheduler(self):
        ''' attach a scheduler that multiplies the learning rate by 0.72 at milestones 4 and 9 '''
        
        # custom scheduler
        self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer, milestones = [4, 9], gamma = 0.72)
        
    def save_context(self, name):
        ''' write optimizer state, learning rate, best validation loss and model to model_path/name '''
        
        # current checkpoint to be saved
        checkpoint = {
            "state_optim" : self.optimizer.state_dict(),
            "learning_rate" : self.learning_rate,
            "valid_loss_min" : self.valid_loss_min
        }
        
        # save model params to dict
        checkpoint = self.model.save_model(checkpoint)
        
        # save checkpoint to disk
        torch.save(checkpoint, f"{model_path}/{name}")
        
    @staticmethod
    def load_context(name, context = None):
        ''' rebuild a Context from the checkpoint stored at model_path/name

        name: file name of the checkpoint inside model_path
        context: unused, a new Context is always created; kept for existing callers
        '''
        
        # map the stored tensors onto the device available here, so a checkpoint written on a
        # GPU machine can also be opened on a CPU-only one
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # load latest checkpoint
        checkpoint = torch.load(f"{model_path}/{name}", map_location = device)
        
        # model loading
        model = Model.load_model(checkpoint)
        
        # recreate context
        context = Context(model, checkpoint["learning_rate"])
        
        # update current validation loss
        context.valid_loss_min = checkpoint["valid_loss_min"]
        
        # optimizer loading e.g state momentum
        context.optimizer.load_state_dict(checkpoint["state_optim"])
        
        return context

In [22]:
# our model: 512-dimensional embeddings, a 2-layer LSTM with 256 hidden units, one output value
model = Model(
    num_embeddings = features_train_chronicle.vocabulary_size,
    embedding_dim = 512,
    hidden_size = 256,
    num_layers = 2,
    output_size = 1
)

# context holding the model together with its device, loss function and optimizer
context = Context(model = model, learning_rate = 0.0001)

Model is moved to cuda


### Model testing

In [23]:
# single batch sample
feature_sample, target_sample, indices = next(iter(train_loader))

# move features and targets to the device the model lives on (indices are passed on as loaded)
feature_sample, target_sample = feature_sample.to(context.device), target_sample.to(context.device)

# forward pass
output = model.forward(feature_sample, indices)

# compute loss
loss = context.criterion(output, target_sample)

print(f"Loss is {loss.item()}")

Loss is 0.897085964679718


In [0]:
def valid_model(context, epoch, train_acc_loss):
    ''' evaluate the model on valid_loader, report both losses and checkpoint on improvement

    context: Context with the model, device, loss function and best validation loss so far
    epoch: zero-based epoch index, printed as epoch + 1
    train_acc_loss: averaged training loss printed next to the validation loss
    '''

    network = context.model

    # evaluation mode
    network.eval()

    # loss of every validation batch
    batch_losses = []

    with torch.no_grad():

        for features, targets, indices in valid_loader:

            # move to corresponding available device
            features = features.to(context.device)
            targets = targets.to(context.device)

            prediction = network.forward(features, indices)

            batch_losses.append(context.criterion(prediction, targets).item())

    # average over the number of batches
    valid_acc_loss = sum(batch_losses) / len(valid_loader)

    print(f'Current Epoch: {epoch + 1} \tTraining Loss: {train_acc_loss:.6f} \tValidation Loss: {valid_acc_loss:.6f}')

    # nothing to save unless the loss is at least as good as the best one so far
    if not (valid_acc_loss <= context.valid_loss_min):
        return

    print(f'Validation loss decreased ({context.valid_loss_min:.6f} --> {valid_acc_loss:.6f}).  Saving model ...')

    # remember the new best loss and store the model that produced it
    context.valid_loss_min = valid_acc_loss
    context.save_context('model.pt')
        

def train_model(context, epochs = 5, show_every_step = len(train_loader)):
    ''' model training

    Runs `epochs` passes over the notebook-level train_loader and, every
    `show_every_step` optimizer steps, hands the averaged training loss to valid_model.

    context: Context providing model, device, loss function and optimizer
    epochs: number of passes over train_loader
    show_every_step: number of steps between two validation runs; the default is
        len(train_loader) as it was when this function was defined
    '''
    
    # the model to be trained
    model = context.model
    
    # train loss and step counter
    # (both are kept across epoch boundaries and only reset after a validation run)
    train_acc_loss, step_counter = 0, 0
    
    # training mode
    model.train()
    
    for epoch in range(epochs):
        
        for features, targets, indices in train_loader:
            
            # removing accumulated gradients
            context.optimizer.zero_grad()
            
            # move to corresponding available device
            features, targets = features.to(context.device), targets.to(context.device)
            
            # forward pass
            output = model.forward(features, indices)

            # calculate loss
            loss = context.criterion(output, targets)
            
            # calculate gradients
            loss.backward()
            
            # adjusting weights
            context.optimizer.step()
            
            # update loss acc
            train_acc_loss += loss.item()
            
            # update step counter
            step_counter += 1
            
            if(step_counter % show_every_step == 0):
                
                # average loss 
                train_acc_loss = train_acc_loss / show_every_step

                # print training / validation statistics 
                # (valid_model also saves a checkpoint when the validation loss did not get worse)
                valid_model(context, epoch, train_acc_loss)
                
                # reset train and step accumulator
                train_acc_loss, step_counter = 0, 0
                
                # training mode
                # (valid_model switched the model to evaluation mode)
                model.train()

In [31]:
# training model
train_model(context, 8, show_every_step = 2000)

Current Epoch: 1 	Training Loss: 0.426804 	Validation Loss: 0.339788
Validation loss decreased (0.378453 --> 0.339788).  Saving model ...
Current Epoch: 1 	Training Loss: 0.423702 	Validation Loss: 0.333102
Validation loss decreased (0.339788 --> 0.333102).  Saving model ...
Current Epoch: 1 	Training Loss: 0.407590 	Validation Loss: 0.330842
Validation loss decreased (0.333102 --> 0.330842).  Saving model ...
Current Epoch: 2 	Training Loss: 0.400197 	Validation Loss: 0.345221
Current Epoch: 2 	Training Loss: 0.372646 	Validation Loss: 0.314250
Validation loss decreased (0.330842 --> 0.314250).  Saving model ...
Current Epoch: 2 	Training Loss: 0.371759 	Validation Loss: 0.301952
Validation loss decreased (0.314250 --> 0.301952).  Saving model ...
Current Epoch: 2 	Training Loss: 0.366730 	Validation Loss: 0.306131
Current Epoch: 3 	Training Loss: 0.345008 	Validation Loss: 0.327538
Current Epoch: 3 	Training Loss: 0.344156 	Validation Loss: 0.299589
Validation loss decreased (0.30195

### Testing model

In [0]:
def pred_model(context, loader = None, chronicle = None):
    ''' print predicted and actual scores for one batch

    context: Context holding the trained model and its device
    loader: DataLoader yielding (features, targets, indices) batches; defaults to the
        notebook's train_loader, the loader this function was originally hard-wired to
    chronicle: chronicle holding the mean / std used to undo the target normalization;
        defaults to targets_train_chronicle
    '''

    # fall back to the notebook globals when nothing else is requested
    if loader is None:
        loader = train_loader
    if chronicle is None:
        chronicle = targets_train_chronicle

    # the first step of the target composer is the one that can undo the normalization
    normalization = target_composer.transforms[0]

    # evaluation mode
    context.model.eval()

    with torch.no_grad():

        # one batch of the chosen loader
        features, targets, indices = next(iter(loader))

        # move to available device
        features, targets = features.to(context.device), targets.to(context.device)

        # making prediction
        outputs = context.model.forward(features, indices).cpu()

        # scale the outputs
        outputs, _ = normalization.scale(outputs, chronicle)

        # scale the targets
        targets, _ = normalization.scale(targets.cpu(), chronicle)

        for i in range(len(features)):

            # int() cuts off the fractional part of both values
            print(f"Predicted score: {int(outputs[i])}, actual score {int(targets[i])}")

In [41]:
# making predictions
# (prints predicted and actual scores for one batch drawn from train_loader)
pred_model(context)

Predicted score: 89, actual score 89
Predicted score: 88, actual score 88
Predicted score: 89, actual score 91
Predicted score: 91, actual score 92
Predicted score: 90, actual score 90
Predicted score: 89, actual score 90
Predicted score: 87, actual score 87
Predicted score: 83, actual score 83
Predicted score: 85, actual score 86
Predicted score: 87, actual score 89
Predicted score: 87, actual score 88
Predicted score: 87, actual score 86
Predicted score: 87, actual score 88
Predicted score: 88, actual score 88
Predicted score: 90, actual score 92
Predicted score: 87, actual score 87
