### Word embedding for wine reviews and sentiment analysis

In [None]:
import sys
    
sys.path.insert(0, "..")

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import helper as hlp
import nltk
import re

from abc import ABC, abstractmethod
from string import punctuation

from sklearn.ensemble import RandomForestRegressor as RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressor
from nltk.corpus import stopwords
from collections import Counter

from torch.utils.data import TensorDataset, DataLoader

from pandas.api.types import is_categorical_dtype
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# fetch the nltk resources this notebook relies on
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

In [None]:
#  set(stopwords.words('english'))

In [None]:
# read the wine reviews and discard the unnecessary "Unnamed: 0" column in one go
raw = pd.read_csv("./data/wines/wine_reviews.csv", low_memory = False).drop(columns = ["Unnamed: 0"])

In [None]:
def transform_text(data, column, punctuation):
    ''' utility function for text transformation for machine to interpret

    Lowercases every entry of `column` and removes each character that is
    neither a word character nor whitespace. The input frame is not modified.

    Parameters:
        data: pandas DataFrame holding the text column
        column: label of the text column to normalize
        punctuation: unused; kept so existing callers keep working
                     (the regular expression below decides what is removed)

    Returns:
        a copy of `data` whose `column` is lowercased and free of punctuation
    '''

    # make dataframe copy
    data_copy = data.copy()

    # lowercase first, then filter out punctuation;
    # regex = True must be explicit, pandas >= 2.0 treats the pattern as a literal otherwise
    data_copy[column] = (
        data_copy[column]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex = True)
    )

    return data_copy

In [None]:
# transform each review to lowercase and remove punctuation
raw_data = transform_text(raw, "description", punctuation)

# transform non-numerical data to categorical
# NOTE(review): the return value is discarded, so the helper presumably mutates raw_data in place
# and `labels` presumably names the columns to leave alone - confirm against helper.trans_categorical
hlp.trans_categorical(raw_data, labels = ["description"])

# transform/normalize numeric data
# NOTE(review): `suffle_data_frame` is the helper's own keyword spelling; presumably it shuffles the rows - confirm
raw_numeric_data = hlp.transform_to_numeric(raw_data, suffle_data_frame = True)

# split into features and targets, "points" being the target column
features, targets = hlp.split_target(raw_numeric_data, "points")

# training and validation data, each used below as a (features, targets) pair
# NOTE(review): threshold = 1 / 8 is presumably the validation share - confirm against helper.split_data
training_set, validation_set = hlp.split_data(features, targets, threshold = 1 / 8)

In [None]:
# tf-idf weighted and plain count vectorizers, both with default settings
tfidf, cv = TfidfVectorizer(), CountVectorizer()

In [None]:
# fit the count vectorizer on the training descriptions
# NOTE(review): `cv` is not used again in the visible part of the notebook
cv.fit(training_set[0]["description"])

In [None]:
# review texts of both splits
train_descriptions = training_set[0]["description"]
valid_descriptions = validation_set[0]["description"]

# fit tf-idf on the training reviews only, then reuse the fitted vectorizer for validation
train_tdm = tfidf.fit_transform(train_descriptions)
valid_tdm = tfidf.transform(valid_descriptions)

In [None]:
# preview: vectorize the first two validation descriptions (the result is only displayed)
tfidf.transform(validation_set[0]["description"].iloc[:2])

Multinomial Naive Bayes

In [None]:
# multinomial naive Bayes estimator with default settings
mnb = MultinomialNB()

In [None]:
# fit() needs the training matrix and targets; use the same pair as the logistic regression below
mnb.fit(train_tdm, training_set[1])

Logistic Regression

In [None]:
# logistic regression with default settings
lr = LogisticRegression()

# train on the tf-idf matrix of the training reviews
lr.fit(X = train_tdm, y = training_set[1])

In [None]:
# predictions for the held-out reviews
validation_predictions = lr.predict(valid_tdm)

# coefficient of determination against the expected scores
print(r2_score(validation_set[1], validation_predictions))

In [None]:
def validation_verbose(description, model, transformer, expected_score = None):
    ''' print a review (cut or padded to 100 characters) with its predicted and expected score '''

    # the transformer works on collections of documents, so wrap the single review in a list
    predictions = model.predict(transformer.transform([description]))

    # a single document went in, hence only the first prediction is of interest
    score = predictions[0]

    print(f"{description:100.100}...\
          \n\t output {score}, expected {expected_score}\n")

In [None]:
def _item_at(values, position):
    ''' return the element at `position`, positionally for pandas objects '''

    # pandas objects may carry shuffled labels, .iloc always addresses by position
    return values.iloc[position] if hasattr(values, "iloc") else values[position]

def validation(data, model, transformer, count = 5):
    ''' print predicted vs. expected score for the first `count` rows of `data`

    Parameters:
        data: (features, targets) pair; features must offer a "description" column
        model: fitted estimator exposing predict()
        transformer: fitted vectorizer exposing transform()
        count: number of rows to show, capped at the number of available rows
    '''

    descriptions, scores = data[0]["description"], data[1]

    # never run past the available rows
    for index in range(min(count, len(descriptions))):

        # current description
        description = _item_at(descriptions, index)

        # expected output, taken from the data passed in (not from a notebook global)
        score = _item_at(scores, index)

        validation_verbose(description, model, transformer, expected_score = score)

In [None]:
# show a handful of validation reviews next to the model output
validation(data = validation_set, model = lr, transformer = tfidf, count = 5)

In [None]:
# hand-written reviews: a negative one first, a positive one second
custom_reviews = (
    "Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?",
    "Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.",
)

# no expected score is known for these, only the model output is shown
for review in custom_reviews:
    validation_verbose(review, lr, tfidf)

### Word Embedding Algorithms:
    
1. Embedding Layer
2. Word2Vec
    1. CBOW
    2. Skip-Gram
3. GloVe

In [None]:
class Normalization():
    ''' helper class for data normalization and scaling

    normalize() standardizes data to zero mean and unit standard deviation and
    remembers both statistics; scale() uses them to map values back.
    '''

    def __init__(self, data = None):
        ''' data: optional reference to the raw data, stored as-is and not processed here '''

        # statistics are unknown until normalize() has been called
        self.std = None
        self.mean = None

        self.data = data

    def normalize(self, data):
        ''' standardize `data` and store its mean and standard deviation '''

        # compute disparity
        self.std = np.std(data)

        # compute mean
        self.mean = np.mean(data)

        return (data - self.mean) / self.std

    def scale(self, data):
        ''' map normalized `data` back using the statistics stored by normalize()

        Raises AssertionError when normalize() has not been called yet.
        '''

        assert self.std is not None and self.mean is not None, "call normalize() before scale()"

        return data * self.std + self.mean

In [None]:
# targets; scores for each product and features
features, targets = raw_data["description"], np.array(raw_data["points"])

# initiate instance, handing over the raw scores the constructor asks for
normalization = Normalization(targets)

# normalize scores
targets = normalization.normalize(targets)

In [None]:
class Chainer(ABC):
    ''' chainer class for chaining text transformations

    Every step receives the current data plus a shared `chain` object on which
    it may store state for later steps, and returns (transformed data, chain).
    '''

    @abstractmethod
    def process(self, data, chain = None):
        ''' chain method for data preprocessing '''
        pass

class Tokenize(Chainer):
    ''' splits every text into a list of whitespace separated tokens '''

    def process(self, data, chain):
        ''' chain method for data preprocessing

        `data` must offer the pandas `.str` accessor. Stores the token sequences
        in `chain.data` and the longest sequence length in `chain.sequence_max`
        (0 when there are no sequences at all).
        '''

        # tokenize, split a sentence by whitespace
        chain.data = data.str.split()

        # find maximum size of sequence of tokens; default keeps empty input from raising
        chain.sequence_max = max((len(sequence) for sequence in chain.data), default = 0)

        return (chain.data, chain)

class Vocabulary(Chainer):
    ''' builds the token <-> integer mappings from tokenized sequences '''

    def process(self, data, chain):
        ''' create the known vocabulary basis

        Stores `tokens`, `vocabulary_size`, `word_to_int` and `int_to_word`
        on `chain`; `data` itself is passed through unchanged.
        '''

        # count token occurrences
        chain.tokens = Counter(token for sequence in data for token in sequence)

        # vocabulary size, one extra slot for the padding index
        chain.vocabulary_size = len(chain.tokens) + 1

        # word to integer mapping, 0 is reserved for padding
        chain.word_to_int = { key : index for index, key in enumerate(chain.tokens, start = 1) }

        # integer to word mapping
        chain.int_to_word = { index : word for word, index in chain.word_to_int.items() }

        return (data, chain)

class NumericToToken(Chainer):
    ''' decodes integer sequences back into token sequences '''

    def process(self, data, chain):
        ''' apply textual transformation

        Requires `chain.int_to_word`. The padding index 0 has no word attached,
        so padded positions are dropped instead of raising a KeyError; any other
        unknown index still raises KeyError.
        '''

        assert hasattr(chain, 'int_to_word'), "run Vocabulary before NumericToToken"

        # transform from numerical to textual representation, leaving out padding
        chain.data = [
            [ chain.int_to_word[token] for token in sequence if token != 0 or token in chain.int_to_word ]
            for sequence in data
        ]

        return (chain.data, chain)

class TokenToNumeric(Chainer):
    ''' encodes token sequences as integer sequences '''

    def process(self, data, chain):
        ''' apply numerical transformation

        Requires `chain.word_to_int`; a token missing from it raises KeyError.
        '''

        assert hasattr(chain, 'word_to_int'), "run Vocabulary before TokenToNumeric"

        # transform from textual to numerical representation
        chain.data = [ [ chain.word_to_int[token] for token in sequence ] for sequence in data ]

        return (chain.data, chain)

class Filler(Chainer):
    ''' right-pads integer sequences with 0 up to `chain.sequence_max` '''

    def process(self, data, chain):
        ''' apply padding to numerical content

        Stores the padded lists in `chain.data` and returns them as a numpy array.
        '''

        # assert numerical representation of input data
        assert all(isinstance(token, int) for sequence in data for token in sequence), "Filler expects integer tokens"
        assert hasattr(chain, 'sequence_max'), "run Tokenize before Filler"

        # transform by padding
        chain.data = [ sequence + [0] * (chain.sequence_max - len(sequence)) for sequence in data ]

        return (np.array(chain.data), chain)

class Composer(Chainer):
    ''' runs a list of Chainer steps in order, serving as their shared `chain` object '''

    def __init__(self, transforms):
        ''' transforms: Chainer instances, applied in the given order '''

        self.transforms = transforms

    def process(self, data, chain = None):
        ''' feed `data` through every transform and return (result, self)

        `chain` is ignored; the composer passes itself to each step so that the
        state they store ends up as attributes of the composer.
        Raises TypeError (an Exception subclass) for an item that is not a Chainer.
        '''

        # initialize chainer data
        self.data = data

        # apply transformations in series
        for transform in self.transforms:

            # list item is not an instance of Chainer transformer
            if not isinstance(transform, Chainer):
                raise TypeError("Illegal parameter, provide contiguous set of Chainer(s)")

            # update existing data, pass only current class reference
            self.data, _ = transform.process(self.data, self)

        return (self.data, self)

In [None]:
# preprocessing steps in order: tokenize, build the vocabulary, encode as integers, pad
preprocessing = Composer([ Tokenize(), Vocabulary(), TokenToNumeric(), Filler() ])

# run the reviews through the pipeline; the composer is returned as the shared state holder
features, chainer = preprocessing.process(features)

In [None]:
# training and validation dataset, each used below as a (features, targets) pair
# NOTE(review): threshold = 1 / 8 is presumably the validation share - confirm against helper.split_data
train_dataset, valid_dataset = hlp.split_data(features, targets, threshold = 1 / 8)

In [None]:
def _as_tensor_dataset(split):
    ''' wrap a (features, targets) pair of numpy arrays into a TensorDataset '''

    # token indices have to be integers (long) for the embedding lookup
    inputs = torch.from_numpy(split[0]).long()
    outputs = torch.from_numpy(split[1])

    return TensorDataset(inputs, outputs)

# tensor datasets for both splits
train_tensor_dataset = _as_tensor_dataset(train_dataset)
valid_tensor_dataset = _as_tensor_dataset(valid_dataset)

# both loaders share the same settings
loader_options = dict(batch_size = 16, shuffle = True)

train_loader = DataLoader(train_tensor_dataset, **loader_options)
valid_loader = DataLoader(valid_tensor_dataset, **loader_options)

### Embedding Layer

In [None]:
class Model(nn.Module):
    ''' embedding -> LSTM -> linear (+ dropout) -> sigmoid

    Parameters:
        num_embeddings: size of the vocabulary, including the padding index
        embedding_dim: size of each embedding vector
        hidden_size: number of features of the LSTM hidden state
        num_layers: number of stacked LSTM layers
        output_size: number of values produced per time step
        bidirectional: run the LSTM in both directions
        batch_first: inputs are (batch, sequence) when True, (sequence, batch) otherwise;
                     True by default because that is what a DataLoader batch looks like
    '''

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, output_size, bidirectional = False, batch_first = True):
        super().__init__()

        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.bidirectional = bidirectional
        self.batch_first = batch_first

        # sparse(embedding) layer
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)

        # recurrent neural network layer(lstm); without batch_first the batch axis
        # would be treated as time and samples of one batch would leak into each other
        self.rnn = nn.LSTM(
            self.embedding_dim,
            self.hidden_size,
            self.num_layers,
            batch_first = self.batch_first,
            bidirectional = self.bidirectional
        )

        # a bidirectional lstm concatenates both directions, doubling its output features
        directions = 2 if self.bidirectional else 1

        # fully connected layer(linear) + dropout
        self.sec = nn.Sequential(
            nn.Linear(self.hidden_size * directions, self.output_size),
            nn.Dropout(0.15)
        )

    def forward(self, x):
        ''' map integer token indices to one sigmoid-activated output per time step

        For batch-first input of shape (batch, sequence) the result has shape
        (batch, sequence, output_size) with values in (0, 1).
        '''

        # embed words into dense representation
        x = self.embedding(x)

        # recurrent neural network; pass forward, the final hidden/cell states are not needed
        x, _ = self.rnn(x)

        # fully connected layer; pass forward while casually dropping cells
        x = self.sec(x)

        return torch.sigmoid(x)

In [None]:
# our model: vocabulary-sized embedding feeding a two-layer lstm with a single output value
model = Model(
    num_embeddings = chainer.vocabulary_size,
    embedding_dim = 500,
    hidden_size = 480,
    num_layers = 2,
    output_size = 1
)

### Defining hyperparameters

In [None]:
class Context():
    ''' bundles a model with its device, loss function, optimizer and optional scheduler '''

    def __init__(self, model, learning_rate):
        ''' model: nn.Module to be trained; learning_rate: step size handed to the optimizer '''

        # device to be used
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # model to be trained
        self.model = model.to(self.device)

        # learning rate
        self.learning_rate = learning_rate

        # loss function (mean squared error loss)
        self.criterion = nn.MSELoss(reduction = 'mean')

        # Adam optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.learning_rate)

        # no scheduler until create_scheduler() is called
        self.scheduler = None

    def create_scheduler(self, milestones = (4, 9), gamma = 0.72):
        ''' attach a MultiStepLR scheduler to the optimizer

        Parameters:
            milestones: epoch indices at which the learning rate is multiplied by `gamma`
            gamma: multiplicative decay factor
        '''

        # custom scheduler
        self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer, milestones = list(milestones), gamma = gamma)

In [None]:
# bundle the model with its device, loss function and optimizer
context = Context(model = model, learning_rate = 1e-4)

### Model testing

In [None]:
# singleton batch sample
feature_sample, target_sample = next(iter(train_loader))

# move to the device the model lives on
feature_sample, target_sample = feature_sample.to(context.device), target_sample.to(context.device)

# forward pass; call the module itself rather than .forward so registered hooks run
output = model(feature_sample)

# the model emits one value per time step, the targets hold one score per review;
# keep the last time step so both sides have one value per sample instead of being broadcast
prediction = output[:, -1, :].squeeze(-1)

# compute loss
loss = context.criterion(prediction, target_sample.float())

print(f"Loss is {loss}")