### Word embedding for wine reviews and sentiment analysis

In [76]:
import sys
    
sys.path.insert(0, "..")

In [77]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

import helper as hlp
import nltk
import re
from string import punctuation

from sklearn.ensemble import RandomForestRegressor as RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressor
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

from torch.utils.data import TensorDataset, DataLoader

from pandas.api.types import is_categorical_dtype
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [113]:
#  set(stopwords.words('english'))

In [80]:
# loading data
raw = pd.read_csv("./data/wines/wine_reviews.csv", low_memory = False);

# dropping unnecessary column
raw = raw.drop(columns = ["Unnamed: 0"], inplace = False)

In [81]:
def transform_text(raw, column, punctuation):
    ''' utility function for text transformation for machine to interpret '''
    
    # make dataframe copy
    data_copy = raw.copy()
    
    # transform each row to lowercase
    data_copy[column] = data_copy[column].str.lower()
    
    # filter out punctuation
    data_copy[column] = data_copy[column].str.replace('[^\w\s]', '')
        
    return data_copy

In [82]:
# transform each review to lowercase and remove punctuation
raw_data = transform_text(raw, "description", punctuation)

# transform non-numerical data to categorical
hlp.trans_categorical(raw_data, labels = ["description"])

# transform/normalize numerical data
features, targets = hlp.trans_numerical(raw_data, "points", suffle_data_frame = True)

# training and validation data
training_set, validation_set = hlp.split_data(features, targets, threshold = 1 / 8)

In [9]:
tfidf = TfidfVectorizer()

cv = CountVectorizer()

In [10]:
cv.fit(training_set[0]["description"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
train_tdm = tfidf.fit_transform(training_set[0]["description"])

valid_tdm = tfidf.transform(validation_set[0]["description"])

In [14]:
tfidf.transform(validation_set[0]["description"].iloc[:2])

<2x43343 sparse matrix of type '<class 'numpy.float64'>'
	with 53 stored elements in Compressed Sparse Row format>

Bayesian stuff

In [29]:
mnb = MultinomialNB()

In [None]:
mnb.fit()

Logistic Regression

In [15]:
lr = LogisticRegression()

lr.fit(train_tdm, training_set[1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
print(r2_score(validation_set[1], lr.predict(valid_tdm)))

0.5419663372456802


In [17]:
def validation_verbose(description, model, transformer, expected_score = None):
    
    # generate tf-idf-weighted document-term matrix
    description_tdm = transformer.transform([description])

    # predict score given description
    score = model.predict(description_tdm)[0]

    print(f"{description:100.100}...\
          \n\t output {score}, expected {expected_score}\n")

In [18]:
def validation(data, model, transformer, count = 5):
    
    for index in range(count):
        
        # current description
        description = data[0]["description"][index]
        
        # expected output
        score = validation_set[1][index]
        
        validation_verbose(description, model, transformer, expected_score = score)

In [19]:
# some input from validation set
validation(validation_set, lr, tfidf, count = 5)

sweet rosé from the vinho verde region this has some fresh raspberry and a light prickle to make it ...          
	 output 85, expected 82

aromas of ripe darkskinned fruit leather clove rose and a whiff of menthol set the tone the firmly s...          
	 output 90, expected 91

this cabernet sauvignonbased wine is a blend of all the bordeaux varieties including carmenere its s...          
	 output 86, expected 88

aromas of ripe blackberry brown spice and fragrant blue flower waft from the glass a blend of 50 neg...          
	 output 88, expected 89

cutting streaks of lime and lemon intensify the earthy wet riverrock notes in this complex dry riesl...          
	 output 92, expected 94



In [20]:
# some custom input

# average score
validation_verbose("Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?", lr, tfidf)

# good score
validation_verbose("Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.", lr, tfidf)

Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?                 ...          
	 output 82, expected None

Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.      ...          
	 output 93, expected None



### Word Embedding Algorithms:
    
1. Embedding Layer
2. Word2Vec
    1. CBOW
    2. C. Skip-Gram
3. GloVe

In [105]:
class TextTransformer():
    ''' utility for data transformations '''
    
    def __init__(self, raw, target):
        
        self.raw = raw
        self.target = target
        
        self.raw_copy = raw.copy()
    
    def tokenize(self):

        # tokenize, split a sentence by space
        self.raw_copy[self.target] = self.raw[self.target].str.split()
        
        # find maximum size of sequence of tokens
        self.sequence_max = max([ len(review) for review in self.raw_copy[self.target] ])
        
    def create_vocabulary(self):
        ''' create the vocabulary base from the known data '''
        
        # count token occurrences
        self.tokens = Counter([ token for review in self.raw_copy[self.target] for token in review ])
        
        # vocabulary_size
        self.vocabulary_size = len(self.tokens)
        
        # word to integer mapping
        self.word_to_int = { key : index for index, key in enumerate(self.tokens) }
        
        # integer to word mapping
        self.int_to_word = { index : word for word, index in self.word_to_int.items() }
        
    def apply_to_numerical(self):
        ''' apply numerical transformation '''
        
        # transform from textual to numerical representation
        self.raw_copy[self.target] = [ [ self.word_to_int[token] for token in review ] for review in self.raw_copy[self.target] ]
        
    def apply_to_textual(self):
        ''' apply textual transformation '''
        
        # transform from textual to numerical representation
        self.raw_copy[self.target] = [ [ self.word_to_int[token] for token in review ] for review in self.raw_copy[self.target] ]
        
    def apply_padding(self):
        ''' apply padding to numerical content '''
        
        # assert numerical representation of input data
        assert(all(isinstance(token, int) for review in self.raw_copy[self.target] for token in review))
         
        # transform by padding
        self.raw_copy[self.target] = [ review + [-1] * (len(review) - self.sequence_max) for review in self.raw_copy[self.target] ]

In [106]:
# apply transformer to our data
transformer = TextTransformer(raw_data, "description")

transformer.tokenize()
transformer.create_vocabulary()
transformer.apply_to_numerical()
transformer.apply_padding()

In [110]:
# training and validation data
features, targets = hlp.split_target(transformer.raw_copy, "description")

# training and validation data
training_dataset, validation_dataset = hlp.split_target(features, targets, threshold = 1 / 8)

AttributeError: module 'helper' has no attribute 'split_target'

#### Embedding Layer

In [6]:
class Model:
    
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, output_size, bidirectional = False):
        
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.bidirectional = bidirectional
        
        # sparse(embedding) layer
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        # recurrent neural network layer(lstm)
        self.rnn = nn.LSTM(self.embedding_dim, self.hidden_size, self.num_layers, bidirectional = self.bidirectional)
        
        # dropout layyer
        self.drp = nn.Dropout(0.15)
        
        # linear fully connected layer
        self.fc = nn.Linear(self.hidden_size, self.output_size)
    
    def forward(self, x):
        
        # embed words into dense representation
        x = self.embedding(x)
        
        # recurrent neural network; pass forward
        x = self.rnn(x)
        
        # fully connected layer; pass forward while casually dropping cells
        x = self.drp(self.fc(x))
        
        return F.sigmoid(x)

In [None]:
Model model(1024, 600, )

SyntaxError: invalid syntax (<ipython-input-1-2b730490c568>, line 1)