### Word embedding for wine reviews and sentiment analysis

In [4]:
import sys
    
# put the parent directory first on the module search path so that modules
# located there can be imported (presumably where `helper` lives; verify)
sys.path.insert(0, "..")

In [176]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import helper as hlp

from sklearn.ensemble import RandomForestRegressor as RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressor

from pandas.api.types import is_categorical_dtype
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# read the wine reviews and discard the leftover "Unnamed: 0" column
# (presumably the index column written when the CSV was saved)
raw = (
    pd.read_csv("./data/wine_reviews.csv", low_memory = False)
    .drop(columns = ["Unnamed: 0"])
)

In [8]:
# transform non-numerical data to categorical
# NOTE(review): the return value is discarded, so `raw` is presumably modified
# in place; `labels = ["description"]` presumably exempts the free-text column
# so it stays available for vectorization below. Confirm against `helper`.
hlp.trans_categorical(raw, labels = ["description"])

# transform/normalize numerical data and separate the "points" column as target
# NOTE(review): no seed is set at this call site, so the shuffle (and therefore
# the split below) may differ between runs. Confirm how `helper` shuffles.
features, targets = hlp.trans_numerical(raw, "points", suffle_data_frame = True)

# training and validation data; each set is used below as
# set[0] (features, including "description") and set[1] (target scores)
# NOTE(review): `threshold = 1 / 8` is presumably the validation fraction; confirm
training_set, validation_set = hlp.split_data(features, targets, threshold = 1 / 8)

In [10]:
# TF-IDF vectorizer with default settings; it is fitted on the training
# descriptions further down
tfidf = TfidfVectorizer()

In [17]:
# plain count vectorizer with default settings; used below only to inspect
# the size of the vocabulary
cv = CountVectorizer()

In [18]:
# learn the vocabulary from the training descriptions
cv.fit(training_set[0]["description"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [22]:
# number of distinct tokens learned from the training descriptions
len(cv.vocabulary_)

29871

In [89]:
# fit the TF-IDF vectorizer on the training descriptions only and build the
# training document-term matrix
train_tdm = tfidf.fit_transform(training_set[0]["description"])

# reuse the fitted vectorizer (no refitting) for the validation descriptions
valid_tdm = tfidf.transform(validation_set[0]["description"])

In [92]:
# sanity check: vectorize only the first two validation descriptions
tfidf.transform(validation_set[0]["description"].iloc[:2])

<2x29871 sparse matrix of type '<class 'numpy.float64'>'
	with 60 stored elements in Compressed Sparse Row format>

Multinomial naive Bayes

In [29]:
# multinomial naive Bayes classifier with default hyperparameters
mnb = MultinomialNB()

In [None]:
# train on the TF-IDF document-term matrix of the training descriptions with
# the review scores as class labels (the same inputs the logistic regression
# uses); calling `fit()` without arguments raises a TypeError
mnb.fit(train_tdm, training_set[1])

Logistic Regression

In [53]:
# logistic regression classifier with default settings, trained on the TF-IDF
# features; the scores in training_set[1] serve as class labels
lr = LogisticRegression()

lr.fit(train_tdm, training_set[1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [180]:
# R² of the predicted labels against the true validation scores, i.e. the
# classifier output is scored here as if it were a regression
print(r2_score(validation_set[1], lr.predict(valid_tdm)))

0.5484543964005837


In [187]:
def validation_verbose(description, model, transformer, expected_score = None):
    """Print the model's predicted score for a single review text.

    The text is vectorized with ``transformer.transform`` (as a one-element
    batch), passed to ``model.predict``, and the first prediction is printed
    next to ``expected_score``.

    Parameters
    ----------
    description : str
        Review text; padded or truncated to 100 characters in the printout.
    model
        Fitted estimator exposing ``predict``.
    transformer
        Fitted vectorizer exposing ``transform``.
    expected_score : optional
        Reference score shown for comparison; ``None`` if unknown.

    Returns
    -------
    None
        The result is only printed.
    """
    # a single document still has to be handed over as a batch of one
    batch = transformer.transform([description])
    predicted = model.predict(batch)[0]

    # first line: fixed-width excerpt of the text; second line: the scores
    excerpt = f"{description:100.100}..." + " " * 10
    scores = f"\t output {predicted}, expected {expected_score}"
    print(f"{excerpt}\n{scores}\n")

In [188]:
def validation(data, model, transformer, count = 5):
    """Print predicted vs. expected scores for `count` entries of `data`.

    Parameters
    ----------
    data : sequence
        Pair ``(features, targets)``: ``data[0]["description"]`` holds the
        review texts and ``data[1]`` the matching expected scores.
    model
        Fitted estimator, forwarded to ``validation_verbose``.
    transformer
        Fitted vectorizer, forwarded to ``validation_verbose``.
    count : int, optional
        Number of entries to show (keys ``0 .. count - 1``).
    """
    descriptions, expected_scores = data[0]["description"], data[1]

    for index in range(count):
        # the expected score is taken from `data` itself rather than from the
        # global `validation_set`, so the function works for any data set
        # NOTE(review): `[index]` on a pandas Series with an integer index is a
        # label lookup, not a positional one
        validation_verbose(
            descriptions[index],
            model,
            transformer,
            expected_score = expected_scores[index],
        )

In [189]:
# predict five descriptions from the validation set (entries 0-4) and print
# each prediction next to the true score
validation(validation_set, lr, tfidf, count = 5)

Less effusive on the nose than the Mort's Block, this boasts notes of tangerine zest and lemon. This...          
	 output 91, expected 92

88-90 Barrel sample. Caramel new wood aromas are followed by spice and toast flavors. The fruit seem...          
	 output 89, expected 89

Somewhat smoky and earthy on the nose, but also with some harshness that could be chalked up to yout...          
	 output 87, expected 84

A product of the cool 2010 vintage, this has a green, minty flavor, with notes of lemon and lime. It...          
	 output 87, expected 89

Blended with a little Cabernet Sauvignon, this Petit Verdot succeeds wildly in appealing to the sens...          
	 output 87, expected 92



In [203]:
# some custom input (no expected score is passed, so "expected None" is printed)

# clearly negative review: a low score is expected
validation_verbose("Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?", lr, tfidf)

# clearly positive review: a high score is expected
validation_verbose("Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.", lr, tfidf)

Pretty bad, can't handle the taste, extremely sour, how can someone make such wine?                 ...          
	 output 82, expected None

Amazing, fine vintage, delicious, rich texture that sobbing for more takes, just pure quality.      ...          
	 output 92, expected None

