# Whiskey Review

In [22]:
from datetime import datetime as dt
dt.utcnow()

datetime.datetime(2019, 7, 31, 1, 8, 54, 437674)

In [23]:
# general libraries
import pandas as pd
import os
import numpy as np

# machine learning specific libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

# model specific libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC 


In [24]:
# read data
rawData = pd.read_csv(os.path.join("clean-data","Whiskey_data","Whiskey_Advocate_All_scraped_KHupdate-with-description.csv"), encoding='iso-8859-1' )
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [25]:
# clean data
rawData.dropna(how = 'all',inplace=True)
rawData.dropna(subset=['description', 'price'], inplace = True)

In [26]:
# head data
rawData.head(2)

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,"Magnificently powerful and intense. Caramels, ..."
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,What impresses me most is how this whisky evol...


In [27]:
# review points bin
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    
    for i in range(0, len(rp_bins)):
        if rp_bins[i][0] <= value < rp_bins[i][1]:
            return rp_bins[i][0] #lower end of the bin is returned
    return -1

# fill y value
rawData['rp_bins'] = rawData['review score'].apply(find_rp_bin)

In [28]:
# drop bottom 5 % and top 5 % based upon price
bottom5 = np.percentile(rawData['price'],5,axis=0, interpolation='lower')
top5 = np.percentile(rawData['price'],95,axis=0, interpolation='lower')
print(bottom5,top5)

25 625


In [29]:
rawData = rawData[rawData.price >= 25]
rawData = rawData[rawData.price <= 625]

In [30]:
# price quartiles
Q1 = np.percentile(rawData['price'],25,axis=0, interpolation='lower')
Q2 = np.percentile(rawData['price'],50,axis=0, interpolation='lower')
Q3 = np.percentile(rawData['price'],75,axis=0, interpolation='lower')
Q4 = np.percentile(rawData['price'],100,axis=0, interpolation='lower')
print(Q1,Q2,Q3,Q4)

50 79 127 625


In [31]:
# price bin
price_bins = [(0,Q1), (Q1,Q2), (Q2,Q3), (Q3,Q4+1)]

# find bin based on value
def find_price_bin(value):
    price = 0
    for i in range(0, len(price_bins)):
        if price_bins[i][0] <= value < price_bins[i][1]:
            return price_bins[i][0] #lower end of the bin is returned
    return -1

# 
rawData['price_bins'] = rawData['price'].apply(find_price_bin)

In [32]:
price_bins

[(0, 50), (50, 79), (79, 127), (127, 626)]

# Models

## Logistic Regression

In [36]:
def LogisticRegressionModel(dataset,textColumn,yColumn):
    # input:
    #   dataset: unrestricted dataset
    #   textColumn: column name containing text that will be analysed for machine learning
    #   yColumn: column name containing the label
    # return
    #   LogisticRegressionModel_score

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
    
    try:    
        # vectorize training descriptions
        vectorizer = CountVectorizer()
        vectorizer.fit(X_train)
        X_v_train = vectorizer.transform(X_train)
        X_v_test  = vectorizer.transform(X_test)

        # logistic regression classification model 
#         classifier = LogisticRegression(solver='liblinear',multi_class='auto')
        classifier = LogisticRegression(solver='newton-cg',multi_class='auto')
        classifier.fit(X_v_train, y_train) # vectorized training data
        LogisticRegressionModel_score = classifier.score(X_v_test, y_test)
    except:
        LogisticRegressionModel_score = 0
    
    return LogisticRegressionModel_score


## Random forests

In [13]:
def RandomForestModel(dataset,textColumn,yColumn):
    # input:
    #   dataset: unrestricted dataset
    #   textColumn: column name containing text that will be analysed for machine learning
    #   yColumn: column name containing the label
    # return
    #   RandomForestModel_score

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
    
    try:
        # vectorize training descriptions
        vectorizer = CountVectorizer()
        vectorizer.fit(X_train)
        X_v_train = vectorizer.transform(X_train)
        X_v_test  = vectorizer.transform(X_test)

        # Random Forest classification model
        classifier = RandomForestClassifier(n_estimators=200)
        classifier.fit(X_v_train, y_train) # vectorized training data
        RandomForestModel_score = classifier.score(X_v_test, y_test)
    except:
        RandomForestModel_score = 0
    
    return RandomForestModel_score


## Naive Bayes - Multinomial

In [14]:
def NaiveBayesMultinomialModel(dataset,textColumn,yColumn):
    # input:
    #   dataset: unrestricted dataset
    #   textColumn: column name containing text that will be analysed for machine learning
    #   yColumn: column name containing the label
    # return
    #   NaiveBayesMultinomialModel_score

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
    
    # vectorize training descriptions
    vectorizer = CountVectorizer()
    vectorizer.fit(X_train)
    X_v_train = vectorizer.transform(X_train)
    X_v_test  = vectorizer.transform(X_test)
    
    # Naive Bayes - Multinomial - classification model
    classifier = MultinomialNB()
    classifier.fit(X_v_train, y_train) # vectorized training data
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    NaiveBayesMultinomialModel_score = metrics.accuracy_score(y_test, classifier.predict(X_v_test))
    
    return NaiveBayesMultinomialModel_score


## Naive Bayes - Bernoulli

In [15]:
def NaiveBayesBernoulliModel(dataset,textColumn,yColumn):
    # input:
    #   dataset: unrestricted dataset
    #   textColumn: column name containing text that will be analysed for machine learning
    #   yColumn: column name containing the label
    # return
    #   NaiveBayesBernoulliModel_score

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
    
    # vectorize training descriptions
    vectorizer = CountVectorizer()
    vectorizer.fit(X_train)
    X_v_train = vectorizer.transform(X_train)
    X_v_test  = vectorizer.transform(X_test)
    
    # Naive Bayes - Bernoulli - classification model
    classifier = BernoulliNB()
    classifier.fit(X_v_train, y_train) # vectorized training data
    BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    NaiveBayesBernoulliModel_score = metrics.accuracy_score(y_test, classifier.predict(X_v_test))
    
    return NaiveBayesBernoulliModel_score


## Support Vector Machine

In [16]:
def SupportVectorMachineModel(dataset,textColumn,yColumn):
    # input:
    #   dataset: unrestricted dataset
    #   textColumn: column name containing text that will be analysed for machine learning
    #   yColumn: column name containing the label
    # return
    #   SupportVectorMachineModel_score

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
    
    # vectorize training descriptions
    vectorizer = CountVectorizer()
    vectorizer.fit(X_train)
    X_v_train = vectorizer.transform(X_train)
    X_v_test  = vectorizer.transform(X_test)
    
    # Naive Bayes - Bernoulli - classification model
    classifier = SVC(gamma='auto')
    classifier.fit(X_v_train, y_train) # vectorized training data
    SupportVectorMachineModel_score = metrics.accuracy_score(y_test, classifier.predict(X_v_test))
    
    return SupportVectorMachineModel_score


# Machine Learning

In [17]:
def AllModelsApplied(dataset,check):
    # Logistic Regression
    LRM_ReviewPoints_Value_score = LogisticRegressionModel(dataset,'description','review score')
    LRM_ReviewPoints_Bins_score = LogisticRegressionModel(dataset,'description','rp_bins')
    LRM_Price_Value_score = LogisticRegressionModel(dataset,'description','price')
    LRM_Price_Bins_score = LogisticRegressionModel(dataset,'description','price_bins')
    Outcomes.append(round(LRM_ReviewPoints_Value_score,2))
    Outcomes.append(round(LRM_ReviewPoints_Bins_score,2))
    Outcomes.append(round(LRM_Price_Value_score,2))
    Outcomes.append(round(LRM_Price_Bins_score,2))

    # Random Forest
    RF_ReviewPoints_Value_score = RandomForestModel(dataset,'description','review score')
    RF_ReviewPoints_Bins_score = RandomForestModel(dataset,'description','rp_bins')
    RF_Price_Value_score = RandomForestModel(dataset,'description','price')
    RF_Price_Bins_score = RandomForestModel(dataset,'description','price_bins')
    Outcomes.append(round(RF_ReviewPoints_Value_score,2))
    Outcomes.append(round(RF_ReviewPoints_Bins_score,2))
    Outcomes.append(round(RF_Price_Value_score,2))
    Outcomes.append(round(RF_Price_Bins_score,2))

    # Naive Bayes - Multinomial
    NBM_ReviewPoints_Value_score = NaiveBayesMultinomialModel(dataset,'description','review score')
    NBM_ReviewPoints_Bins_score = NaiveBayesMultinomialModel(dataset,'description','rp_bins')
    NBM_Price_Value_score = NaiveBayesMultinomialModel(dataset,'description','price')
    NBM_Price_Bins_score = NaiveBayesMultinomialModel(dataset,'description','price_bins')
    Outcomes.append(round(NBM_ReviewPoints_Value_score,2))
    Outcomes.append(round(NBM_ReviewPoints_Bins_score,2))
    Outcomes.append(round(NBM_Price_Value_score,2))
    Outcomes.append(round(NBM_Price_Bins_score,2))

    # Naive Bayes - Bernoulli
    NBB_ReviewPoints_Value_score = NaiveBayesBernoulliModel(dataset,'description','review score')
    NBB_ReviewPoints_Bins_score = NaiveBayesBernoulliModel(dataset,'description','rp_bins')
    NBB_Price_Value_score = NaiveBayesBernoulliModel(dataset,'description','price')
    NBB_Price_Bins_score = NaiveBayesBernoulliModel(dataset,'description','price_bins')
    Outcomes.append(round(NBB_ReviewPoints_Value_score,2))
    Outcomes.append(round(NBB_ReviewPoints_Bins_score,2))
    Outcomes.append(round(NBB_Price_Value_score,2))
    Outcomes.append(round(NBB_Price_Bins_score,2))
    
    # Support Vector Machines
    SVM_ReviewPoints_Value_score = SupportVectorMachineModel(dataset,'description','review score')
    SVM_ReviewPoints_Bins_score = SupportVectorMachineModel(dataset,'description','rp_bins')
    SVM_Price_Value_score = SupportVectorMachineModel(dataset,'description','price')
    if ( check != 'Q1' and check != 'Q2' and check != 'Q3' and check != 'Q4'):
        SVM_Price_Bins_score = SupportVectorMachineModel(dataset,'description','price_bins')
    else:
        SVM_Price_Bins_score = 0
    Outcomes.append(round(SVM_ReviewPoints_Value_score,2))
    Outcomes.append(round(SVM_ReviewPoints_Bins_score,2))
    Outcomes.append(round(SVM_Price_Value_score,2))
    Outcomes.append(round(SVM_Price_Bins_score,2))
        
    return Outcomes

# Data Sets

In [18]:
# header data
Result = pd.DataFrame(columns=('Model','Feature','Full_Data','Q1_Q2','Q2_Q3','Q1','Q2','Q3','Q4'))
Result['Model'] = ['Logistic Regression','Logistic Regression','Logistic Regression','Logistic Regression','Random Forest','Random Forest','Random Forest','Random Forest','Naive Bayes - Multinomial','Naive Bayes - Multinomial','Naive Bayes - Multinomial','Naive Bayes - Multinomial','Naive Bayes - Bernoulli','Naive Bayes - Bernoulli','Naive Bayes - Bernoulli','Naive Bayes - Bernoulli','SupportVectorMachine','SupportVectorMachine','SupportVectorMachine','SupportVectorMachine']
Result['Feature'] = ['review score','review score bins','price','price bins','review score','review score bins','price','price bins','review score','review score bins','price','price bins','review score','review score bins','price','price bins','review score','review score bins','price','price bins']

In [19]:
# full data set
Outcomes = []
FilteredData = rawData.copy()
AllModelsApplied(FilteredData,'full data set')
Result['Full_Data'] = Outcomes

In [20]:
# Q1/Q2
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] >= Q2)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q1/Q2')
Result['Q1_Q2'] = Outcomes

In [21]:
# Q2/Q3
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] < Q1)].index
FilteredData.drop(indexNames , inplace=True)
indexNames = FilteredData[ (FilteredData['price'] >= Q3)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q2/Q3')
Result['Q2_Q3'] = Outcomes

In [22]:
# Q1
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] >= Q1)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q1')
Result['Q1'] = Outcomes

In [23]:
# Q2
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] < Q1)].index
FilteredData.drop(indexNames , inplace=True)
indexNames = FilteredData[ (FilteredData['price'] >= Q2)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q2')
Result['Q2'] = Outcomes

In [24]:
# Q3
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] < Q2)].index
FilteredData.drop(indexNames , inplace=True)
indexNames = FilteredData[ (FilteredData['price'] >= Q3)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q3')
Result['Q3'] = Outcomes

In [25]:
# Q4
Outcomes = []
FilteredData = rawData.copy()
indexNames = FilteredData[ (FilteredData['price'] < Q3)].index
FilteredData.drop(indexNames , inplace=True)
AllModelsApplied(FilteredData,'Q4')
Result['Q4'] = Outcomes

# Summary

In [26]:
Result

Unnamed: 0,Model,Feature,Full_Data,Q1_Q2,Q2_Q3,Q1,Q2,Q3,Q4
0,Logistic Regression,review score,0.11,0.11,0.1,0.14,0.1,0.11,0.1
1,Logistic Regression,review score bins,0.47,0.5,0.47,0.48,0.51,0.46,0.46
2,Logistic Regression,price,0.07,0.12,0.09,0.2,0.17,0.14,0.1
3,Logistic Regression,price bins,0.44,0.63,0.57,0.0,0.0,0.0,0.0
4,Random Forest,review score,0.09,0.11,0.12,0.15,0.11,0.09,0.12
5,Random Forest,review score bins,0.46,0.5,0.54,0.54,0.52,0.46,0.47
6,Random Forest,price,0.08,0.14,0.13,0.22,0.17,0.14,0.08
7,Random Forest,price bins,0.48,0.68,0.64,1.0,1.0,1.0,1.0
8,Naive Bayes - Multinomial,review score,0.09,0.14,0.11,0.12,0.14,0.09,0.11
9,Naive Bayes - Multinomial,review score bins,0.5,0.52,0.54,0.51,0.56,0.46,0.46


# Words

In [27]:
Result.to_csv('TextReviewOverview.csv')

In [6]:
Result = pd.read_csv('TextReviewOverview.csv')

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
Outcomes = []
dataset = rawData.copy()
indexNames = dataset[ (dataset['price_bins'] >= Q2)].index
dataset.drop(indexNames , inplace=True)
# Result['Q1_Q2'] = Outcomes

#  split data
X = dataset['description'].values
y = dataset['price_bins'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

    # vectorize training descriptions
vectorizer = TfidfVectorizer(stop_words=stopWords, max_features=50)
vectorizer.fit(X_train)
featureNames = vectorizer.get_feature_names()
X_v_train = vectorizer.transform(X_train)
X_v_test  = vectorizer.transform(X_test)

# Random Forest classification model
classifier = RandomForestClassifier(n_estimators=200)
classifier.fit(X_v_train, y_train) # vectorized training data
RandomForestModel_score = classifier.score(X_v_test, y_test)

In [55]:
classifier.class_weight

In [58]:
classifier.fit(X_v_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [61]:
featureNames

['apple',
 'bourbon',
 'caramel',
 'chocolate',
 'cinnamon',
 'citrus',
 'corn',
 'dark',
 'distillery',
 'dried',
 'dry',
 'finish',
 'flavors',
 'fresh',
 'fruit',
 'fruits',
 'grain',
 'hint',
 'honey',
 'lemon',
 'light',
 'like',
 'malt',
 'new',
 'nose',
 'notes',
 'oak',
 'old',
 'one',
 'orange',
 'palate',
 'peat',
 'pepper',
 'rich',
 'rye',
 'sherry',
 'smoke',
 'soft',
 'spice',
 'spices',
 'spicy',
 'sweet',
 'sweetness',
 'toffee',
 'vanilla',
 'whiskey',
 'whisky',
 'wood',
 'year',
 'years']

In [60]:
classifier.feature_importances_

array([0.0135924 , 0.0179647 , 0.02129999, 0.02041331, 0.01960871,
       0.01391127, 0.02353406, 0.01412008, 0.01654921, 0.01336544,
       0.01338555, 0.04124588, 0.01806338, 0.0155288 , 0.02596583,
       0.01362344, 0.01454145, 0.01245092, 0.0169954 , 0.01216409,
       0.02506965, 0.01927553, 0.02051106, 0.0138293 , 0.03389338,
       0.02981037, 0.02919623, 0.01610588, 0.0175476 , 0.02188878,
       0.0295274 , 0.01606408, 0.0163846 , 0.0151307 , 0.02195973,
       0.01475945, 0.01534584, 0.01523697, 0.02385336, 0.0177164 ,
       0.01449361, 0.0354929 , 0.01621605, 0.01757819, 0.03247024,
       0.04048642, 0.02475295, 0.01556653, 0.01493957, 0.01657333])

In [82]:
df2 = pd.DataFrame(featureNames,columns=['words'])
df2['weights'] = classifier.feature_importances_
df2.sort_values(by=['weights'],ascending=False, inplace=True)

In [83]:
df2.head(10)

Unnamed: 0,words,weights
11,finish,0.041246
45,whiskey,0.040486
41,sweet,0.035493
24,nose,0.033893
44,vanilla,0.03247
25,notes,0.02981
30,palate,0.029527
26,oak,0.029196
14,fruit,0.025966
20,light,0.02507


In [76]:
df2.to_csv('wordweighting.csv')