# Whiskey Review

In [1]:
# general libraries
import pandas as pd
import os
import numpy as np

# machine learning specific libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

# model specific libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


In [2]:
# read data: load the scraped Whiskey Advocate reviews (path is relative to the notebook's working directory)
rawData = pd.read_csv(
    os.path.join(
        "clean-data",
        "Whiskey_data",
        "Whiskey_Advocate_All_scraped_KHupdate-with-description.csv",
    ),
    encoding='iso-8859-1',
)
# show the available column names
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [3]:
# clean data (both calls modify rawData in place)
# 1) remove rows in which every column is missing
rawData.dropna(how = 'all',inplace=True)
# 2) remove rows lacking a description (the model input) or a price (a label and the quartile key)
rawData.dropna(subset=['description', 'price'], inplace = True)

In [4]:
# head data: preview the first two rows that remain after cleaning
rawData.head(2)

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,"Magnificently powerful and intense. Caramels, ..."
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,What impresses me most is how this whisky evol...


In [5]:
# review points bins as (lower, upper) edges.
# Every bin is half-open [lower, upper); the top edge of the last bin is
# treated as inclusive by find_rp_bin so a perfect score still gets a bin.
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    """Return the lower edge of the review-score bin that contains ``value``.

    input:
      value: numeric review score
    return:
      lower edge of the matching bin in ``rp_bins``, or -1 when the value
      lies outside every bin (negative, above the top edge, or NaN)
    """
    for lower, upper in rp_bins:
        if lower <= value < upper:
            return lower  # lower end of the bin is returned

    # The half-open test above excludes the very top edge (a score of 100),
    # which belongs to the highest bin rather than to "no bin".
    if rp_bins and value == rp_bins[-1][1]:
        return rp_bins[-1][0]
    return -1

# fill y value: label every review with the lower edge of its review-score bin
rawData['rp_bins'] = rawData['review score'].map(find_rp_bin)

In [6]:
# price bins as half-open (lower, upper) ranges: lower <= price < upper
price_bins = [(0,10), (10,25), (25,50), (50,75), (75,250), (250,500), (500,1000000)]

# find bin based on value
def find_price_bin(value):
    """Return the lower edge of the price bin that contains ``value``.

    input:
      value: numeric price
    return:
      lower edge of the matching bin in ``price_bins``, or -1 when the value
      lies outside every bin (negative, at or above 1000000, or NaN)
    """
    for lower, upper in price_bins:
        if lower <= value < upper:
            return lower  # lower end of the bin is returned
    return -1

# fill y value: label every review with the lower edge of its price bin
rawData['price_bins'] = rawData['price'].map(find_price_bin)

In [7]:
# price quartiles: 25th, 50th, 75th and 100th percentile of the price column,
# each computed with interpolation='lower' exactly as before
Q1, Q2, Q3, Q4 = [
    np.percentile(rawData['price'], pct, axis=0, interpolation='lower')
    for pct in (25, 50, 75, 100)
]
print(Q1,Q2,Q3,Q4)

50 79 135 157000


# Models

## Logistic Regression

In [None]:
def LogisticRegressionModel(dataset,textColumn,yColumn):
    """Train a bag-of-words logistic regression and return its test accuracy.

    input:
      dataset: unrestricted dataset
      textColumn: column name containing text that will be analysed for machine learning
      yColumn: column name containing the label
    return:
      LogisticRegressionModel_score: mean accuracy on the held-out 25% of rows,
      or 0 when vectorizing, fitting or scoring fails (a warning is issued)
    """
    import warnings

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

    try:
        # Bag-of-words (BOW) model: the vocabulary is learned from the training
        # descriptions only, then applied unchanged to the test descriptions.
        vectorizer = CountVectorizer()
        X_v_train = vectorizer.fit_transform(X_train)
        X_v_test  = vectorizer.transform(X_test)

        # logistic regression classification model
        classifier = LogisticRegression(solver='liblinear',multi_class='auto')
        classifier.fit(X_v_train, y_train) # vectorized training data
        LogisticRegressionModel_score = classifier.score(X_v_test, y_test)
    except Exception as exc:
        # Best effort: keep the notebook running with a score of 0, but say why,
        # and let KeyboardInterrupt / SystemExit propagate instead of hiding them.
        warnings.warn(
            "LogisticRegressionModel failed for label column %r; reporting a score of 0: %r"
            % (yColumn, exc)
        )
        LogisticRegressionModel_score = 0

    return LogisticRegressionModel_score


## Random forests

In [None]:
def RandomForestModel(dataset,textColumn,yColumn):
    """Train a bag-of-words random forest and return its test accuracy.

    input:
      dataset: unrestricted dataset
      textColumn: column name containing text that will be analysed for machine learning
      yColumn: column name containing the label
    return:
      RandomForestModel_score: mean accuracy on the held-out 25% of rows,
      or 0 when vectorizing, fitting or scoring fails (a warning is issued)
    """
    import warnings

    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

    try:
        # vectorize training descriptions; the test set reuses the training vocabulary
        vectorizer = CountVectorizer()
        X_v_train = vectorizer.fit_transform(X_train)
        X_v_test  = vectorizer.transform(X_test)

        # Random Forest classification model; seeded (same seed as the split)
        # so that re-running the notebook reproduces the reported score
        classifier = RandomForestClassifier(n_estimators=200, random_state=1000)
        classifier.fit(X_v_train, y_train) # vectorized training data
        RandomForestModel_score = classifier.score(X_v_test, y_test)
    except Exception as exc:
        # Best effort: keep the notebook running with a score of 0, but say why,
        # and let KeyboardInterrupt / SystemExit propagate instead of hiding them.
        warnings.warn(
            "RandomForestModel failed for label column %r; reporting a score of 0: %r"
            % (yColumn, exc)
        )
        RandomForestModel_score = 0

    return RandomForestModel_score


## Naive Bayes - Multinomial

In [None]:
def NaiveBayesMultinomialModel(dataset,textColumn,yColumn):
    """Train a bag-of-words multinomial Naive Bayes model and return its test accuracy.

    input:
      dataset: unrestricted dataset
      textColumn: column name containing text that will be analysed for machine learning
      yColumn: column name containing the label
    return:
      NaiveBayesMultinomialModel_score: accuracy on the held-out 25% of rows
    """
    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

    # vectorize training descriptions; the test set reuses the training vocabulary
    vectorizer = CountVectorizer()
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test  = vectorizer.transform(X_test)

    # Naive Bayes - Multinomial - classification model (default hyper-parameters)
    classifier = MultinomialNB()
    classifier.fit(X_v_train, y_train) # vectorized training data
    NaiveBayesMultinomialModel_score = metrics.accuracy_score(y_test, classifier.predict(X_v_test))

    return NaiveBayesMultinomialModel_score


## Naive Bayes - Bernoulli

In [None]:
def NaiveBayesBernoulliModel(dataset,textColumn,yColumn):
    """Train a bag-of-words Bernoulli Naive Bayes model and return its test accuracy.

    input:
      dataset: unrestricted dataset
      textColumn: column name containing text that will be analysed for machine learning
      yColumn: column name containing the label
    return:
      NaiveBayesBernoulliModel_score: accuracy on the held-out 25% of rows
    """
    #  split data
    X = dataset[textColumn].values
    y = dataset[yColumn].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

    # vectorize training descriptions; the test set reuses the training vocabulary
    vectorizer = CountVectorizer()
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test  = vectorizer.transform(X_test)

    # Naive Bayes - Bernoulli - classification model (default hyper-parameters)
    classifier = BernoulliNB()
    classifier.fit(X_v_train, y_train) # vectorized training data
    NaiveBayesBernoulliModel_score = metrics.accuracy_score(y_test, classifier.predict(X_v_test))

    return NaiveBayesBernoulliModel_score


# Machine Learning

In [None]:
def AllModelsApplied(dataset,check):
    """Score every model on every label column and append the results to ``Outcomes``.

    input:
      dataset: data subset to train and evaluate on
      check: name of the subset; for 'Q1', 'Q3' and 'Q4' the logistic
             regression is not run and its four scores are recorded as 0
    return:
      the module-level ``Outcomes`` list, which must exist before the call.
      Sixteen values are appended to it: four models (Logistic Regression,
      Random Forest, Naive Bayes - Multinomial, Naive Bayes - Bernoulli), each
      scored on 'review score', 'rp_bins', 'price' and 'price_bins', in that
      order and rounded to two decimals.
    """
    label_columns = ('review score', 'rp_bins', 'price', 'price_bins')

    # Logistic Regression
    if check in ('Q1', 'Q3', 'Q4'):
        lr_scores = [0] * len(label_columns)
    else:
        lr_scores = [LogisticRegressionModel(dataset,'description',column) for column in label_columns]
    Outcomes.extend([round(score,2) for score in lr_scores])

    # Random Forest, Naive Bayes - Multinomial, Naive Bayes - Bernoulli
    for model in (RandomForestModel, NaiveBayesMultinomialModel, NaiveBayesBernoulliModel):
        # compute all four scores of a model before recording any of them
        model_scores = [model(dataset,'description',column) for column in label_columns]
        Outcomes.extend([round(score,2) for score in model_scores])

    return Outcomes

# Data Sets

In [None]:
# header data: one row per (model, feature) pair, one score column per data subset
Result = pd.DataFrame(columns=('Model','Feature','Full Data','Q1/Q2','Q2/Q3','Q1','Q2','Q3','Q4'))
# each model name is repeated once for each of the four features ...
Result['Model'] = [
    model_name
    for model_name in ('Logistic Regression', 'Random Forest', 'Naive Bayes - Multinomial', 'Naive Bayes - Bernoulli')
    for _ in range(4)
]
# ... and the four features cycle once per model
Result['Feature'] = ['review score', 'review score bins', 'price', 'price bins'] * 4

In [None]:
# full data set: baseline scores on every cleaned row
FilteredData = rawData
Outcomes = []  # AllModelsApplied appends its scores to this list and returns it
Result['Full Data'] = AllModelsApplied(FilteredData,'full data set')

In [None]:
# Q1/Q2: rows priced below Q2 (first and second price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[rawData['price'] < Q2].copy()
AllModelsApplied(FilteredData,'Q1/Q2')
Result['Q1/Q2'] = Outcomes

In [None]:
# Q2/Q3: rows priced from Q1 up to (not including) Q3 (second and third price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[(rawData['price'] >= Q1) & (rawData['price'] < Q3)].copy()
AllModelsApplied(FilteredData,'Q2/Q3')
Result['Q2/Q3'] = Outcomes

In [None]:
# Q1: rows priced below Q1 (first price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[rawData['price'] < Q1].copy()
AllModelsApplied(FilteredData,'Q1')
Result['Q1'] = Outcomes

In [None]:
# Q2: rows priced from Q1 up to (not including) Q2 (second price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[(rawData['price'] >= Q1) & (rawData['price'] < Q2)].copy()
AllModelsApplied(FilteredData,'Q2')
Result['Q2'] = Outcomes

In [None]:
# Q3: rows priced from Q2 up to (not including) Q3 (third price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[(rawData['price'] >= Q2) & (rawData['price'] < Q3)].copy()
AllModelsApplied(FilteredData,'Q3')
Result['Q3'] = Outcomes

In [None]:
# Q4: rows priced at Q3 or above (fourth price quartile)
Outcomes = []
# Filter into a fresh copy instead of dropping rows in place: a plain
# `FilteredData = rawData` only aliases rawData, so an in-place drop would
# remove these rows from rawData for every later cell as well.
FilteredData = rawData[rawData['price'] >= Q3].copy()
AllModelsApplied(FilteredData,'Q4')
Result['Q4'] = Outcomes

# Summary

In [None]:
# display the score table: one row per (model, feature) pair, one column per data subset
Result

In [None]:
# inspect the first-quartile subset (rows priced below Q1)
Outcomes = []
# take a filtered copy so that rawData itself keeps all of its rows
FilteredData = rawData[rawData['price'] < Q1].copy()
# call describe() so the summary statistics are shown, not the bound method's repr
FilteredData.describe()

In [None]:
# display the 25th-percentile price computed in the price quartiles cell
Q1

In [None]:
# summary statistics of the numeric columns of the cleaned data
rawData.describe()