# Machine Learning

In [1]:
# libraries
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# read data
rawData = pd.read_csv(os.path.join("clean-data","Whiskey_data","Whiskey_Advocate_All_scraped_KHupdate-with-description.csv"), encoding='iso-8859-1' )
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [3]:
# head data
rawData.head()

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,What impresses me most is how this whisky evol...
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,There have been some legendary Bowmores from t...
2,3,Bowmore,,46 year old,42.90%,0,0,0,0,0,1,0,1,0,97,13500,Single Malt Scotch,UK,4.0,With a name inspired by a 1926 Buster Keaton m...
3,4,Compass Box,,30 years old,53.40%,1,0,0,0,0,1,0,1,0,96,325,Blended Malt Scotch Whisky,UK,5.0,"Captivating, enticing, and wonderfully charmin..."
4,5,Chivas,,,40.00%,1,0,0,0,0,1,0,1,0,96,160,Blended Malt Scotch Whisky,UK,6.0,Deep gold color. Surprisingly lively on the no...


In [4]:
# review points bin
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    
    for i in range(0, len(rp_bins)):
        if rp_bins[i][0] <= value < rp_bins[i][1]:
            return rp_bins[i][0] #lower end of the bin is returned
    return -1

# fill y value
rawData['rp_bins'] = rawData['review score'].apply(find_rp_bin)

In [5]:
# price bin
price_bins = [(0,10), (10,25), (25,50), (50,75), (75,250), (250,500), (500,1000000)]

# find bin based on value
def find_price_bin(value):
    price = 0
    for i in range(0, len(price_bins)):
        if price_bins[i][0] <= value < price_bins[i][1]:
            return price_bins[i][0] #lower end of the bin is returned
    return -1

# 
rawData['price_bins'] = rawData['price'].apply(find_price_bin)

In [6]:
# clean data
rawData.dropna(how = 'all',inplace=True)
rawData.dropna(subset=['description', 'price'], inplace = True)

# Models

## Review Points

### Logistic Regresssion

In [7]:
# Specific libraries
from sklearn.linear_model import LogisticRegression

In [8]:
# vectorize description: take the words of each description and create a vocabulary of all the unique words in the descriptions.
# This vocabulary can then be used to create a feature vector of the count of the words:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(rawData['description'])
# vectorizer.vocabulary_

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [9]:
# create vector with all words for each description = Bag-of-words (BOW) model
vectorizer.transform(rawData['description']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Values

In [10]:
#  split data
X_LR_rp = rawData['description'].values
y_LR_rp_values = rawData['review score'].values

X_LR_rp_train, X_LR_rp_test, y_LR_rp_values_train, y_LR_rp_values_test = train_test_split(X_LR_rp, y_LR_rp_values, test_size=0.25, random_state=1000)

In [11]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_rp_train)
X_LR_rp_v_train = vectorizer.transform(X_LR_rp_train)
X_LR_rp_v_test  = vectorizer.transform(X_LR_rp_test)

In [12]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_rp_v_train, y_LR_rp_values_train) # vectorized training data
score_LR_rp_values = classifier.score(X_LR_rp_v_test, y_LR_rp_values_test)
print("Review Points:","Logistic Regression:","values:","Accuracy:",score_LR_rp_values)



Review Points Logistic Regression: values: Accuracy: 0.12254901960784313


#### Bins

In [13]:
#  split data
X_LR_rp = rawData['description'].values
y_LR_rp_bins = rawData['review score'].values

X_LR_rp_train, X_LR_rp_test, y_LR_rp_bins_train, y_LR_rp_bins_test = train_test_split(X_LR_rp, y_LR_rp_bins, test_size=0.25, random_state=1000)

In [14]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_rp_train)
X_LR_rp_v_train = vectorizer.transform(X_LR_rp_train)
X_LR_rp_v_test  = vectorizer.transform(X_LR_rp_test)

In [15]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_rp_v_train, y_LR_rp_bins_train) # vectorized training data
score_LR_rp_bins = classifier.score(X_LR_rp_v_test, y_LR_rp_values_test)
print("Review Points:","Logistic Regression:","bins:","Accuracy:",score_LR_rp_bins)



Review Points Logistic Regression: values: Accuracy: 0.12254901960784313


### Random forests

In [16]:
# Specific libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

#### Values

In [17]:
#  split data
X_RF_rp = rawData['description'].values
y_RF_rp_values = rawData['review score'].values

X_RF_rp_train, X_RF_rp_test, y_RF_rp_values_train, y_RF_rp_values_test = train_test_split(X_RF_rp, y_RF_rp_values, test_size=0.25, random_state=1000)

In [18]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_rp_train)
X_RF_rp_v_train = vectorizer.transform(X_RF_rp_train)
X_RF_rp_v_test  = vectorizer.transform(X_RF_rp_test)

In [20]:
# Create a random forest classifier
rf_rp = RandomForestClassifier(n_estimators=200)
rf_rp_values = rf_rp.fit(X_RF_rp_v_train, y_RF_rp_values_train)
score_RF_rp_values = rf_rp_values.score(X_RF_rp_v_test, y_RF_rp_values_test)
print("Review Points:","Random forests:","values:","Accuracy:",score_RF_rp_values)

Review Points Random forests: values: Accuracy: 0.10130718954248366


#### Bins

In [21]:
#  split data
X_RF_rp = rawData['description'].values
y_RF_rp_bins = rawData['review score'].values

X_RF_rp_train, X_RF_rp_test, y_RF_rp_bins_train, y_RF_rp_bins_test = train_test_split(X_RF_rp, y_RF_rp_bins, test_size=0.25, random_state=1000)

In [22]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_rp_train)
X_RF_rp_v_train = vectorizer.transform(X_RF_rp_train)
X_RF_rp_v_test  = vectorizer.transform(X_RF_rp_test)

In [23]:
# Create a random forest classifier
rf_rp = RandomForestClassifier(n_estimators=200)
rf_rp_bins = rf_rp.fit(X_RF_rp_v_train, y_RF_rp_bins_train)
score_RF_rp_bins = rf_rp_bins.score(X_RF_rp_v_test, y_RF_rp_bins_test)
print("Review Points:","Random forests:","bins:","Accuracy:",score_RF_rp_bins)

Review Points Random forests: bins: Accuracy: 0.11764705882352941


### Naive Bayes - Multinomial
It is used for discrete counts. For example, let’s say, we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.

#### Values

In [25]:
# split data
X_NBM_rp = rawData['description'].values
y_NBM_rp_values = rawData['review score'].values

X_NBM_rp_train, X_NBM_rp_test, y_NBM_rp_values_train, y_NBM_rp_values_test = train_test_split(X_NBM_rp, y_NBM_rp_values, test_size=0.25, random_state=1000)

In [27]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBM_rp_train)
X_NBM_rp_v_train = vectorizer.transform(X_NBM_rp_train)
X_NBM_rp_v_test  = vectorizer.transform(X_NBM_rp_test)

In [32]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model_NBM_rp = MultinomialNB()
# Train the model using the training sets
model_NBM_rp.fit(X_NBM_rp_v_train,y_NBM_rp_values_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_NBM_rp_values_prediction = model_NBM_rp.predict(X_NBM_rp_v_test)
score_NBM_rp_values = metrics.accuracy_score(y_NBM_rp_values_test, y_NBM_rp_values_prediction)
print("Review Points:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_rp_values)

Review Points: Naive Bayes - Multinomial: review points: Accuracy: 0.119281045751634


### Naive Bayes - Bernoulli
The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

#### Values

In [37]:
# split data
X_NBB_rp = rawData['description'].values
y_NBB_rp_values = rawData['review score'].values

X_NBB_rp_train, X_NBB_rp_test, y_NBB_rp_values_train, y_NBB_rp_values_test = train_test_split(X_NBB_rp, y_NBB_rp_values, test_size=0.25, random_state=1000)

In [38]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBB_rp_train)
X_NBB_rp_v_train = vectorizer.transform(X_NBB_rp_train)
X_NBB_rp_v_test  = vectorizer.transform(X_NBB_rp_test)

In [39]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model_NBB_rp = BernoulliNB()
# Train the model using the training sets
model_NBB_rp.fit(X_NBB_rp_v_train,y_NBB_rp_values_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_NBB_rp_values_prediction = model_NBB_rp.predict(X_NBB_rp_v_test)
score_NBB_rp_values = metrics.accuracy_score(y_NBB_rp_values_test, y_NBB_rp_values_prediction)
print("Review Points:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_rp_values)

Review Points: Naive Bayes - Bernoulli: review points: Accuracy: 0.10620915032679738


In [41]:
# Summary
print("Review Points:","Logistic Regression:","values:","Accuracy:",score_LR_rp_values)
print("Review Points:","Logistic Regression:","bins:","Accuracy:",score_LR_rp_bins)

print("Review Points:","Random forests:","values:","Accuracy:",score_RF_rp_values)
print("Review Points:","Random forests:","bins:","Accuracy:",score_RF_rp_bins)

print("Review Points:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_rp_values)
print("Review Points:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_rp_values)

Review Points: Logistic Regression: values: Accuracy: 0.12254901960784313
Review Points: Logistic Regression: bins: Accuracy: 0.12254901960784313
Review Points: Random forests: values: Accuracy: 0.10130718954248366
Review Points: Random forests: bins: Accuracy: 0.11764705882352941
Review Points: Naive Bayes - Multinomial: values: Accuracy: 0.119281045751634
Review Points: Naive Bayes - Bernoulli: values: Accuracy: 0.10620915032679738
