# Machine Learning

In [1]:
# libraries
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [10]:
# read data
rawData = pd.read_csv(os.path.join("clean-data","Whiskey_data","Whiskey_Advocate_All_scraped_KHupdate-with-description.csv"), encoding='iso-8859-1' )
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [11]:
# head data
rawData.head()

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,What impresses me most is how this whisky evol...
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,There have been some legendary Bowmores from t...
2,3,Bowmore,,46 year old,42.90%,0,0,0,0,0,1,0,1,0,97,13500,Single Malt Scotch,UK,4.0,With a name inspired by a 1926 Buster Keaton m...
3,4,Compass Box,,30 years old,53.40%,1,0,0,0,0,1,0,1,0,96,325,Blended Malt Scotch Whisky,UK,5.0,"Captivating, enticing, and wonderfully charmin..."
4,5,Chivas,,,40.00%,1,0,0,0,0,1,0,1,0,96,160,Blended Malt Scotch Whisky,UK,6.0,Deep gold color. Surprisingly lively on the no...


In [13]:
# review points bin
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    
    for i in range(0, len(rp_bins)):
        if rp_bins[i][0] <= value < rp_bins[i][1]:
            return rp_bins[i][0] #lower end of the bin is returned
    return -1

# fill y value
rawData['rp_bins'] = rawData['review score'].apply(find_rp_bin)

In [7]:
# price bin
price_bins = [(0,10), (10,25), (25,50), (50,75), (75,250), (250,500), (500,1000000)]

# find bin based on value
def find_price_bin(value):
    price = 0
    for i in range(0, len(price_bins)):
        if price_bins[i][0] <= value < price_bins[i][1]:
            return price_bins[i][0] #lower end of the bin is returned
    return -1

# 
rawData['price_bins'] = rawData['price'].apply(find_price_bin)

In [14]:
# clean data
rawData.dropna(how = 'all',inplace=True)
rawData.dropna(subset=['description', 'price'], inplace = True)

## Models

### Logistic Regresssion

In [15]:
from sklearn.linear_model import LogisticRegression

In [17]:
# vectorize description: take the words of each description and create a vocabulary of all the unique words in the descriptions.
# This vocabulary can then be used to create a feature vector of the count of the words:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(rawData['description'])
# vectorizer.vocabulary_

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
# create vector with all words for each description = Bag-of-words (BOW) model
vectorizer.transform(rawData['description']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Values

In [20]:
#  split data
X_LR = rawData['description'].values
y_LR_values = rawData['review score'].values

X_LR_train, X_LR_test, y_LR_values_train, y_LR_values_test = train_test_split(X_LR, y_LR_values, test_size=0.25, random_state=1000)

In [21]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_train)
X_LR_v_train = vectorizer.transform(X_LR_train)
X_LR_v_test  = vectorizer.transform(X_LR_test)

In [22]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_v_train, y_LR_values_train) # vectorized training data
score_LR_values = classifier.score(X_LR_v_test, y_LR_values_test)
print("Logistic Regression:","values:","Accuracy:",score_LR_values)



Logistic Regression: values: Accuracy: 0.12254901960784313


#### Bins

In [25]:
#  split data
X_LR = rawData['description'].values
y_LR_bins = rawData['review score'].values

X_LR_train, X_LR_test, y_LR_bins_train, y_LR_bins_test = train_test_split(X_LR, y_LR_bins, test_size=0.25, random_state=1000)

In [26]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_train)
X_LR_v_train = vectorizer.transform(X_LR_train)
X_LR_v_test  = vectorizer.transform(X_LR_test)

In [27]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_v_train, y_LR_bins_train) # vectorized training data
score_LR_bins = classifier.score(X_LR_v_test, y_LR_values_test)
print("Logistic Regression:","values:","Accuracy:",score_LR_bins)



Logistic Regression: values: Accuracy: 0.12254901960784313
