In [37]:
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [38]:
# read data
rawData = pd.read_csv(os.path.join("clean-data","Whiskey_data","Whiskey_Advocate_All_scraped_KHupdate-with-description.csv"), encoding='iso-8859-1' )
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [39]:
rawData.head()

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,What impresses me most is how this whisky evol...
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,There have been some legendary Bowmores from t...
2,3,Bowmore,,46 year old,42.90%,0,0,0,0,0,1,0,1,0,97,13500,Single Malt Scotch,UK,4.0,With a name inspired by a 1926 Buster Keaton m...
3,4,Compass Box,,30 years old,53.40%,1,0,0,0,0,1,0,1,0,96,325,Blended Malt Scotch Whisky,UK,5.0,"Captivating, enticing, and wonderfully charmin..."
4,5,Chivas,,,40.00%,1,0,0,0,0,1,0,1,0,96,160,Blended Malt Scotch Whisky,UK,6.0,Deep gold color. Surprisingly lively on the no...


In [40]:
rawData.dropna(how = 'all',inplace=True)

In [43]:
#  review points bin
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    
    for i in range(0, len(rp_bins)):
        if rp_bins[i][0] <= value < rp_bins[i][1]:
            return rp_bins[i][0] #lower end of the bin is returned
    return -1

# fill y value
rawData['rp_bins'] = rawData['review score'].apply(find_rp_bin)

In [44]:
rawData['rp_bins'].head()

0    95
1    95
2    95
3    95
4    95
Name: rp_bins, dtype: int64

In [45]:
for i in rawData['price']

SyntaxError: invalid syntax (<ipython-input-45-8ccb191f0fe2>, line 1)

In [46]:
#  price bin
price_bins = [(0,10), (10,25), (25,50), (50,75), (75,250), (250,500), (500,1000000)]

# find bin based on value
def find_price_bin(value):
    
    for i in range(0, len(price_bins)):
#         try:
            if price_bins[i][0] <= value < price_bins[i][1]:
                return price_bins[i][0] #lower end of the bin is returned
            else:
                return -1
#         except:
#             return -1

# fill y value
rawData['price_bins'] = rawData['price'].apply(find_price_bin)

In [34]:
rawData['price_bins'].head()

0   -1
1   -1
2   -1
3   -1
4   -1
Name: price_bins, dtype: int64

In [35]:
find_price_bin(52)

-1

In [14]:
#  split data
descriptions = rawData['description'].values
# y = rawData['review.point'].values
y = rawData['price_bins'].values

descriptions_train, descriptions_test, y_train, y_test = train_test_split(descriptions, y, test_size=0.25, random_state=1000)

In [15]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(descriptions_train)
X_train = vectorizer.transform(descriptions_train)
X_test  = vectorizer.transform(descriptions_test)

## Naive Bayes
Multinomial: It is used for discrete counts. For example, let’s say,  we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.

Bernoulli: The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

## Multinomial

In [16]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model = MultinomialNB()
# Train the model using the training sets
model.fit(X_train,y_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_prediction = model.predict(X_test)

In [17]:
print("Accuracy:",metrics.accuracy_score(y_test, y_prediction))

Accuracy: 1.0


## Bernoulli

In [18]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model = BernoulliNB()
# Train the model using the training sets
model.fit(X_train,y_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_prediction = model.predict(X_test)

In [19]:
print("Accuracy:",metrics.accuracy_score(y_test, y_prediction))

Accuracy: 1.0
