# Machine Learning

In [1]:
# libraries
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [103]:
# read data
rawData = pd.read_csv(os.path.join("clean-data","Whiskey_data","Whiskey_Advocate_All_scraped_KHupdate-with-description.csv"), encoding='iso-8859-1' )
rawData.columns

Index(['row_caller', 'Maker', 'distilled', 'age in cast', 'ABV', 'Blended',
       'Bourbon', 'Flavored', 'Other', 'Rye', 'Scotch', 'single blended grain',
       'single blended malt', 'world', 'review score', 'price', 'style',
       'country', 'row_other', 'description'],
      dtype='object')

In [104]:
# head data
rawData.head()

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,Scotch,single blended grain,single blended malt,world,review score,price,style,country,row_other,description
0,1,Johnnie Walker,,,40.00%,1,0,0,0,0,1,0,0,0,97,225,Blended Scotch Whisky,UK,2.0,"Magnificently powerful and intense. Caramels, ..."
1,2,Black Bowmore,1964.0,42 year old,40.50%,0,0,0,0,0,1,0,1,0,97,4500,Single Malt Scotch,UK,3.0,What impresses me most is how this whisky evol...
2,3,Bowmore,,46 year old,42.90%,0,0,0,0,0,1,0,1,0,97,13500,Single Malt Scotch,UK,4.0,There have been some legendary Bowmores from t...
3,4,Compass Box,,30 years old,53.40%,1,0,0,0,0,1,0,1,0,96,325,Blended Malt Scotch Whisky,UK,5.0,With a name inspired by a 1926 Buster Keaton m...
4,5,Chivas,,,40.00%,1,0,0,0,0,1,0,1,0,96,160,Blended Malt Scotch Whisky,UK,6.0,"Captivating, enticing, and wonderfully charmin..."


In [105]:
# review points bin
rp_bins = [(0,75), (75,80), (80,85), (85,90), (90,95), (95, 100)]

# find bin based on value
def find_rp_bin(value):
    
    for i in range(0, len(rp_bins)):
        if rp_bins[i][0] <= value < rp_bins[i][1]:
            return rp_bins[i][0] #lower end of the bin is returned
    return -1

# fill y value
rawData['rp_bins'] = rawData['review score'].apply(find_rp_bin)

In [106]:
# price bin
price_bins = [(0,10), (10,25), (25,50), (50,75), (75,250), (250,500), (500,1000000)]

# find bin based on value
def find_price_bin(value):
    price = 0
    for i in range(0, len(price_bins)):
        if price_bins[i][0] <= value < price_bins[i][1]:
            return price_bins[i][0] #lower end of the bin is returned
    return -1

# 
rawData['price_bins'] = rawData['price'].apply(find_price_bin)

In [107]:
# clean data
rawData.dropna(how = 'all',inplace=True)
rawData.dropna(subset=['description', 'price'], inplace = True)

In [108]:
# price quartiles
Q1 = np.percentile(rawData['price'],25,axis=0, interpolation='lower')
Q2 = np.percentile(rawData['price'],50,axis=0, interpolation='lower')
Q3 = np.percentile(rawData['price'],75,axis=0, interpolation='lower')
Q4 = np.percentile(rawData['price'],100,axis=0, interpolation='lower')
print(Q1,Q2,Q3,Q4)

50 79 135 157000


In [109]:
# drop quartiles Q1 / Q4
indexNames = rawData[ (rawData['price'] < Q1)].index
rawData.drop(indexNames , inplace=True)
indexNames = rawData[ (rawData['price'] >= Q3)].index
rawData.drop(indexNames , inplace=True)

In [110]:
rawData.head()

Unnamed: 0,row_caller,Maker,distilled,age in cast,ABV,Blended,Bourbon,Flavored,Other,Rye,...,single blended malt,world,review score,price,style,country,row_other,description,rp_bins,price_bins
12,13,Compass Box,,,48.90%,1,0,0,0,0,...,1,0,95,105,Blended Malt Scotch Whisky,UK,14.0,"A marriage of three different single malts, ag...",95,75
13,14,Compass Box,,,54.70%,1,0,0,0,0,...,0,0,95,120,Blended Scotch Whisky,UK,15.0,"As you 'd expect, solid peat is the first thin...",95,75
15,16,Chivas,,18 year old,40.00%,1,0,0,0,0,...,0,0,95,70,Blended Scotch Whisky,UK,17.0,An essay in balance on both the aroma and pala...,95,50
18,19,Ardbeg,,,57.10%,0,0,0,0,0,...,1,0,95,90,Single Malt Scotch,UK,20.0,"Part of the permanent Ardbeg range since 2008,...",95,75
39,41,Compass Box,,,46.00%,1,0,0,0,0,...,1,0,94,65,Blended Malt Scotch Whisky,UK,42.0,The formula for this whisky has changed slight...,90,50


# Models

## Review Points

### Logistic Regresssion

In [111]:
# Specific libraries
from sklearn.linear_model import LogisticRegression

In [112]:
# vectorize description: take the words of each description and create a vocabulary of all the unique words in the descriptions.
# This vocabulary can then be used to create a feature vector of the count of the words:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(rawData['description'])
# vectorizer.vocabulary_

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [113]:
# create vector with all words for each description = Bag-of-words (BOW) model
vectorizer.transform(rawData['description']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Values

In [114]:
#  split data
X_LR_rp = rawData['description'].values
y_LR_rp_values = rawData['review score'].values

X_LR_rp_train, X_LR_rp_test, y_LR_rp_values_train, y_LR_rp_values_test = train_test_split(X_LR_rp, y_LR_rp_values, test_size=0.25, random_state=1000)

In [115]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_rp_train)
X_LR_rp_v_train = vectorizer.transform(X_LR_rp_train)
X_LR_rp_v_test  = vectorizer.transform(X_LR_rp_test)

In [116]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_rp_v_train, y_LR_rp_values_train) # vectorized training data
score_LR_rp_values = classifier.score(X_LR_rp_v_test, y_LR_rp_values_test)
print("Review Points:","Logistic Regression:","values:","Accuracy:",score_LR_rp_values)



Review Points: Logistic Regression: values: Accuracy: 0.10847457627118644


#### Bins

In [117]:
#  split data
X_LR_rp = rawData['description'].values
y_LR_rp_bins = rawData['rp_bins'].values

X_LR_rp_train, X_LR_rp_test, y_LR_rp_bins_train, y_LR_rp_bins_test = train_test_split(X_LR_rp, y_LR_rp_bins, test_size=0.25, random_state=1000)

In [118]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_rp_train)
X_LR_rp_v_train = vectorizer.transform(X_LR_rp_train)
X_LR_rp_v_test  = vectorizer.transform(X_LR_rp_test)

In [119]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_rp_v_train, y_LR_rp_bins_train) # vectorized training data
score_LR_rp_bins = classifier.score(X_LR_rp_v_test, y_LR_rp_values_test)
print("Review Points:","Logistic Regression:","bins:","Accuracy:",score_LR_rp_bins)



Review Points: Logistic Regression: bins: Accuracy: 0.1


### Random forests

In [120]:
# Specific libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

#### Values

In [121]:
#  split data
X_RF_rp = rawData['description'].values
y_RF_rp_values = rawData['review score'].values

X_RF_rp_train, X_RF_rp_test, y_RF_rp_values_train, y_RF_rp_values_test = train_test_split(X_RF_rp, y_RF_rp_values, test_size=0.25, random_state=1000)

In [122]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_rp_train)
X_RF_rp_v_train = vectorizer.transform(X_RF_rp_train)
X_RF_rp_v_test  = vectorizer.transform(X_RF_rp_test)

In [123]:
# Create a random forest classifier
rf_rp = RandomForestClassifier(n_estimators=200)
rf_rp_values = rf_rp.fit(X_RF_rp_v_train, y_RF_rp_values_train)
score_RF_rp_values = rf_rp_values.score(X_RF_rp_v_test, y_RF_rp_values_test)
print("Review Points:","Random forests:","values:","Accuracy:",score_RF_rp_values)

Review Points: Random forests: values: Accuracy: 0.08983050847457627


#### Bins

In [124]:
#  split data
X_RF_rp = rawData['description'].values
y_RF_rp_bins = rawData['rp_bins'].values

X_RF_rp_train, X_RF_rp_test, y_RF_rp_bins_train, y_RF_rp_bins_test = train_test_split(X_RF_rp, y_RF_rp_bins, test_size=0.25, random_state=1000)

In [125]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_rp_train)
X_RF_rp_v_train = vectorizer.transform(X_RF_rp_train)
X_RF_rp_v_test  = vectorizer.transform(X_RF_rp_test)

In [126]:
# Create a random forest classifier
rf_rp = RandomForestClassifier(n_estimators=200)
rf_rp_bins = rf_rp.fit(X_RF_rp_v_train, y_RF_rp_bins_train)
score_RF_rp_bins = rf_rp_bins.score(X_RF_rp_v_test, y_RF_rp_bins_test)
print("Review Points:","Random forests:","bins:","Accuracy:",score_RF_rp_bins)

Review Points: Random forests: bins: Accuracy: 0.4864406779661017


### Naive Bayes - Multinomial
It is used for discrete counts. For example, let’s say, we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.

#### Values

In [127]:
# split data
X_NBM_rp = rawData['description'].values
y_NBM_rp_values = rawData['review score'].values

X_NBM_rp_train, X_NBM_rp_test, y_NBM_rp_values_train, y_NBM_rp_values_test = train_test_split(X_NBM_rp, y_NBM_rp_values, test_size=0.25, random_state=1000)

In [128]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBM_rp_train)
X_NBM_rp_v_train = vectorizer.transform(X_NBM_rp_train)
X_NBM_rp_v_test  = vectorizer.transform(X_NBM_rp_test)

In [129]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model_NBM_rp = MultinomialNB()
# Train the model using the training sets
model_NBM_rp.fit(X_NBM_rp_v_train,y_NBM_rp_values_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_NBM_rp_values_prediction = model_NBM_rp.predict(X_NBM_rp_v_test)
score_NBM_rp_values = metrics.accuracy_score(y_NBM_rp_values_test, y_NBM_rp_values_prediction)
print("Review Points:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_rp_values)

Review Points: Naive Bayes - Multinomial: values: Accuracy: 0.08983050847457627


#### Bins

In [130]:
# split data
X_NBM_rp = rawData['description'].values
y_NBM_rp_bins = rawData['rp_bins'].values

X_NBM_rp_train, X_NBM_rp_test, y_NBM_rp_bins_train, y_NBM_rp_bins_test = train_test_split(X_NBM_rp, y_NBM_rp_bins, test_size=0.25, random_state=1000)

In [131]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBM_rp_train)
X_NBM_rp_v_train = vectorizer.transform(X_NBM_rp_train)
X_NBM_rp_v_test  = vectorizer.transform(X_NBM_rp_test)

In [132]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model_NBM_rp = MultinomialNB()
# Train the model using the training sets
model_NBM_rp.fit(X_NBM_rp_v_train,y_NBM_rp_bins_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_NBM_rp_bins_prediction = model_NBM_rp.predict(X_NBM_rp_v_test)
score_NBM_rp_bins = metrics.accuracy_score(y_NBM_rp_bins_test, y_NBM_rp_bins_prediction)
print("Review Points:","Naive Bayes - Multinomial:","bins:","Accuracy:",score_NBM_rp_bins)

Review Points: Naive Bayes - Multinomial: bins: Accuracy: 0.5050847457627119


### Naive Bayes - Bernoulli
The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

#### Values

In [133]:
# split data
X_NBB_rp = rawData['description'].values
y_NBB_rp_values = rawData['review score'].values

X_NBB_rp_train, X_NBB_rp_test, y_NBB_rp_values_train, y_NBB_rp_values_test = train_test_split(X_NBB_rp, y_NBB_rp_values, test_size=0.25, random_state=1000)

In [134]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBB_rp_train)
X_NBB_rp_v_train = vectorizer.transform(X_NBB_rp_train)
X_NBB_rp_v_test  = vectorizer.transform(X_NBB_rp_test)

In [135]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model_NBB_rp = BernoulliNB()
# Train the model using the training sets
model_NBB_rp.fit(X_NBB_rp_v_train,y_NBB_rp_values_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_NBB_rp_values_prediction = model_NBB_rp.predict(X_NBB_rp_v_test)
score_NBB_rp_values = metrics.accuracy_score(y_NBB_rp_values_test, y_NBB_rp_values_prediction)
print("Review Points:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_rp_values)

Review Points: Naive Bayes - Bernoulli: values: Accuracy: 0.09152542372881356


#### Bins

In [136]:
# split data
X_NBB_rp = rawData['description'].values
y_NBB_rp_bins = rawData['rp_bins'].values

X_NBB_rp_train, X_NBB_rp_test, y_NBB_rp_bins_train, y_NBB_rp_bins_test = train_test_split(X_NBB_rp, y_NBB_rp_bins, test_size=0.25, random_state=1000)

In [137]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBB_rp_train)
X_NBB_rp_v_train = vectorizer.transform(X_NBB_rp_train)
X_NBB_rp_v_test  = vectorizer.transform(X_NBB_rp_test)

In [138]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model_NBB_rp = BernoulliNB()
# Train the model using the training sets
model_NBB_rp.fit(X_NBB_rp_v_train,y_NBB_rp_bins_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_NBB_rp_bins_prediction = model_NBB_rp.predict(X_NBB_rp_v_test)
score_NBB_rp_bins = metrics.accuracy_score(y_NBB_rp_bins_test, y_NBB_rp_bins_prediction)
print("Review Points:","Naive Bayes - Bernoulli:","bins:","Accuracy:",score_NBB_rp_bins)

Review Points: Naive Bayes - Bernoulli: bins: Accuracy: 0.488135593220339


## Price

### Logistic Regresssion

In [139]:
# Specific libraries
from sklearn.linear_model import LogisticRegression

In [140]:
# vectorize description: take the words of each description and create a vocabulary of all the unique words in the descriptions.
# This vocabulary can then be used to create a feature vector of the count of the words:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(rawData['description'])
# vectorizer.vocabulary_

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [141]:
# create vector with all words for each description = Bag-of-words (BOW) model
vectorizer.transform(rawData['description']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Values

In [142]:
#  split data
X_LR_price = rawData['description'].values
y_LR_price_values = rawData['price'].values

X_LR_price_train, X_LR_price_test, y_LR_price_values_train, y_LR_price_values_test = train_test_split(X_LR_price, y_LR_price_values, test_size=0.25, random_state=1000)

In [143]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_price_train)
X_LR_price_v_train = vectorizer.transform(X_LR_price_train)
X_LR_price_v_test  = vectorizer.transform(X_LR_price_test)

In [144]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_price_v_train, y_LR_price_values_train) # vectorized training data
score_LR_price_values = classifier.score(X_LR_price_v_test, y_LR_price_values_test)
print("Price:","Logistic Regression:","values:","Accuracy:",score_LR_price_values)



Price: Logistic Regression: values: Accuracy: 0.0847457627118644


#### Bins

In [145]:
#  split data
X_LR_price = rawData['description'].values
y_LR_price_bins = rawData['price_bins'].values

X_LR_price_train, X_LR_price_test, y_LR_price_bins_train, y_LR_price_bins_test = train_test_split(X_LR_price, y_LR_price_bins, test_size=0.25, random_state=1000)

In [146]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_LR_price_train)
X_LR_price_v_train = vectorizer.transform(X_LR_price_train)
X_LR_price_v_test  = vectorizer.transform(X_LR_price_test)

In [147]:
# logistic regression classification model
classifier = LogisticRegression()
classifier.fit(X_LR_price_v_train, y_LR_price_bins_train) # vectorized training data
score_LR_price_bins = classifier.score(X_LR_price_v_test, y_LR_price_bins_test)
print("Price:","Logistic Regression:","bins:","Accuracy:",score_LR_price_bins)

Price: Logistic Regression: bins: Accuracy: 0.5898305084745763




### Random forests

In [148]:
# Specific libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

#### Values

In [149]:
#  split data
X_RF_price = rawData['description'].values
y_RF_price_values = rawData['price'].values

X_RF_price_train, X_RF_price_test, y_RF_price_values_train, y_RF_price_values_test = train_test_split(X_RF_price, y_RF_price_values, test_size=0.25, random_state=1000)

In [150]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_price_train)
X_RF_price_v_train = vectorizer.transform(X_RF_price_train)
X_RF_price_v_test  = vectorizer.transform(X_RF_price_test)

In [151]:
# Create a random forest classifier
rf_price = RandomForestClassifier(n_estimators=200)
rf_price_values = rf_price.fit(X_RF_price_v_train, y_RF_price_values_train)
score_RF_price_values = rf_price_values.score(X_RF_price_v_test, y_RF_price_values_test)
print("Price:","Random forests:","values:","Accuracy:",score_RF_price_values)

Price: Random forests: values: Accuracy: 0.12372881355932204


#### Bins

In [152]:
#  split data
X_RF_price = rawData['description'].values
y_RF_price_bins = rawData['price_bins'].values

X_RF_price_train, X_RF_price_test, y_RF_price_bins_train, y_RF_price_bins_test = train_test_split(X_RF_price, y_RF_price_bins, test_size=0.25, random_state=1000)

In [153]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_RF_price_train)
X_RF_price_v_train = vectorizer.transform(X_RF_price_train)
X_RF_price_v_test  = vectorizer.transform(X_RF_price_test)

In [154]:
# Create a random forest classifier
rf_price = RandomForestClassifier(n_estimators=200)
rf_price_bins = rf_price.fit(X_RF_price_v_train, y_RF_price_bins_train)
score_RF_price_bins = rf_price_bins.score(X_RF_price_v_test, y_RF_price_bins_test)
print("Price:","Random forests:","bins:","Accuracy:",score_RF_price_bins)

Price: Random forests: bins: Accuracy: 0.5949152542372881


### Naive Bayes - Multinomial
It is used for discrete counts. For example, let’s say, we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.

#### Values

In [155]:
# split data
X_NBM_price = rawData['description'].values
y_NBM_price_values = rawData['price'].values

X_NBM_price_train, X_NBM_price_test, y_NBM_price_values_train, y_NBM_price_values_test = train_test_split(X_NBM_price, y_NBM_price_values, test_size=0.25, random_state=1000)

In [156]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBM_price_train)
X_NBM_price_v_train = vectorizer.transform(X_NBM_price_train)
X_NBM_price_v_test  = vectorizer.transform(X_NBM_price_test)

In [157]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model_NBM_price = MultinomialNB()
# Train the model using the training sets
model_NBM_price.fit(X_NBM_price_v_train,y_NBM_price_values_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_NBM_price_values_prediction = model_NBM_rp.predict(X_NBM_price_v_test)
score_NBM_price_values = metrics.accuracy_score(y_NBM_price_values_test, y_NBM_price_values_prediction)
print("Price:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_price_values)

Price: Naive Bayes - Multinomial: values: Accuracy: 0.03559322033898305


#### Bins

In [158]:
# split data
X_NBM_price = rawData['description'].values
y_NBM_price_bins = rawData['price_bins'].values

X_NBM_price_train, X_NBM_price_test, y_NBM_price_bins_train, y_NBM_price_bins_test = train_test_split(X_NBM_price, y_NBM_price_bins, test_size=0.25, random_state=1000)

In [159]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBM_price_train)
X_NBM_price_v_train = vectorizer.transform(X_NBM_price_train)
X_NBM_price_v_test  = vectorizer.transform(X_NBM_price_test)

In [160]:
from sklearn.naive_bayes import MultinomialNB
#Create a Multinomial Classifier
model_NBM_price = MultinomialNB()
# Train the model using the training sets
model_NBM_price.fit(X_NBM_price_v_train,y_NBM_price_bins_train)
#Predict Output 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_NBM_price_bins_prediction = model_NBM_rp.predict(X_NBM_price_v_test)
score_NBM_price_bins = metrics.accuracy_score(y_NBM_price_bins_test, y_NBM_price_bins_prediction)
print("Price:","Naive Bayes - Multinomial:","bins:","Accuracy:",score_NBM_price_bins)

Price: Naive Bayes - Multinomial: bins: Accuracy: 0.0


### Naive Bayes - Bernoulli
The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

#### Values

In [161]:
# split data
X_NBB_price = rawData['description'].values
y_NBB_price_values = rawData['price'].values

X_NBB_price_train, X_NBB_price_test, y_NBB_price_values_train, y_NBB_price_values_test = train_test_split(X_NBB_price, y_NBB_price_values, test_size=0.25, random_state=1000)

In [162]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBB_price_train)
X_NBB_price_v_train = vectorizer.transform(X_NBB_price_train)
X_NBB_price_v_test  = vectorizer.transform(X_NBB_price_test)

In [163]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model_NBB_price = BernoulliNB()
# Train the model using the training sets
model_NBB_price.fit(X_NBB_price_v_train,y_NBB_price_values_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_NBB_price_values_prediction = model_NBB_price.predict(X_NBB_price_v_test)
score_NBB_price_values = metrics.accuracy_score(y_NBB_price_values_test, y_NBB_price_values_prediction)
print("Price Points:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_price_values)

Price Points: Naive Bayes - Bernoulli: values: Accuracy: 0.1


#### Bins

In [164]:
# split data
X_NBB_price = rawData['description'].values
y_NBB_price_bins = rawData['price_bins'].values

X_NBB_price_train, X_NBB_price_test, y_NBB_price_bins_train, y_NBB_price_bins_test = train_test_split(X_NBB_price, y_NBB_price_bins, test_size=0.25, random_state=1000)

In [165]:
# vectorize training descriptions
vectorizer = CountVectorizer()
vectorizer.fit(X_NBB_price_train)
X_NBB_price_v_train = vectorizer.transform(X_NBB_price_train)
X_NBB_price_v_test  = vectorizer.transform(X_NBB_price_test)

In [166]:
from sklearn.naive_bayes import BernoulliNB
#Create a BernoulliNB Classifier
model_NBB_price = BernoulliNB()
# Train the model using the training sets
model_NBB_price.fit(X_NBB_price_v_train,y_NBB_price_bins_train)
#Predict Output 
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_NBB_price_bins_prediction = model_NBB_price.predict(X_NBB_price_v_test)
score_NBB_price_bins = metrics.accuracy_score(y_NBB_price_bins_test, y_NBB_price_bins_prediction)
print("Price Points:","Naive Bayes - Bernoulli:","bins:","Accuracy:",score_NBB_price_bins)

Price Points: Naive Bayes - Bernoulli: bins: Accuracy: 0.6067796610169491


In [167]:
# Summary
print("Review Points:","Logistic Regression:","values:","Accuracy:",score_LR_rp_values)
print("Review Points:","Logistic Regression:","bins:","Accuracy:",score_LR_rp_bins)

print("Review Points:","Random forests:","values:","Accuracy:",score_RF_rp_values)
print("Review Points:","Random forests:","bins:","Accuracy:",score_RF_rp_bins)

print("Review Points:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_rp_values)
print("Review Points:","Naive Bayes - Multinomial:","bins:","Accuracy:",score_NBM_rp_bins)

print("Review Points:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_rp_values)
print("Review Points:","Naive Bayes - Bernoulli:","bins:","Accuracy:",score_NBB_rp_bins)

print("Price:","Logistic Regression:","values:","Accuracy:",score_LR_price_values)
print("Price:","Logistic Regression:","bins:","Accuracy:",score_LR_price_bins)

print("Price:","Random forests:","values:","Accuracy:",score_RF_price_values)
print("Price:","Random forests:","bins:","Accuracy:",score_RF_price_bins)

print("Price:","Naive Bayes - Multinomial:","values:","Accuracy:",score_NBM_price_values)
print("Price:","Naive Bayes - Multinomial:","bins:","Accuracy:",score_NBM_price_bins)

print("Price:","Naive Bayes - Bernoulli:","values:","Accuracy:",score_NBB_price_values)
print("Price:","Naive Bayes - Bernoulli:","bins:","Accuracy:",score_NBB_price_bins)

Review Points: Logistic Regression: values: Accuracy: 0.10847457627118644
Review Points: Logistic Regression: bins: Accuracy: 0.1
Review Points: Random forests: values: Accuracy: 0.08983050847457627
Review Points: Random forests: bins: Accuracy: 0.4864406779661017
Review Points: Naive Bayes - Multinomial: values: Accuracy: 0.08983050847457627
Review Points: Naive Bayes - Multinomial: bins: Accuracy: 0.5050847457627119
Review Points: Naive Bayes - Bernoulli: values: Accuracy: 0.09152542372881356
Review Points: Naive Bayes - Bernoulli: bins: Accuracy: 0.488135593220339
Price: Logistic Regression: values: Accuracy: 0.0847457627118644
Price: Logistic Regression: bins: Accuracy: 0.5898305084745763
Price: Random forests: values: Accuracy: 0.12372881355932204
Price: Random forests: bins: Accuracy: 0.5949152542372881
Price: Naive Bayes - Multinomial: values: Accuracy: 0.03559322033898305
Price: Naive Bayes - Multinomial: bins: Accuracy: 0.0
Price: Naive Bayes - Bernoulli: values: Accuracy: 0.1