In [4]:
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [5]:
# Load the whisky review dataset (CSV expected in the working directory)
# and preview its first rows.
csv_path = os.path.join("FD test NLP.csv")
rawData = pd.read_csv(csv_path)
rawData.head()

Unnamed: 0.1,Unnamed: 0,name,category,review.point,price,currency,description
0,1,"Johnnie Walker Blue Label, 40%",Blended Scotch Whisky,97,225,$,"Magnificently powerful and intense. Caramels, ..."
1,2,"Black Bowmore, 1964 vintage, 42 year old, 40.5%",Single Malt Scotch,97,4500,$,What impresses me most is how this whisky evol...
2,3,"Bowmore 46 year old (distilled 1964), 42.9%",Single Malt Scotch,97,13500,$,There have been some legendary Bowmores from t...
3,4,"Compass Box The General, 53.4%",Blended Malt Scotch Whisky,96,325,$,With a name inspired by a 1926 Buster Keaton m...
4,5,"Chivas Regal Ultis, 40%",Blended Malt Scotch Whisky,96,160,$,"Captivating, enticing, and wonderfully charmin..."


In [6]:
# Target: the review score. Features: the free-text description.
# (Alternative target that was tried: rawData['price'].values)
y = rawData['review.point'].values
descriptions = rawData['description'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
(
    descriptions_train,
    descriptions_test,
    y_train,
    y_test,
) = train_test_split(
    descriptions,
    y,
    test_size=0.25,
    random_state=1000,
)

In [7]:
# Learn the bag-of-words vocabulary from the training descriptions only and
# encode them in the same step; the test descriptions are then encoded with
# that already-fitted vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(descriptions_train)
X_test = vectorizer.transform(descriptions_test)

## Naive Bayes
Multinomial: Used for discrete counts. In a text classification problem, for example, this goes one step beyond Bernoulli trials: instead of recording whether a word occurs in the document, we count how often it occurs. You can think of each feature as “the number of times outcome x_i is observed over the n trials”.

Bernoulli: The Bernoulli model is useful if your feature vectors are binary (i.e. zeros and ones). One application is text classification with a ‘bag of words’ model, where the 1s and 0s mean “word occurs in the document” and “word does not occur in the document” respectively.

## Multinomial

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the word-count features.
# The targets in y_train are the review scores, so each distinct score is
# used as a class label.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict a score for every held-out description. As the last expression
# in the cell, the prediction array is what gets displayed.
model.predict(X_test)

array([85, 87, 86, 85, 87, 86, 89, 87, 88, 86, 89, 85, 85, 89, 88, 87, 86,
       86, 89, 89, 86, 86, 84, 87, 86, 86, 89, 87, 86, 90, 87, 87, 86, 87,
       86, 90, 87, 86, 87, 86, 87, 86, 90, 86, 88, 88, 86, 86, 86, 87, 86,
       89, 89, 85, 86, 86, 87, 86, 87, 88, 87, 90, 86, 85, 85, 87, 85, 87,
       86, 87, 85, 86, 90, 87, 89, 86, 86, 86, 90, 86, 88, 87, 87, 88, 86,
       87, 90, 86, 87, 87, 86, 88, 87, 89, 89, 89, 89, 87, 86, 87, 86, 86,
       89, 86, 87, 86, 89, 88, 90, 89, 86, 86, 87, 83, 89, 89, 86, 86, 87,
       84, 87, 88, 86, 86, 86, 86, 89, 86, 88, 85, 88, 89, 86, 89, 88, 88,
       88, 89, 87, 87, 90, 89, 89, 86, 87, 87, 87, 89, 90, 84, 86, 87, 87,
       89, 87, 89, 89, 87, 87, 87, 87, 89, 90, 86, 87, 88, 86, 87, 89, 87,
       86, 87, 86, 87, 89, 85, 87, 87, 87, 87, 86, 88, 88, 86, 86, 90, 86,
       90, 87, 87, 89, 87, 88, 87, 88, 88, 87, 88, 88, 86, 87, 87, 85, 86,
       88, 87, 88, 85, 85, 86, 90, 89, 87, 85, 86, 86, 86, 88, 87, 87, 86,
       87, 86, 87, 87, 88

## Bernoulli

In [14]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the same features.
# NOTE: this rebinds `model`, replacing the classifier fitted in the
# multinomial cell.
model = BernoulliNB()
model.fit(X_train, y_train)

# Predict a score for every held-out description. As the last expression
# in the cell, the prediction array is what gets displayed.
model.predict(X_test)

array([87, 87, 86, 87, 87, 86, 87, 87, 87, 86, 86, 87, 87, 87, 88, 87, 86,
       86, 89, 89, 86, 87, 86, 87, 86, 86, 89, 87, 86, 86, 87, 87, 86, 87,
       86, 87, 87, 86, 87, 86, 87, 86, 86, 86, 87, 87, 86, 86, 86, 87, 86,
       87, 87, 85, 86, 86, 87, 86, 87, 87, 87, 86, 87, 87, 85, 87, 86, 87,
       86, 87, 87, 86, 87, 87, 87, 86, 86, 86, 86, 87, 87, 87, 87, 86, 86,
       87, 87, 87, 87, 87, 86, 86, 87, 86, 87, 87, 89, 87, 86, 87, 86, 86,
       87, 86, 87, 86, 88, 87, 90, 86, 86, 86, 87, 87, 87, 89, 86, 86, 87,
       87, 87, 88, 86, 86, 86, 86, 87, 86, 87, 85, 86, 86, 86, 87, 87, 86,
       88, 87, 87, 87, 87, 87, 87, 86, 87, 87, 87, 87, 87, 86, 86, 87, 87,
       87, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 87, 86, 86, 87, 87, 87,
       86, 87, 86, 87, 87, 86, 87, 87, 87, 87, 86, 87, 88, 86, 86, 87, 86,
       86, 87, 87, 87, 87, 88, 87, 87, 87, 87, 87, 88, 86, 87, 87, 86, 87,
       87, 87, 87, 86, 87, 86, 87, 87, 87, 86, 86, 86, 86, 87, 87, 87, 86,
       87, 86, 87, 87, 87