In [9]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [164]:
df = pd.read_csv('Whiskey_Advocate_All_scraped.csv', encoding='iso-8859-1')#[['category', 'Description']]
df = df.replace(True, 1).replace(False, 0).dropna(how = 'all')
df.tail()

Unnamed: 0,row,name,name_short,year,age,ABV,American,Blended,Bourbon,Canadian,...,Scotch,Single Blended Grain,Single Blended Malt,World,review.point,price,currency,style,description,country
4750,8243.0,"Beach Whiskey, 40%",Beach Whiskey,,,40.00%,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,72.0,28,$,Craft Whiskey,There are flavoredÑcinnamon and coconutÑversio...,
4751,8245.0,Peaden Brothers Genuine Corn Whiskey Moonshine...,Peaden Brothers,,,50.00%,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,71.0,30,$,Craft Whiskey,"The nose starts with sweet corn, cornmeal, yea...",
4752,8247.0,"Kansas Clean Distilled, 40%",Kansas Clean Distilled,,,40.00%,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,70.0,30,$,Craft Whiskey,"Clear, slight golden hue; as expected from a 7...",
4753,8248.0,"Rogue Dead Guy Whiskey, 40%",Rogue,,,40.00%,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,69.0,40,$,American Whiskey (Unspecified),"I love Rogue beers, as well as RogueÕs attitud...",US
4754,8249.0,"Iowa Legendary Rye, 40%",Iowa Legendary,,,40.00%,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.0,35,$,Craft Whiskey,"An unaged whiskey from Carroll County, Iowa, w...",


categories = df['category'].unique()
categories

In [145]:
categories = df.columns[6:18]

In [146]:
X = df['description']
y = df[categories]

In [147]:
df.columns

Index(['row', 'name', 'name_short', 'year', 'age', 'ABV', 'American',
       'Blended', 'Bourbon', 'Canadian', 'Flavored', 'Irish', 'Japanese',
       'Other', 'Rye', 'Scotch', 'Single Blended Grain', 'Single Blended Malt',
       'World', 'review.point', 'price', 'currency', 'style', 'description',
       'country'],
      dtype='object')

In [166]:
dummies = pd.get_dummies(df['category'])
df_onehot = dummies.join(df).drop(columns = 'category')
df_onehot.head()

ValueError: columns overlap but no suffix specified: Index(['Canadian', 'Irish', 'Japanese'], dtype='object')

In [148]:
cat_list = list(categories)
cat_list.append('description')
df[cat_list].ehad()

Unnamed: 0,American,Blended,Bourbon,Canadian,Flavored,Irish,Japanese,Other,Rye,Scotch,Single Blended Grain,Single Blended Malt,description
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"Magnificently powerful and intense. Caramels, ..."
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,What impresses me most is how this whisky evol...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,There have been some legendary Bowmores from t...
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,With a name inspired by a 1926 Buster Keaton m...
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,"Captivating, enticing, and wonderfully charmin..."


In [149]:
train, test = train_test_split(df[cat_list], random_state=42, test_size=0.33, shuffle=True)
X_train = train.description
X_test = test.description
print(X_train.shape)
print(X_test.shape)

(3185,)
(1570,)


In [150]:
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

In [151]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

... Processing American
Test accuracy is 0.7859872611464969
... Processing Blended
Test accuracy is 0.8834394904458599
... Processing Bourbon
Test accuracy is 0.8757961783439491
... Processing Canadian
Test accuracy is 0.9464968152866242
... Processing Flavored
Test accuracy is 0.986624203821656
... Processing Irish
Test accuracy is 0.9471337579617835
... Processing Japanese
Test accuracy is 0.9828025477707006
... Processing Other
Test accuracy is 0.9923566878980892
... Processing Rye
Test accuracy is 0.9681528662420382
... Processing Scotch
Test accuracy is 0.8535031847133758
... Processing Single Blended Grain
Test accuracy is 0.9821656050955414
... Processing Single Blended Malt
Test accuracy is 0.8910828025477707


In [155]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

In [156]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))


... Processing American
Test accuracy is 0.9414012738853503
... Processing Blended
Test accuracy is 0.9159235668789809
... Processing Bourbon
Test accuracy is 0.9286624203821656
... Processing Canadian
Test accuracy is 0.9796178343949045
... Processing Flavored
Test accuracy is 0.9872611464968153
... Processing Irish
Test accuracy is 0.9573248407643312
... Processing Japanese
Test accuracy is 0.9853503184713376
... Processing Other
Test accuracy is 0.9923566878980892
... Processing Rye
Test accuracy is 0.9687898089171975
... Processing Scotch
Test accuracy is 0.8636942675159236
... Processing Single Blended Grain
Test accuracy is 0.9821656050955414
... Processing Single Blended Malt
Test accuracy is 0.8968152866242038


In [157]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))


... Processing American
Test accuracy is 0.9178343949044586
... Processing Blended
Test accuracy is 0.8853503184713376
... Processing Bourbon
Test accuracy is 0.9044585987261147
... Processing Canadian
Test accuracy is 0.9509554140127389
... Processing Flavored
Test accuracy is 0.986624203821656
... Processing Irish
Test accuracy is 0.9490445859872612
... Processing Japanese
Test accuracy is 0.9828025477707006
... Processing Other
Test accuracy is 0.9923566878980892
... Processing Rye
Test accuracy is 0.9687898089171975
... Processing Scotch
Test accuracy is 0.8503184713375797
... Processing Single Blended Grain
Test accuracy is 0.9821656050955414
... Processing Single Blended Malt
Test accuracy is 0.8974522292993631
