# Prediction of type of wine based on review/comment using Naive Bayes
https://www.toptal.com/machine-learning/nlp-tutorial-text-classification

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
df = pd.read_csv('data/wine_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
counter = Counter(df['variety'].tolist())
print(type(counter))
print(counter)

<class 'collections.Counter'>
Counter({'Pinot Noir': 13272, 'Chardonnay': 11753, 'Cabernet Sauvignon': 9472, 'Red Blend': 8946, 'Bordeaux-style Red Blend': 6915, 'Riesling': 5189, 'Sauvignon Blanc': 4967, 'Syrah': 4142, 'Rosé': 3564, 'Merlot': 3102, 'Nebbiolo': 2804, 'Zinfandel': 2714, 'Sangiovese': 2707, 'Malbec': 2652, 'Portuguese Red': 2466, 'White Blend': 2360, 'Sparkling Blend': 2153, 'Tempranillo': 1810, 'Rhône-style Red Blend': 1471, 'Pinot Gris': 1455, 'Champagne Blend': 1396, 'Cabernet Franc': 1353, 'Grüner Veltliner': 1345, 'Portuguese White': 1159, 'Bordeaux-style White Blend': 1066, 'Pinot Grigio': 1052, 'Gamay': 1025, 'Gewürztraminer': 1012, 'Viognier': 996, 'Shiraz': 836, 'Petite Sirah': 770, 'Sangiovese Grosso': 751, 'Barbera': 721, 'Glera': 709, 'Port': 668, 'Grenache': 651, 'Corvina, Rondinella, Molinara': 619, 'Chenin Blanc': 591, 'Tempranillo Blend': 588, 'Carmenère': 575, 'Albariño': 477, 'Pinot Blanc': 442, 'Rhône-style White Blend': 425, "Nero d'Avola": 365, 'Agli

In [5]:
top_10_varieties = {i[0]: idx for idx, i in enumerate(counter.most_common(10))}

In [6]:
a = counter.most_common(10)
print(a)
print('---------')
print(top_10_varieties)

[('Pinot Noir', 13272), ('Chardonnay', 11753), ('Cabernet Sauvignon', 9472), ('Red Blend', 8946), ('Bordeaux-style Red Blend', 6915), ('Riesling', 5189), ('Sauvignon Blanc', 4967), ('Syrah', 4142), ('Rosé', 3564), ('Merlot', 3102)]
---------
{'Pinot Noir': 0, 'Chardonnay': 1, 'Cabernet Sauvignon': 2, 'Red Blend': 3, 'Bordeaux-style Red Blend': 4, 'Riesling': 5, 'Sauvignon Blanc': 6, 'Syrah': 7, 'Rosé': 8, 'Merlot': 9}


In [7]:
df = df[df['variety'].map(lambda x: x in top_10_varieties)]

In [8]:
df['variety'].map(lambda x: x in top_10_varieties).head()

3     True
4     True
10    True
12    True
14    True
Name: variety, dtype: bool

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
12,12,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini
14,14,US,Building on 150 years and six generations of w...,,87,12.0,California,Central Coast,Central Coast,Matt Kettmann,@mattkettmann,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou


In [10]:
description_list = df['description'].tolist()

In [11]:
print(description_list)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
print(description_list[:3])
print('----------')
print(description_list[0])
print(description_list[1])
print(description_list[2])

['Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.', "Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.", 'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.']
----------
Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.
Much like the regular bottling from 2012, this comes across as rather rough and tanni

In [13]:
varietal_list = [top_10_varieties[i] for i in df['variety'].tolist()]
print(varietal_list)

[5, 0, 2, 2, 1, 5, 3, 0, 9, 0, 3, 3, 3, 6, 0, 2, 0, 6, 9, 3, 5, 6, 3, 3, 1, 1, 0, 2, 6, 1, 1, 4, 3, 1, 2, 2, 4, 5, 1, 0, 8, 5, 9, 3, 5, 4, 5, 5, 1, 3, 3, 3, 2, 3, 4, 1, 7, 3, 5, 1, 2, 2, 4, 2, 2, 4, 5, 2, 0, 2, 2, 5, 2, 0, 7, 7, 5, 3, 5, 3, 2, 9, 0, 5, 5, 2, 0, 6, 3, 7, 1, 1, 0, 1, 0, 3, 9, 1, 3, 2, 3, 1, 2, 7, 1, 2, 1, 3, 4, 1, 1, 0, 5, 3, 0, 7, 2, 2, 1, 1, 3, 1, 6, 9, 0, 4, 2, 0, 0, 3, 2, 0, 3, 3, 0, 7, 1, 3, 3, 2, 2, 0, 0, 3, 0, 5, 0, 2, 4, 2, 2, 2, 3, 1, 1, 0, 7, 2, 2, 9, 2, 1, 3, 0, 8, 8, 0, 1, 5, 1, 1, 5, 1, 0, 5, 1, 2, 1, 0, 1, 2, 6, 6, 6, 9, 2, 1, 6, 9, 6, 0, 2, 1, 6, 1, 9, 0, 6, 7, 3, 7, 1, 2, 5, 3, 4, 9, 8, 7, 6, 5, 4, 2, 2, 1, 1, 1, 0, 0, 1, 5, 3, 1, 0, 5, 5, 0, 0, 2, 3, 4, 4, 0, 2, 3, 1, 1, 0, 8, 8, 6, 7, 3, 1, 2, 9, 0, 0, 0, 0, 0, 0, 7, 1, 0, 0, 7, 0, 0, 2, 5, 4, 2, 0, 0, 0, 0, 2, 3, 2, 1, 0, 1, 8, 8, 2, 0, 7, 0, 0, 7, 3, 2, 2, 1, 7, 4, 8, 4, 4, 8, 4, 8, 9, 0, 1, 6, 1, 1, 5, 2, 5, 2, 6, 2, 3, 3, 6, 0, 1, 9, 1, 0, 3, 3, 3, 0, 0, 5, 5, 1, 1, 2, 0, 5, 2, 5, 5, 5, 0, 1, 2, 2, 

In [14]:
varietal_list = np.array(varietal_list)
print(varietal_list)

[5 0 2 ... 2 5 0]


In [15]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(description_list)
print(x_train_counts)

  (0, 8226)	1
  (0, 18626)	1
  (0, 1595)	1
  (0, 19275)	1
  (0, 21558)	1
  (0, 23242)	1
  (0, 9284)	1
  (0, 12760)	1
  (0, 9762)	1
  (0, 6756)	1
  (0, 10358)	1
  (0, 14537)	1
  (0, 14349)	1
  (0, 23549)	1
  (0, 14687)	1
  (0, 13694)	1
  (0, 2400)	1
  (0, 11126)	1
  (0, 15061)	1
  (0, 1465)	1
  (0, 21294)	2
  (0, 14541)	1
  (0, 19945)	1
  (0, 2536)	1
  (0, 14692)	1
  :	:
  (71321, 10242)	1
  (71321, 13123)	1
  (71321, 17243)	1
  (71321, 9282)	1
  (71321, 809)	1
  (71321, 4567)	1
  (71321, 23375)	1
  (71321, 2735)	1
  (71321, 6742)	1
  (71321, 1837)	1
  (71321, 11154)	1
  (71321, 8844)	1
  (71321, 4566)	1
  (71321, 8358)	1
  (71321, 4105)	1
  (71321, 1604)	1
  (71321, 19449)	1
  (71321, 11145)	1
  (71321, 1513)	2
  (71321, 13829)	1
  (71321, 21558)	1
  (71321, 14537)	1
  (71321, 23549)	1
  (71321, 11126)	2
  (71321, 1153)	2


In [16]:
# print(x_train_counts.toarray())

In [17]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
print(type(x_train_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [18]:
print(x_train_tfidf)

  (0, 23549)	0.055875471541712726
  (0, 23242)	0.19325840238801922
  (0, 21558)	0.0793303075658288
  (0, 21294)	0.09602880557015973
  (0, 19945)	0.238962985535226
  (0, 19275)	0.1698785378735431
  (0, 18626)	0.31306827387936265
  (0, 17659)	0.2257338478465582
  (0, 15777)	0.23655033124089098
  (0, 15708)	0.1729996166253862
  (0, 15061)	0.0935384030565263
  (0, 14692)	0.16387172322894078
  (0, 14687)	0.21952508729936226
  (0, 14541)	0.1757807127143178
  (0, 14537)	0.05008417182636346
  (0, 14349)	0.11868786554725343
  (0, 13694)	0.1407473117845848
  (0, 12760)	0.21936288700026169
  (0, 12046)	0.14492877735867943
  (0, 11126)	0.06360897623275127
  (0, 10358)	0.18621274642158622
  (0, 9762)	0.2553812698551216
  (0, 9284)	0.2014378581128622
  (0, 8226)	0.09320226879297053
  (0, 6756)	0.3333291312883109
  :	:
  (71321, 13829)	0.14551572331806495
  (71321, 13123)	0.21854648240272462
  (71321, 11154)	0.09323837774431191
  (71321, 11145)	0.04984300251511684
  (71321, 11126)	0.09889029643549949

In [19]:
train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, varietal_list, test_size=0.3)

In [30]:
print(test_x.shape)
print(type(test_x))

(21397, 23888)
<class 'scipy.sparse.csr.csr_matrix'>


In [21]:
clf = MultinomialNB().fit(train_x, train_y)
y_score = clf.predict(test_x)

In [22]:
print(y_score)

[0 3 2 ... 4 0 6]


In [23]:
n_right = 0
for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        n_right += 1

In [24]:
print("Accuracy: %.2f%%" % ((n_right/float(len(test_y)) * 100)))

Accuracy: 63.68%


In [25]:
sentence = ['Pineapple rind, lemon pith and orange blossom','Soft, supple plum envelopes ']

# fit_tranform use to define size of matrix and develop matrix
# Whereas transform is use to develop matrix based on matrix previously developed using fit_transform
# This help me in solving dimension issue or we can say length of input to model will be same.
# example 
#count_vect1 = CountVectorizer()
#x = count_vect1.transform(sentence)
#print(x)
#print(x.toarray())
#print(count_vect1.get_feature_names())
#this will error as we have form the matrix using fit_transform
#whereas
#count_vect1 = CountVectorizer()
#x = count_vect1.fit_transform(sentence)
#print(x)
#print(x.toarray())
#print(count_vect1.get_feature_names())
#this run ok and form and develop matrix

In [26]:
x = count_vect.transform(sentence)
print(x)
print(x.toarray())
print(count_vect.get_feature_names())

  (0, 1153)	1
  (0, 2536)	1
  (0, 12046)	1
  (0, 14692)	1
  (0, 15708)	1
  (0, 15777)	1
  (0, 17659)	1
  (1, 7403)	1
  (1, 15907)	1
  (1, 19449)	1
  (1, 20619)	1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
xinput = tfidf_transformer.transform(x)
print(xinput.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
print(test_x.shape)
print(train_x.shape)
print(xinput.shape)

(21397, 23888)
(49925, 23888)
(2, 23888)


In [29]:
y_pred = clf.predict(xinput)
print(y_pred)

[1 2]
