In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Cleaned Data with States.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Category,Points,Price,Review,Wine,Winery,Year,Variety,State
0,0,14.9,"Russian River Valley, Sonoma, California, US",Red,87,42.0,"Baked plum, licorice and lavender aromas and f...",V. Sattui 2015 Gilsson Vineyard Old Vine Zinfa...,V. Sattui,2015.0,Zinfandel,California
1,1,14.5,"Central Coast, Central Coast, California, US",Red,87,19.98,Pomegranate and light baking-spice aromas show...,Wente 2014 Coastal Selection Pinot Noir (Centr...,Wente,2014.0,Pinot Noir,California
2,2,13.8,California Republic,Red,86,15.0,Butter and vanilla notes dominate the jammy fr...,California Republic 2016 Cabernet Sauvignon (C...,California Republic,2016.0,Cabernet Sauvignon,California
3,3,13.8,"Lodi, Central Valley, California, US",Red,86,10.0,An aroma like toasted almonds and wood smoke m...,Collier Creek 2015 Red Wagon Pinot Noir (Lodi),Collier Creek,2015.0,Pinot Noir,California
4,4,13.0,"New Jersey, US",Red,86,27.0,Plum skin and pomegranate lead the nose while ...,DiLuca 2016 Rosso Black Label Red,DiLuca,2016.0,Red Blend,


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

In [3]:
len(df)

4768

In [4]:
df.Variety.describe()

count           4768
unique           117
top       Pinot Noir
freq            1058
Name: Variety, dtype: object

In [5]:

# df.isna().sum()

In [6]:
df.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Category', 'Points', 'Price',
       'Review', 'Wine', 'Winery', 'Year', 'Variety', 'State'],
      dtype='object')

In [7]:
# region_2 and twitter handle have the most null values so dropping the column
df_cleaned1 = df.drop(['Unnamed: 0', 'Alcohol', 'Appellation', 'Points', 'Price', 'Wine', 'Winery', 'Year', 'Category', 'State'],axis=1)
df_cleaned1

Unnamed: 0,Review,Variety
0,"Baked plum, licorice and lavender aromas and f...",Zinfandel
1,Pomegranate and light baking-spice aromas show...,Pinot Noir
2,Butter and vanilla notes dominate the jammy fr...,Cabernet Sauvignon
3,An aroma like toasted almonds and wood smoke m...,Pinot Noir
4,Plum skin and pomegranate lead the nose while ...,Red Blend
5,This is a big ripe red wine moderate in struct...,Pinot Noir
6,This wine is a blend of 48% Cabernet Sauvignon...,Red Blend
7,"Honey-dipped pineapple and guava, with a touch...",Sauvignon Blanc
8,"Thick and oaky, this estate wine evokes carame...",Chardonnay
9,"With an ashy undercurrent of char and smoke, t...",Sangiovese


In [8]:
list(df_cleaned1.columns.values)

['Review', 'Variety']

## Word Vectorization

In [9]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import nltk
np.random.seed(0)

In [10]:
df_cleaned1.Review

0       Baked plum, licorice and lavender aromas and f...
1       Pomegranate and light baking-spice aromas show...
2       Butter and vanilla notes dominate the jammy fr...
3       An aroma like toasted almonds and wood smoke m...
4       Plum skin and pomegranate lead the nose while ...
5       This is a big ripe red wine moderate in struct...
6       This wine is a blend of 48% Cabernet Sauvignon...
7       Honey-dipped pineapple and guava, with a touch...
8       Thick and oaky, this estate wine evokes carame...
9       With an ashy undercurrent of char and smoke, t...
10      Dusty with highlights of cedar and sage, this ...
11      Yellow apple and sweet oak aromas make for a p...
12      This is a full-bodied, boldly ripe Cabernet Sa...
13      Aromas of ripe blueberry and blackberry mingle...
14      Brawny fruit is overwhelmed by smoke and charr...
15      Pear, butter and vanilla aromas lead to a soft...
16      This is an easygoing, fruity wine that shows s...
17      This h

In [11]:
experiment_line= str(df_cleaned1.Review[2:3].values)
experiment_line

"['Butter and vanilla notes dominate the jammy fruit flavors in this rich and seemingly sweet wine. It is medium bodied, lightly tannic and heavily influenced by oaky aromas and flavors, including baking spices and maple syrup.']"

In [12]:
# experiment_line= df_cleaned4.description[2:3].to_string()

# clean_line = []
# # for word in experiment_line:

pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
line_tokens_raw = nltk.regexp_tokenize(experiment_line, pattern)
print(line_tokens_raw)    
    

['Butter', 'and', 'vanilla', 'notes', 'dominate', 'the', 'jammy', 'fruit', 'flavors', 'in', 'this', 'rich', 'and', 'seemingly', 'sweet', 'wine', 'It', 'is', 'medium', 'bodied', 'lightly', 'tannic', 'and', 'heavily', 'influenced', 'by', 'oaky', 'aromas', 'and', 'flavors', 'including', 'baking', 'spices', 'and', 'maple', 'syrup']


In [13]:
reviews = str(df_cleaned1.Review[0:20].values)

In [14]:
reviews

'[\'Baked plum, licorice and lavender aromas and flavors define this jammy wine, which takes on a sweetness of oak. Thick on the midpalate, it has substantial weight from the tannins and powerfully ripe fruit.\'\n \'Pomegranate and light baking-spice aromas show on the mellow nose of this bottling. The palate is clean with fruit punch and spice-cake flavors wrapped in a fairly mellow texture.\'\n \'Butter and vanilla notes dominate the jammy fruit flavors in this rich and seemingly sweet wine. It is medium bodied, lightly tannic and heavily influenced by oaky aromas and flavors, including baking spices and maple syrup.\'\n "An aroma like toasted almonds and wood smoke meets spiced plum and red-cherry flavors in this medium-bodied wine. It\'s an oaky version that maintains pretty good balance."\n \'Plum skin and pomegranate lead the nose while white pepper and raw oak take on an earthy edge. The soft-bodied palate is awash in silky tannins, with intense flavors of vanilla, oak and green

In [15]:
# categories = ['Red', 'White', 'Orange']

In [16]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import *
stemmer = SnowballStemmer("english")

In [17]:
data = df_cleaned1
data

Unnamed: 0,Review,Variety
0,"Baked plum, licorice and lavender aromas and f...",Zinfandel
1,Pomegranate and light baking-spice aromas show...,Pinot Noir
2,Butter and vanilla notes dominate the jammy fr...,Cabernet Sauvignon
3,An aroma like toasted almonds and wood smoke m...,Pinot Noir
4,Plum skin and pomegranate lead the nose while ...,Red Blend
5,This is a big ripe red wine moderate in struct...,Pinot Noir
6,This wine is a blend of 48% Cabernet Sauvignon...,Red Blend
7,"Honey-dipped pineapple and guava, with a touch...",Sauvignon Blanc
8,"Thick and oaky, this estate wine evokes carame...",Chardonnay
9,"With an ashy undercurrent of char and smoke, t...",Sangiovese


In [18]:
numpy_array = data.as_matrix()
numpy_array

  """Entry point for launching an IPython kernel.


array([['Baked plum, licorice and lavender aromas and flavors define this jammy wine, which takes on a sweetness of oak. Thick on the midpalate, it has substantial weight from the tannins and powerfully ripe fruit.',
        'Zinfandel'],
       ['Pomegranate and light baking-spice aromas show on the mellow nose of this bottling. The palate is clean with fruit punch and spice-cake flavors wrapped in a fairly mellow texture.',
        'Pinot Noir'],
       ['Butter and vanilla notes dominate the jammy fruit flavors in this rich and seemingly sweet wine. It is medium bodied, lightly tannic and heavily influenced by oaky aromas and flavors, including baking spices and maple syrup.',
        'Cabernet Sauvignon'],
       ...,
       ['This is a microscopic production, which shows itself to be medium bodied to full, with soft moderate structure. Dried cherry, raspberry and dried citrus round out the approachable, likable fruit-forwardness of the layered wine.',
        'Pinot Noir'],
      

In [19]:
X = numpy_array[:,0]
Y = numpy_array[:,1]
Y

array(['Zinfandel', 'Pinot Noir', 'Cabernet Sauvignon', ..., 'Pinot Noir',
       'Pinot Noir', 'Merlot'], dtype=object)

In [20]:
type(Y)

numpy.ndarray

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 


In [22]:

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape


(4768, 6721)

In [23]:
count_vect.vocabulary_.get(u'algorithm')

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(4768, 6721)

In [25]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(4768, 6721)

In [26]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y)

In [27]:
target_names = list(set(Y))
target_names

['Cabernet Sauvignon',
 'Moscato',
 'Elephant Mountain Vineyards',
 'Barbera',
 'Aglianico',
 'Rosato',
 'Petite Syrah',
 'Primitivo',
 'Petite',
 'Falanghina',
 'Gamay',
 'Pinot Grigio',
 'Aria White Port',
 'Torrontés',
 'Rosé',
 "Nero d'Avola",
 'Tannat',
 'Spirit Canyon Vineyard Arneis',
 'Red',
 'Mourvèdre',
 'White',
 'Trousseau',
 'Other White',
 'Gewurztraminer',
 'Petit Manseng',
 'Gewürztraminer',
 'Tempranillo',
 'Chenin',
 'Monastrell',
 'Aleatico',
 'Albariño',
 'Sangiovese',
 'Pinot Blanc',
 'Rkatsiteli',
 'Claudia',
 'Valdiguié',
 'Porton Norton',
 'Zinfandel',
 'Cabernet',
 'Verdejo',
 'Grenache Blanc',
 'Port',
 'Grenache Noir',
 'Vidal Blanc',
 'Auxerrois',
 'Verdelho',
 'Sauvignon Blanc',
 'Pinotage',
 'Swing',
 'Semillon',
 'Sin So Cinsault',
 'Pinot Noir',
 'Müller-Thurgau',
 'Picpoul',
 'Merlot',
 'Pinot Gris',
 'Sylvaner',
 'Malbec',
 'Sémillon',
 'Melon de Bourgogne',
 'Cabernet Franc',
 'Cinsault',
 'Teroldego',
 'Mataro',
 'Dolcetto',
 'Gamay Noir',
 'Malvasia

In [28]:
len(target_names)

117

## Build a Pipeline

In [29]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),])

In [30]:
text_clf.fit(X_train, Y_train) 

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [31]:
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.4056603773584906

## Use a SVM for better Accuracy

In [32]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),])


In [33]:
text_clf.fit(X_train, Y_train) 
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)



0.5482180293501048

In [34]:
d = {'X_test': X_test, 'Y_test': Y_test,'Predicted': predicted}
results = pd.DataFrame(data=d)
results

Unnamed: 0,X_test,Y_test,Predicted
0,"Savory aromas of pepper, roast beef and hearty...",Syrah,Syrah
1,"Prominent aromas of pear, peach and flower are...",White Blend,Chardonnay
2,"Aged in neutral puncheons, this wine has aroma...",Grenache,Merlot
3,Lewis Vineyard makes up the majority of this w...,Syrah,Syrah
4,A touch of seeming-sweetness from generous oak...,Pinot Noir,Pinot Noir
5,"Grown on the producer's Home Ranch, this offer...",Riesling,Viognier
6,This bottling from the extremely coastal Jespe...,Sauvignon Blanc,Syrah
7,This medium-bodied wine is fruity and relative...,Cabernet Sauvignon,Pinot Noir
8,"Boysenberry, mocha, tar, roasted meat and mint...",Syrah,Syrah
9,This wine comes from a vineyard best known for...,Chardonnay,Chardonnay


## Try to predict the outcome on a new review

In [35]:
def predict_review(docs_new):
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, category))

In [36]:
docs_new = ['buttery, goes well with fish', 'bold blackberry cherry smooth figs raisins caramel']

In [37]:
predict_review(docs_new)

'buttery, goes well with fish' => Chardonnay
'bold blackberry cherry smooth figs raisins caramel' => Pinot Noir


In [38]:
predict_review(['refreshing','yellow'])

'refreshing' => Pinot Noir
'yellow' => Chardonnay


In [39]:
predict_review(['red meat','harsh'])

'red meat' => Pinot Noir
'harsh' => Cabernet Sauvignon


In [40]:
predict_review(['overly dry'])

'overly dry' => Pinot Noir


In [42]:
predict_review(['grapey dry'])

'grapey dry' => Red Blend


In [55]:
predict_review(['fermented'])

'fermented' => Pinot Noir


In [56]:
predict_review(['tarte sweet'])

'tarte sweet' => Pinot Noir


In [59]:
predict_review(['dark'])

'dark' => Pinot Noir


In [61]:
predict_review(['slightly bitter earthier kick slightly dry light'])

'slightly bitter earthier kick slightly dry light' => Pinot Noir


In [62]:
predict_review(['buttery light tarte'])

'buttery light tarte' => Chardonnay


In [63]:
predict_review(['cat piss crisp citrus'])

'cat piss crisp citrus' => Chardonnay


In [64]:
predict_review(['mineral'])

'mineral' => Chardonnay


In [65]:
predict_review(['sweet fruity smooth'])

'sweet fruity smooth' => Pinot Noir
