In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Cleaned Data with States.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Category,Points,Price,Review,Wine,Winery,Year,Variety,State
0,0,14.9,"Russian River Valley, Sonoma, California, US",Red,87,42.0,"Baked plum, licorice and lavender aromas and f...",V. Sattui 2015 Gilsson Vineyard Old Vine Zinfa...,V. Sattui,2015.0,Zinfandel,California
1,1,14.5,"Central Coast, Central Coast, California, US",Red,87,19.98,Pomegranate and light baking-spice aromas show...,Wente 2014 Coastal Selection Pinot Noir (Centr...,Wente,2014.0,Pinot Noir,California
2,2,13.8,California Republic,Red,86,15.0,Butter and vanilla notes dominate the jammy fr...,California Republic 2016 Cabernet Sauvignon (C...,California Republic,2016.0,Cabernet Sauvignon,California
3,3,13.8,"Lodi, Central Valley, California, US",Red,86,10.0,An aroma like toasted almonds and wood smoke m...,Collier Creek 2015 Red Wagon Pinot Noir (Lodi),Collier Creek,2015.0,Pinot Noir,California
4,4,13.0,"New Jersey, US",Red,86,27.0,Plum skin and pomegranate lead the nose while ...,DiLuca 2016 Rosso Black Label Red,DiLuca,2016.0,Red Blend,


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

In [3]:
len(df)

4768

In [4]:
df.Variety.describe()

count           4768
unique           117
top       Pinot Noir
freq            1058
Name: Variety, dtype: object

In [5]:

# df.isna().sum()

In [6]:
df.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Category', 'Points', 'Price',
       'Review', 'Wine', 'Winery', 'Year', 'Variety', 'State'],
      dtype='object')

In [7]:
# region_2 and twitter handle have the most null values so dropping the column
reviews_only = df.drop(['Unnamed: 0', 'Alcohol', 'Appellation', 'Points', 'Price', 'Wine', 'Winery', 'Year','Variety', 'State','Category'],axis=1)
reviews_only.head()

Unnamed: 0,Review
0,"Baked plum, licorice and lavender aromas and f..."
1,Pomegranate and light baking-spice aromas show...
2,Butter and vanilla notes dominate the jammy fr...
3,An aroma like toasted almonds and wood smoke m...
4,Plum skin and pomegranate lead the nose while ...


In [8]:
len(reviews_only)

4768

## Word Vectorization

In [9]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import nltk
np.random.seed(0)

In [10]:
reviews_array = reviews_only

In [11]:
review_array_2 = reviews_array['Review']

In [12]:
review_array_2[0:10]

0    Baked plum, licorice and lavender aromas and f...
1    Pomegranate and light baking-spice aromas show...
2    Butter and vanilla notes dominate the jammy fr...
3    An aroma like toasted almonds and wood smoke m...
4    Plum skin and pomegranate lead the nose while ...
5    This is a big ripe red wine moderate in struct...
6    This wine is a blend of 48% Cabernet Sauvignon...
7    Honey-dipped pineapple and guava, with a touch...
8    Thick and oaky, this estate wine evokes carame...
9    With an ashy undercurrent of char and smoke, t...
Name: Review, dtype: object

In [13]:
len(reviews_array)

4768

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
vectorizer = TfidfVectorizer()
reviews_vect = vectorizer.fit_transform(review_array_2)
# >>> print(vectorizer.get_feature_names())
# ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
print(reviews_vect.shape)

(4768, 6721)


In [15]:
# # experiment_line= df_cleaned4.description[2:3].to_string()

# # clean_line = []
# # # for word in experiment_line:

# pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# line_tokens_raw = nltk.regexp_tokenize(reviews_only.to_string(), pattern)
# print(line_tokens_raw)    
    

In [16]:
# categories = ['Red', 'White', 'Orange']

In [17]:
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.stem import *
# stemmer = SnowballStemmer("english")

In [18]:
# arta_tokens = [i.lower() for i in line_tokens_raw]

# # stop words
# from nltk.corpus import stopwords
# stopwords.words("english")

# stop_words = set(stopwords.words('english'))
# arta_tokens_stopped = [w for w in arta_tokens if not w in stop_words]

# # stem words
# stemmer = SnowballStemmer("english")
# arta_stemmed = [stemmer.stem(word) for word in arta_tokens_stopped]
# arta_tokens_stopped[0:20]

In [19]:
# len(arta_tokens_stopped)

In [20]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# response = tfidf.fit_transform(arta_tokens_stopped)

# import pandas as pd
# reviews_vectorized = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())
# reviews_vectorized

In [21]:
type(reviews_vect)

scipy.sparse.csr.csr_matrix

In [22]:

reviews_df = pd.DataFrame(reviews_vect.toarray())
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6711,6712,6713,6714,6715,6716,6717,6718,6719,6720
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# import scipy.sparse as sp
# data = data.apply(lambda col: col.str.strip())

# vect = CountVectorizer(ngram_range=(1, 3))
# train = sp.hstack(data.apply(lambda col: vect.fit_transform(col)))

In [24]:
df.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Category', 'Points', 'Price',
       'Review', 'Wine', 'Winery', 'Year', 'Variety', 'State'],
      dtype='object')

In [25]:
one_hot_df = pd.get_dummies(df, columns=['Year', 'Category','State','Appellation', 'Winery',])
one_hot_df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Points,Price,Review,Wine,Variety,Year_2004.0,Year_2005.0,Year_2007.0,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0,14.9,87,42.0,"Baked plum, licorice and lavender aromas and f...",V. Sattui 2015 Gilsson Vineyard Old Vine Zinfa...,Zinfandel,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,14.5,87,19.98,Pomegranate and light baking-spice aromas show...,Wente 2014 Coastal Selection Pinot Noir (Centr...,Pinot Noir,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13.8,86,15.0,Butter and vanilla notes dominate the jammy fr...,California Republic 2016 Cabernet Sauvignon (C...,Cabernet Sauvignon,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,13.8,86,10.0,An aroma like toasted almonds and wood smoke m...,Collier Creek 2015 Red Wagon Pinot Noir (Lodi),Pinot Noir,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,13.0,86,27.0,Plum skin and pomegranate lead the nose while ...,DiLuca 2016 Rosso Black Label Red,Red Blend,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
labels = one_hot_df.Variety
one_hot_df.drop(['Variety','Review','Wine'], axis=1, inplace=True)
one_hot_df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Points,Price,Year_2004.0,Year_2005.0,Year_2007.0,Year_2008.0,Year_2009.0,Year_2010.0,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0,14.9,87,42.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,14.5,87,19.98,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13.8,86,15.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,13.8,86,10.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,13.0,86,27.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
frames = [reviews_df, one_hot_df]

result = pd.concat(frames, axis=1, sort=False)

In [28]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
result.shape

(4768, 9214)

In [30]:
len(labels)

4768

In [31]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# #* Use the scaler's `.fit_transform()` method to create a scaled version of our dataset. 
# scaled_data = scaler.fit_transform(one_hot_df)
# scaled_df = pd.DataFrame(scaled_data, columns=one_hot_df.columns)
# scaled_df.head()

In [32]:
# X = data([:,1:3])
# Y = data.Category

In [33]:
result = result.as_matrix()

  """Entry point for launching an IPython kernel.


In [34]:
# labels= labels.as_matrix()

In [35]:
X = result
Y = labels
Y

0                           Zinfandel
1                          Pinot Noir
2                  Cabernet Sauvignon
3                          Pinot Noir
4                           Red Blend
5                          Pinot Noir
6                           Red Blend
7                     Sauvignon Blanc
8                          Chardonnay
9                          Sangiovese
10                     Cabernet Franc
11                         Chardonnay
12                 Cabernet Sauvignon
13                             Merlot
14                          Zinfandel
15                         Chardonnay
16                 Cabernet Sauvignon
17                 Cabernet Sauvignon
18                          Zinfandel
19                         Pinot Noir
20                            Barbera
21      Spirit Canyon Vineyard Arneis
22                 Cabernet Sauvignon
23                     Cabernet Franc
24                         Pinot Noir
25                 Cabernet Sauvignon
26          

In [36]:
X.shape, Y.shape

((4768, 9214), (4768,))

In [37]:
type(Y)

pandas.core.series.Series

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 


In [39]:
type(X)

numpy.ndarray

In [40]:

# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X)
# X_train_counts.shape


In [41]:
# count_vect.vocabulary_.get(u'algorithm')

In [42]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
# X_train_tf.shape

In [43]:
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [44]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, Y)

In [45]:
target_names = list(set(Y))
target_names

['McKinley Springs Vineyard Cinsault',
 'Gewürztraminer',
 'Grenache',
 'Montepulciano',
 'Tempranillo',
 'Graziano Vineyard Arneis',
 'Red Blend',
 'Chardonnay',
 'Seyval Blanc',
 'Verdejo',
 'Port',
 'Rosé',
 'Aria White Port',
 'Carignane',
 'Lemberger',
 'Petit Manseng',
 'Petite',
 'Gewurztraminer',
 'White Blend',
 'Claudia',
 'Roussanne',
 'Ugni Blanc',
 'Monastrell',
 'Rhône-style Red Blend',
 'Swing',
 'Royal Escort Paladini Vineyard Port',
 'Cabernet Franc',
 'Cinsault',
 'Elephant Mountain Vineyards',
 'Sin So Cinsault',
 'Tannat',
 'Trousseau',
 'Müller-Thurgau',
 'Fiano',
 'Pinot Grigio',
 'Pinot Gris',
 'Chenin',
 'Traminette',
 'Chenin Blanc',
 'Spirit Canyon Vineyard Arneis',
 'Sangiovese',
 'Petit Verdot',
 'Malbec',
 'Vermentino',
 "Nero d'Avola",
 'Pinotage',
 'Sauvignon',
 'Cabernet',
 'Gamay Noir',
 'Mataro',
 'Viognier',
 'Red',
 'Valdiguié',
 'Melon de Bourgogne',
 'Limited Release Arneis',
 'Violeta Portray Port',
 'Petite Sirah',
 'Albariño',
 'Pinot Noir',
 'D

## Build a Pipeline

In [46]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),])

In [47]:


text_clf.fit(X_train, Y_train)
tfidf_trans = TfidfTransformer()

X_train_tfidf = tfidf_trans.fit_transform(X_train)

In [48]:
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.23060796645702306

## Use a SVM for better Accuracy

In [49]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),])


In [50]:
text_clf.fit(X_train, Y_train) 
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)



0.23060796645702306

In [51]:
d = {'Y_test': Y_test,'Predicted': predicted}
results = pd.DataFrame(data=d)
results

Unnamed: 0,Y_test,Predicted
964,Syrah,Pinot Noir
2223,White Blend,Pinot Noir
2598,Grenache,Pinot Noir
3965,Syrah,Pinot Noir
598,Pinot Noir,Pinot Noir
2634,Riesling,Pinot Noir
1056,Sauvignon Blanc,Pinot Noir
4257,Cabernet Sauvignon,Pinot Noir
642,Syrah,Pinot Noir
2281,Chardonnay,Pinot Noir


In [52]:
d

{'Y_test': 964                     Syrah
 2223              White Blend
 2598                 Grenache
 3965                    Syrah
 598                Pinot Noir
 2634                 Riesling
 1056          Sauvignon Blanc
 4257       Cabernet Sauvignon
 642                     Syrah
 2281               Chardonnay
 4193                 Riesling
 4342               Pinot Noir
 1657                   Merlot
 1417               Chardonnay
 2750    Rhône-style Red Blend
 2677               Pinot Noir
 2059               Pinot Noir
 802                Pinot Noir
 949                Pinot Noir
 3414                Zinfandel
 2402          Sauvignon Blanc
 1846       Cabernet Sauvignon
 742        Cabernet Sauvignon
 33            Sauvignon Blanc
 2460               Chardonnay
 4252             Pinot Grigio
 3933               Chardonnay
 3524               Chardonnay
 2374          Sauvignon Blanc
 227                     Syrah
                 ...          
 4544           Cabernet Fran

## Try to predict the outcome on a new review

In [53]:

count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X)
# X_train_counts.shape


In [54]:
def predict_review(docs_new):
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, category))

In [55]:
docs_new = ['buttery, goes well with fish', 'bold blackberry cherry smooth figs raisins caramel']

In [56]:
predict_review(docs_new)

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [None]:
predict_review(['refreshing','yellow'])

In [None]:
predict_review(['red meat','harsh'])