In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Cleaned Data with States.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Category,Points,Price,Review,Wine,Winery,Year,Variety,State
0,0,14.9,"Russian River Valley, Sonoma, California, US",Red,87,42.0,"Baked plum, licorice and lavender aromas and f...",V. Sattui 2015 Gilsson Vineyard Old Vine Zinfa...,V. Sattui,2015.0,Zinfandel,California
1,1,14.5,"Central Coast, Central Coast, California, US",Red,87,19.98,Pomegranate and light baking-spice aromas show...,Wente 2014 Coastal Selection Pinot Noir (Centr...,Wente,2014.0,Pinot Noir,California
2,2,13.8,California Republic,Red,86,15.0,Butter and vanilla notes dominate the jammy fr...,California Republic 2016 Cabernet Sauvignon (C...,California Republic,2016.0,Cabernet Sauvignon,California
3,3,13.8,"Lodi, Central Valley, California, US",Red,86,10.0,An aroma like toasted almonds and wood smoke m...,Collier Creek 2015 Red Wagon Pinot Noir (Lodi),Collier Creek,2015.0,Pinot Noir,California
4,4,13.0,"New Jersey, US",Red,86,27.0,Plum skin and pomegranate lead the nose while ...,DiLuca 2016 Rosso Black Label Red,DiLuca,2016.0,Red Blend,


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

In [3]:
len(df)

4768

In [4]:
df.Variety.describe()

count           4768
unique           117
top       Pinot Noir
freq            1058
Name: Variety, dtype: object

In [5]:

# df.isna().sum()

In [6]:
df.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Category', 'Points', 'Price',
       'Review', 'Wine', 'Winery', 'Year', 'Variety', 'State'],
      dtype='object')

In [7]:
# region_2 and twitter handle have the most null values so dropping the column
reviews_only = df.drop(['Unnamed: 0', 'Alcohol', 'Appellation', 'Points', 'Price', 'Wine', 'Winery', 'Year','Variety', 'State','Category'],axis=1)
reviews_only.head()

Unnamed: 0,Review
0,"Baked plum, licorice and lavender aromas and f..."
1,Pomegranate and light baking-spice aromas show...
2,Butter and vanilla notes dominate the jammy fr...
3,An aroma like toasted almonds and wood smoke m...
4,Plum skin and pomegranate lead the nose while ...


In [8]:
len(reviews_only)

4768

## Word Vectorization

In [9]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import nltk
np.random.seed(0)

In [10]:
reviews_array = reviews_only

In [11]:
review_array_2 = reviews_array['Review']

In [12]:
review_array_2[0:10]

0    Baked plum, licorice and lavender aromas and f...
1    Pomegranate and light baking-spice aromas show...
2    Butter and vanilla notes dominate the jammy fr...
3    An aroma like toasted almonds and wood smoke m...
4    Plum skin and pomegranate lead the nose while ...
5    This is a big ripe red wine moderate in struct...
6    This wine is a blend of 48% Cabernet Sauvignon...
7    Honey-dipped pineapple and guava, with a touch...
8    Thick and oaky, this estate wine evokes carame...
9    With an ashy undercurrent of char and smoke, t...
Name: Review, dtype: object

In [13]:
len(reviews_array)

4768

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [15]:
vectorizer = TfidfVectorizer()
reviews_vect = vectorizer.fit_transform(review_array_2)
# >>> print(vectorizer.get_feature_names())
# ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
print(reviews_vect.shape)

(4768, 6721)


In [16]:
# # experiment_line= df_cleaned4.description[2:3].to_string()

# # clean_line = []
# # # for word in experiment_line:

# pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# line_tokens_raw = nltk.regexp_tokenize(reviews_only.to_string(), pattern)
# print(line_tokens_raw)    
    

In [17]:
# categories = ['Red', 'White', 'Orange']

In [18]:
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.stem import *
# stemmer = SnowballStemmer("english")

In [19]:
# arta_tokens = [i.lower() for i in line_tokens_raw]

# # stop words
# from nltk.corpus import stopwords
# stopwords.words("english")

# stop_words = set(stopwords.words('english'))
# arta_tokens_stopped = [w for w in arta_tokens if not w in stop_words]

# # stem words
# stemmer = SnowballStemmer("english")
# arta_stemmed = [stemmer.stem(word) for word in arta_tokens_stopped]
# arta_tokens_stopped[0:20]

In [20]:
# len(arta_tokens_stopped)

In [21]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# response = tfidf.fit_transform(arta_tokens_stopped)

# import pandas as pd
# reviews_vectorized = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())
# reviews_vectorized

In [22]:
type(reviews_vect)

scipy.sparse.csr.csr_matrix

In [23]:

reviews_df = pd.DataFrame(reviews_vect.toarray())
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6711,6712,6713,6714,6715,6716,6717,6718,6719,6720
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# import scipy.sparse as sp
# data = data.apply(lambda col: col.str.strip())

# vect = CountVectorizer(ngram_range=(1, 3))
# train = sp.hstack(data.apply(lambda col: vect.fit_transform(col)))

In [25]:
df.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Category', 'Points', 'Price',
       'Review', 'Wine', 'Winery', 'Year', 'Variety', 'State'],
      dtype='object')

In [26]:
one_hot_df = pd.get_dummies(df, columns=['Year','Variety','State','Appellation', 'Winery',])
one_hot_df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Category,Points,Price,Review,Wine,Year_2004.0,Year_2005.0,Year_2007.0,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0,14.9,Red,87,42.0,"Baked plum, licorice and lavender aromas and f...",V. Sattui 2015 Gilsson Vineyard Old Vine Zinfa...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,14.5,Red,87,19.98,Pomegranate and light baking-spice aromas show...,Wente 2014 Coastal Selection Pinot Noir (Centr...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13.8,Red,86,15.0,Butter and vanilla notes dominate the jammy fr...,California Republic 2016 Cabernet Sauvignon (C...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,13.8,Red,86,10.0,An aroma like toasted almonds and wood smoke m...,Collier Creek 2015 Red Wagon Pinot Noir (Lodi),0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,13.0,Red,86,27.0,Plum skin and pomegranate lead the nose while ...,DiLuca 2016 Rosso Black Label Red,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
labels = one_hot_df.Category
one_hot_df.drop(['Category','Review','Wine'], axis=1, inplace=True)
one_hot_df.head()

Unnamed: 0.1,Unnamed: 0,Alcohol,Points,Price,Year_2004.0,Year_2005.0,Year_2007.0,Year_2008.0,Year_2009.0,Year_2010.0,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0,14.9,87,42.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,14.5,87,19.98,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13.8,86,15.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,13.8,86,10.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,13.0,86,27.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
frames = [reviews_df, one_hot_df]

result = pd.concat(frames, axis=1, sort=False)

In [29]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Winery_ZD,Winery_Zaca Mesa,Winery_Zanoli,Winery_Zeitgeist,Winery_Ziata,Winery_Zinfandelic,Winery_Zotovich,Winery_van Loben Sels,Winery_Écluse,Winery_Órale
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
result.shape

(4768, 9324)

In [31]:
len(labels)

4768

In [32]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# #* Use the scaler's `.fit_transform()` method to create a scaled version of our dataset. 
# scaled_data = scaler.fit_transform(one_hot_df)
# scaled_df = pd.DataFrame(scaled_data, columns=one_hot_df.columns)
# scaled_df.head()

In [33]:
# X = data([:,1:3])
# Y = data.Category

In [34]:
result = result.as_matrix()

  """Entry point for launching an IPython kernel.


In [35]:
# labels= labels.as_matrix()

In [36]:
X = result
Y = labels
Y

0           Red
1           Red
2           Red
3           Red
4           Red
5           Red
6           Red
7         White
8         White
9           Red
10          Red
11        White
12          Red
13          Red
14          Red
15        White
16          Red
17          Red
18          Red
19          Red
20          Red
21        White
22          Red
23          Red
24          Red
25          Red
26        White
27          Red
28         Rose
29          Red
         ...   
4738      White
4739        Red
4740        Red
4741        Red
4742      White
4743      White
4744        Red
4745        Red
4746    Dessert
4747        Red
4748        Red
4749        Red
4750        Red
4751        Red
4752        Red
4753        Red
4754        Red
4755        Red
4756        Red
4757      White
4758        Red
4759      White
4760      White
4761        Red
4762      White
4763        Red
4764      White
4765        Red
4766        Red
4767        Red
Name: Category, Length: 

In [37]:
X.shape, Y.shape

((4768, 9324), (4768,))

In [38]:
type(Y)

pandas.core.series.Series

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 


In [40]:
type(X)

numpy.ndarray

In [41]:

# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X)
# X_train_counts.shape


In [42]:
# count_vect.vocabulary_.get(u'algorithm')

In [43]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
# X_train_tf.shape

In [44]:
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [45]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, Y)

In [46]:
target_names = list(set(Y))
target_names

['White', 'Rose', 'Sparkling', 'Red', 'Port/Sherry', 'Dessert', 'Fortified']

## Build a Pipeline

In [47]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),])

In [48]:


text_clf.fit(X_train, Y_train)
tfidf_trans = TfidfTransformer()

X_train_tfidf = tfidf_trans.fit_transform(X_train)

In [49]:
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.6572327044025157

## Use a SVM for better Accuracy

In [59]:
from sklearn.linear_model import SGDClassifier
text_clf2 = Pipeline([
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),])


In [60]:
text_clf2.fit(X_train, Y_train) 




Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', SGDClassifier(alpha=0.001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', ...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [62]:
predicted2 = text_clf2.predict(X_test)
np.mean(predicted2 == Y_test)

0.6572327044025157

In [53]:
d = {'Y_test': Y_test,'Predicted': predicted}
results = pd.DataFrame(data=d)
results

Unnamed: 0,Y_test,Predicted
964,Red,Red
2223,White,Red
2598,Red,Red
3965,Red,Red
598,Red,Red
2634,White,Red
1056,White,Red
4257,Red,Red
642,Red,Red
2281,White,Red


In [54]:
d

{'Y_test': 964       Red
 2223    White
 2598      Red
 3965      Red
 598       Red
 2634    White
 1056    White
 4257      Red
 642       Red
 2281    White
 4193    White
 4342      Red
 1657      Red
 1417    White
 2750      Red
 2677      Red
 2059      Red
 802       Red
 949       Red
 3414      Red
 2402    White
 1846      Red
 742       Red
 33      White
 2460    White
 4252    White
 3933    White
 3524    White
 2374    White
 227       Red
         ...  
 4544      Red
 1740    White
 4261     Rose
 650       Red
 4127     Rose
 3273    White
 1052    White
 1281      Red
 1665      Red
 2251    White
 4737      Red
 4299      Red
 192       Red
 889     White
 2413      Red
 2628     Rose
 1586      Red
 3381      Red
 314       Red
 2526    White
 2771      Red
 627       Red
 2342      Red
 3944      Red
 1041    White
 2576      Red
 3690      Red
 113     White
 2692      Red
 1371      Red
 Name: Category, Length: 954, dtype: object,
 'Predicted': array(['Red', 'R

## Try to predict the outcome on a new review

In [55]:

count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X)
# X_train_counts.shape


In [56]:
def predict_review(docs_new):
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, category))

In [57]:
docs_new = ['buttery, goes well with fish', 'bold blackberry cherry smooth figs raisins caramel']

In [58]:
predict_review(docs_new)

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [None]:
predict_review(['refreshing','yellow'])

In [None]:
predict_review(['red meat','harsh'])