In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('winemag-data-130k-v2 2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
len(df)

129971

In [4]:
df.variety.describe()

count         129970
unique           707
top       Pinot Noir
freq           13272
Name: variety, dtype: object

## Find out how many null values exist and get rid of the features with the most null vals

In [5]:

df.isna().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [6]:
# region_2 and twitter handle have the most null values so dropping the column
df_cleaned1 = df.drop(['Unnamed: 0','region_2','taster_twitter_handle'],axis=1)
df_cleaned1

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [7]:
list(df_cleaned1.columns.values)

['country',
 'description',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'taster_name',
 'title',
 'variety',
 'winery']

In [8]:
rwo_df = pd.read_excel('Types of Wine - O, R, W.xlsx')
rwo_df.head()

Unnamed: 0,variety,color
0,Alvar Roxo,Orange
1,Ar110,Orange
2,Ar99,Orange
3,Barbarossa,Orange
4,Barbaroux,Orange


In [9]:
df_cleaned2 = df_cleaned1.merge(rwo_df, on='variety', how='left')
df_cleaned2.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,White
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,Red
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Orange
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,White
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Red


In [10]:
df_cleaned2.isna().sum()

country           63
description        0
designation    37474
points             0
price           8998
province          63
region_1       21253
taster_name    26245
title              0
variety            1
winery             0
color          19918
dtype: int64

In [11]:
df_cleaned2.describe(include= 'all')

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
count,129929,129992,92518,129992.0,120994.0,129929,108739,103747,129992,129991,129992,110074
unique,43,119955,37979,,,425,1229,19,118840,707,16757,3
top,US,"Cigar box, café au lait, and dried tobacco aro...",Reserve,,,California,Napa Valley,Roger Voss,Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma...,Pinot Noir,Wines & Winemakers,Red
freq,54512,3,2011,,,36249,4480,25516,11,13272,222,70747
mean,,,,88.446851,35.361357,,,,,,,
std,,,,3.039744,41.019738,,,,,,,
min,,,,80.0,4.0,,,,,,,
25%,,,,86.0,17.0,,,,,,,
50%,,,,88.0,25.0,,,,,,,
75%,,,,91.0,42.0,,,,,,,


In [12]:
df_cleaned3= df_cleaned2.drop(['designation','province' ,'region_1', 'title','variety', 'winery'],axis=1)
df_cleaned3.head()

Unnamed: 0,country,description,points,price,taster_name,color
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,,Kerin O’Keefe,White
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Roger Voss,Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Paul Gregutt,Orange
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Alexander Peartree,White
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Paul Gregutt,Red


In [13]:
df_cleaned3.isna().sum()

country           63
description        0
points             0
price           8998
taster_name    26245
color          19918
dtype: int64

In [14]:
df_cleaned4 = df_cleaned3.dropna()
len(df_cleaned4)

81558

In [15]:
df_cleaned4.describe(include='all')


Unnamed: 0,country,description,points,price,taster_name,color
count,81558,81558,81558.0,81558.0,81558,81558
unique,40,74704,,,19,3
top,US,"Stalky aromas suggest hay and green herbs, wit...",,,Roger Voss,Red
freq,34274,3,,,16095,51936
mean,,,88.661443,36.546188,,
std,,,2.966901,44.771244,,
min,,,80.0,4.0,,
25%,,,87.0,17.0,,
50%,,,89.0,26.0,,
75%,,,91.0,45.0,,


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

## Word Vectorization

In [16]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import nltk
np.random.seed(0)

In [17]:
df_cleaned4.description

1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
5         Blackberry and raspberry aromas show a typical...
6         Here's a bright, informal red that opens with ...
9         This has great depth of flavor with its fresh ...
10        Soft, supple plum envelopes an oaky structure ...
12        Slightly reduced, this wine offers a chalky, t...
14        Building on 150 years and six generations of w...
15        Zesty orange peels and apple notes abound in t...
16        Baked plum, molasses, balsamic vinegar and che...
17        Raw black-cherry aromas are direct and simple ...
18        Desiccated blackberry, leather, charred wood a...
20        Ripe aromas of dark berries mingle with ample ...
21        A sleek mix of tart berry, stem and herb, alon...
22        Delicate aromas recall white f

In [18]:
experiment_line= str(df_cleaned4.description[2:3].values)
experiment_line

"['Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.']"

In [19]:
# experiment_line= df_cleaned4.description[2:3].to_string()

# clean_line = []
# # for word in experiment_line:

pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
line_tokens_raw = nltk.regexp_tokenize(experiment_line, pattern)
print(line_tokens_raw)    
    

['Pineapple', 'rind', 'lemon', 'pith', 'and', 'orange', 'blossom', 'start', 'off', 'the', 'aromas', 'The', 'palate', 'is', 'a', 'bit', 'more', 'opulent', 'with', 'notes', 'of', 'honey', 'drizzled', 'guava', 'and', 'mango', 'giving', 'way', 'to', 'a', 'slightly', 'astringent', 'semidry', 'finish']


In [20]:
descriptions = str(df_cleaned4.description[0:20].values)

In [21]:
categories = ['Red', 'White', 'Orange']

In [22]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import *
stemmer = SnowballStemmer("english")

In [23]:
data = df_cleaned4.drop(['country','points','price','taster_name'], axis = 1)
data

Unnamed: 0,description,color
1,"This is ripe and fruity, a wine that is smooth...",Red
2,"Tart and snappy, the flavors of lime flesh and...",Orange
3,"Pineapple rind, lemon pith and orange blossom ...",White
4,"Much like the regular bottling from 2012, this...",Red
5,Blackberry and raspberry aromas show a typical...,Red
6,"Here's a bright, informal red that opens with ...",Red
9,This has great depth of flavor with its fresh ...,Orange
10,"Soft, supple plum envelopes an oaky structure ...",Red
12,"Slightly reduced, this wine offers a chalky, t...",Red
14,Building on 150 years and six generations of w...,White


In [24]:
numpy_array = data.as_matrix()
numpy_array

  """Entry point for launching an IPython kernel.


array([["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
        'Red'],
       ['Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.',
        'Orange'],
       ['Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.',
        'White'],
       ...,
       ["Notes of honeysuckle and cantaloupe sweeten this deliciously feather-light spätlese. It's intensely juicy, quenching the palate with streams of tart tangerine and grapefruit acidity, yet wraps up with a kiss of honey and peach.",
        'White'],
       ['Citation is given as 

In [25]:
X = numpy_array[:,0]
Y = numpy_array[:,1]
Y

array(['Red', 'Orange', 'White', ..., 'White', 'Red', 'Orange'],
      dtype=object)

In [29]:
type(Y)

numpy.ndarray

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
 ('tfidf', TfidfTransformer()),
 ('clf', MultinomialNB()),
])

text_clf = text_clf.fit(X_train,Y_train)

predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.9560446297204512

In [27]:
#Creates a datafram for word vectors

# pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# stop_words = set(stopwords.words('english'))

# #figure out how to concatenate eveything back together
# for line in descriptions:
#     article_tokens_raw = nltk.regexp_tokenize(descriptions, pattern)
#     article_tokens = [i.lower() for i in article_tokens_raw]
#     article_tokens_stopped = [w for w in article_tokens if not w in stop_words]
# #     art_stemmed = [stemmer.stem(word) for word in article_tokens_stopped]
#     docs = article_tokens_stopped
#     vec = CountVectorizer()
#     X = vec.fit_transform(docs)
#     df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

In [28]:
# import seaborn as sns
# fig = plt.figure(figsize = (20,20))
# sns.scatterplot(x = X_test, y = Y_test)

In [None]:
https://stackoverflow.com/questions/24386489/adding-words-to-scikit-learns-countvectorizers-stop-list