In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('winemag-data-130k-v2 2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
len(df)

129971

In [4]:
df.variety.describe()

count         129970
unique           707
top       Pinot Noir
freq           13272
Name: variety, dtype: object

In [5]:
nullvals = df.variety.isnull().values
len(nullvals)

129971

## Find out how many null values exist and get rid of the features with the most null vals

In [6]:

df.isna().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [7]:
# region_2 and twitter handle have the most null values so dropping the column
df_cleaned1 = df.drop(['Unnamed: 0','region_2','taster_twitter_handle'],axis=1)
df_cleaned1

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [8]:
df_cleaned2 = df_cleaned1.dropna()

In [9]:
len(df_cleaned2) 

54170

In [10]:
len(df) - len(df_cleaned2) #number of lost data points - lost over half our data points

75801

In [11]:
df_cleaned2

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam
10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
16,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,Michael Schachner,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque
17,Argentina,Raw black-cherry aromas are direct and simple ...,Winemaker Selection,87,13.0,Mendoza Province,Mendoza,Michael Schachner,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino
18,Spain,"Desiccated blackberry, leather, charred wood a...",Vendimia Seleccionada Finca Valdelayegua Singl...,87,28.0,Northern Spain,Ribera del Duero,Michael Schachner,Pradorey 2010 Vendimia Seleccionada Finca Vald...,Tempranillo Blend,Pradorey
20,US,Ripe aromas of dark berries mingle with ample ...,Vin de Maison,87,23.0,Virginia,Virginia,Alexander Peartree,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont


In [12]:
list(df_cleaned2.columns.values)

['country',
 'description',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'taster_name',
 'title',
 'variety',
 'winery']

In [13]:
#double check no null vals
df_cleaned2.isna().sum()

country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
taster_name    0
title          0
variety        0
winery         0
dtype: int64

In [14]:
rwo_df = pd.read_excel('Types of Wine - O, R, W.xlsx')
rwo_df.head()

Unnamed: 0,variety,color
0,Alvar Roxo,Orange
1,Ar110,Orange
2,Ar99,Orange
3,Barbarossa,Orange
4,Barbaroux,Orange


In [15]:
df_cleaned2.describe(include= 'all')

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
count,54170,54170,54170,54170.0,54170.0,54170,54170,54170,54170,54170,54170
unique,7,49537,23943,,,63,1004,17,48994,434,8583
top,US,"Seductively tart in lemon pith, cranberry and ...",Reserve,,,California,Columbia Valley (WA),Roger Voss,Segura Viudas NV Extra Dry Sparkling (Cava),Pinot Noir,Columbia Crest
freq,25888,3,1159,,,13998,2488,9237,8,6736,157
mean,,,,88.96655,39.209341,,,,,,
std,,,,2.955188,36.154076,,,,,,
min,,,,80.0,4.0,,,,,,
25%,,,,87.0,20.0,,,,,,
50%,,,,89.0,30.0,,,,,,
75%,,,,91.0,48.0,,,,,,


In [16]:
df_cleaned3 = df_cleaned2.merge(rwo_df, on='variety', how='left')
df_cleaned3

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,White
1,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Red
2,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,Red
3,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,Red
4,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,Orange
5,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Red
6,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,Michael Schachner,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque,Red
7,Argentina,Raw black-cherry aromas are direct and simple ...,Winemaker Selection,87,13.0,Mendoza Province,Mendoza,Michael Schachner,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino,Red
8,Spain,"Desiccated blackberry, leather, charred wood a...",Vendimia Seleccionada Finca Valdelayegua Singl...,87,28.0,Northern Spain,Ribera del Duero,Michael Schachner,Pradorey 2010 Vendimia Seleccionada Finca Vald...,Tempranillo Blend,Pradorey,Red
9,US,Ripe aromas of dark berries mingle with ample ...,Vin de Maison,87,23.0,Virginia,Virginia,Alexander Peartree,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont,Red


In [17]:
df_cleaned3.isna().sum()

country           0
description       0
designation       0
points            0
price             0
province          0
region_1          0
taster_name       0
title             0
variety           0
winery            0
color          7828
dtype: int64

In [18]:
df_cleaned4 = df_cleaned3.dropna()
df_cleaned4.color.describe()

count     46348
unique        3
top         Red
freq      30919
Name: color, dtype: object

In [19]:
df_cleaned4.groupby('color').count()


Unnamed: 0_level_0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Orange,655,655,655,655,655,655,655,655,655,655,655
Red,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919
White,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

In [20]:
df_cleaned4.describe(include= 'all')

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
count,46348,46348,46348,46348.0,46348.0,46348,46348,46348,46348,46348,46348,46348
unique,7,42427,20508,,,62,906,17,41926,264,7734,3
top,US,"Cigar box, café au lait, and dried tobacco aro...",Reserve,,,California,Columbia Valley (WA),Roger Voss,Segura Viudas NV Extra Dry Sparkling (Cava),Pinot Noir,Columbia Crest,Red
freq,23577,3,1113,,,12732,2263,7385,8,6736,150,30919
mean,,,,89.086131,41.197808,,,,,,,
std,,,,2.966685,37.211296,,,,,,,
min,,,,80.0,4.0,,,,,,,
25%,,,,87.0,20.0,,,,,,,
50%,,,,89.0,32.0,,,,,,,
75%,,,,91.0,50.0,,,,,,,


In [22]:
# Remove some columns:
df_cleaned5 = df_cleaned4.drop(['description', 'designation', 'region_1', 'title', 'variety','winery'], axis=1, inplace=False)
df_cleaned5.head()

Unnamed: 0,country,points,price,province,taster_name,color
0,US,87,13.0,Michigan,Alexander Peartree,White
1,US,87,65.0,Oregon,Paul Gregutt,Red
2,Spain,87,15.0,Northern Spain,Michael Schachner,Red
3,Italy,87,16.0,Sicily & Sardinia,Kerin O’Keefe,Red
4,France,87,27.0,Alsace,Roger Voss,Orange


In [30]:
one_hot_df = pd.get_dummies(df_cleaned5)
one_hot_df.head()

Unnamed: 0,points,price,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,country_US,province_Alsace,...,taster_name_Matt Kettmann,taster_name_Michael Schachner,taster_name_Paul Gregutt,taster_name_Roger Voss,taster_name_Sean P. Sullivan,taster_name_Susan Kostrzewa,taster_name_Virginie Boone,color_Orange,color_Red,color_White
0,87,13.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,87,65.0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,87,15.0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
3,87,16.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,87,27.0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0


In [31]:
labels = one_hot_df.color_Red
one_hot_df.drop('color_Red', axis=1, inplace=True)
one_hot_df.head()

Unnamed: 0,points,price,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,country_US,province_Alsace,...,taster_name_Lauren Buzzeo,taster_name_Matt Kettmann,taster_name_Michael Schachner,taster_name_Paul Gregutt,taster_name_Roger Voss,taster_name_Sean P. Sullivan,taster_name_Susan Kostrzewa,taster_name_Virginie Boone,color_Orange,color_White
0,87,13.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,87,65.0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,87,15.0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,87,16.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,87,27.0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0


In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#* Use the scaler's `.fit_transform()` method to create a scaled version of our dataset. 
scaled_data = scaler.fit_transform(one_hot_df)
scaled_df = pd.DataFrame(scaled_data, columns=one_hot_df.columns)
scaled_df.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,points,price,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,country_US,province_Alsace,...,taster_name_Lauren Buzzeo,taster_name_Matt Kettmann,taster_name_Michael Schachner,taster_name_Paul Gregutt,taster_name_Roger Voss,taster_name_Sean P. Sullivan,taster_name_Susan Kostrzewa,taster_name_Virginie Boone,color_Orange,color_White
0,-0.703193,-0.757784,-0.237666,-0.171083,-0.058486,-0.48058,-0.385883,-0.312105,0.982758,-0.173011,...,-0.059772,-0.30396,-0.410758,-0.385551,-0.435361,-0.264943,-0.056021,-0.398662,-0.119728,1.461894
1,-0.703193,0.639657,-0.237666,-0.171083,-0.058486,-0.48058,-0.385883,-0.312105,0.982758,-0.173011,...,-0.059772,-0.30396,-0.410758,2.593693,-0.435361,-0.264943,-0.056021,-0.398662,-0.119728,-0.684044
2,-0.703193,-0.704036,-0.237666,-0.171083,-0.058486,-0.48058,-0.385883,3.204048,-1.017544,-0.173011,...,-0.059772,-0.30396,2.434526,-0.385551,-0.435361,-0.264943,-0.056021,-0.398662,-0.119728,-0.684044
3,-0.703193,-0.677162,-0.237666,-0.171083,-0.058486,-0.48058,2.59146,-0.312105,-1.017544,-0.173011,...,-0.059772,-0.30396,-0.410758,-0.385551,-0.435361,-0.264943,-0.056021,-0.398662,-0.119728,-0.684044
4,-0.703193,-0.38155,-0.237666,-0.171083,-0.058486,2.080819,-0.385883,-0.312105,-1.017544,5.779993,...,-0.059772,-0.30396,-0.410758,-0.385551,2.296947,-0.264943,-0.056021,-0.398662,8.352263,-0.684044


In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(one_hot_df, labels, test_size=0.25)

In [34]:
from sklearn.neighbors import KNeighborsClassifier
clf1 = KNeighborsClassifier()
clf1.fit(X_train, y_train)
test_preds = clf1.predict(X_test)

In [35]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [36]:
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds)))
    print("Recall Score: {}".format(recall_score(labels, preds)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds)))
    
print_metrics(y_test, test_preds)

Precision Score: 0.9325191216382926
Recall Score: 0.9694754392715147
Accuracy Score: 0.9322516613446103
F1 Score: 0.9506382443564109


> The precision score tells us how often our model was correct when predicting that the wine is red. 

> The recall score tells us how many of the actual red wines our model made correct classifications for. 

> The accuracy score tells us the overall percentage of correct predictions made by the model, 

> and f1-score is the harmonic mean of precision and recall, which represents a "balanced" metric between the two. Overall, f1-score is the most informative about the performance of the model, followed by accuracy. For multicategorical models, accuracy is best.



In [37]:
# take in six parameters: X_train, y_train, X_test, and  y_test,min_k and max_k. 
#Set these to 1 and 25, by default
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):

# Create two variables, best_k and best_score
    best_k = 0
    best_score = 0.0
    
# For each iteration:Iterate through every odd number between min_k and max_k + 1.
    for k in range(min_k, max_k+1, 2):
        
# Create a new KNN classifier, and set the n_neighbors parameter to the current value for k, as determined by our loop
        knn = KNeighborsClassifier(n_neighbors=k)

    #     Fit this classifier to the training data.
        knn.fit(X_train, y_train)
    
# Generate predictions for X_test using the fitted classifier.
        preds = knn.predict(X_test)
    
# Calculate the F1-score for these predictions.  
        f1 = f1_score(y_test, preds)
    
 # Compare this F1-score to best_score. If better, update best_score and best_k. 
        if f1 > best_score:
            best_k = k
            best_score = f1

            # Once it has checked every value for k, print out the best value for k and the F1-score it achieved.        
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))


In [38]:
find_best_k(X_train, y_train, X_test, y_test)

Best Value for k: 1
F1-Score: 0.9646012074992056
