# Wine Classification

In [54]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
plt.style.use('fivethirtyeight')
print(tf.__version__)

2.2.0


In [None]:
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

In [56]:
# Change to top directory of your Drive
import os
os.chdir('/content/gdrive/Shared drives/AI4ALL SFU NLP GROUP 3/WINE')

In [57]:
# Load data
df = pd.read_csv('winemag-data-130k-v2.csv')

In [58]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,,Kerin O’Keefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,,Anna Lee C. Iijima,,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [59]:
# Get only the columns we want
wine_df = df[['description', 'variety']].copy()

In [60]:
wine_df

Unnamed: 0,description,variety
0,"Aromas include tropical fruit, broom, brimston...",White Blend
1,"This is ripe and fruity, a wine that is smooth...",Portuguese Red
2,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris
3,"Pineapple rind, lemon pith and orange blossom ...",Riesling
4,"Much like the regular bottling from 2012, this...",Pinot Noir
...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,Riesling
129967,Citation is given as much as a decade of bottl...,Pinot Noir
129968,Well-drained gravel soil gives this wine its c...,Gewürztraminer
129969,"A dry style of Pinot Gris, this is crisp with ...",Pinot Gris


In [61]:
wine_df['variety'].value_counts()[:5]

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Name: variety, dtype: int64

In [62]:
topVarieties = wine_df['variety'].value_counts()[:5].index.tolist()

In [63]:
wine_df_short = wine_df[wine_df.variety.isin(topVarieties)].copy()

In [64]:
wine_df_short

Unnamed: 0,description,variety
4,"Much like the regular bottling from 2012, this...",Pinot Noir
10,"Soft, supple plum envelopes an oaky structure ...",Cabernet Sauvignon
12,"Slightly reduced, this wine offers a chalky, t...",Cabernet Sauvignon
14,Building on 150 years and six generations of w...,Chardonnay
20,Ripe aromas of dark berries mingle with ample ...,Red Blend
...,...,...
129956,"The blend is 44% Merlot, 33% Cabernet Sauvigno...",Bordeaux-style Red Blend
129958,This blend of Cabernet Sauvignon-Merlot and Ca...,Bordeaux-style Red Blend
129960,"Fresh and fruity, this is full of red cherry f...",Pinot Noir
129963,"A bouquet of black cherry, tart cranberry and ...",Cabernet Sauvignon


In [65]:
wine_df_short['variety_num'] = wine_df_short['variety'].astype('category').cat.codes

In [66]:
wine_df_short

Unnamed: 0,description,variety,variety_num
4,"Much like the regular bottling from 2012, this...",Pinot Noir,3
10,"Soft, supple plum envelopes an oaky structure ...",Cabernet Sauvignon,1
12,"Slightly reduced, this wine offers a chalky, t...",Cabernet Sauvignon,1
14,Building on 150 years and six generations of w...,Chardonnay,2
20,Ripe aromas of dark berries mingle with ample ...,Red Blend,4
...,...,...,...
129956,"The blend is 44% Merlot, 33% Cabernet Sauvigno...",Bordeaux-style Red Blend,0
129958,This blend of Cabernet Sauvignon-Merlot and Ca...,Bordeaux-style Red Blend,0
129960,"Fresh and fruity, this is full of red cherry f...",Pinot Noir,3
129963,"A bouquet of black cherry, tart cranberry and ...",Cabernet Sauvignon,1


In [67]:
# Split our data into training and test sets (80/20)
train_df, test_df = train_test_split(wine_df_short, test_size=0.2)

In [68]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stops = stopwords.words('english')
for i in topVarieties:
  stops.extend(i.lower().split(' '))
print(stops[-15:])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
["weren't", 'won', "won't", 'wouldn', "wouldn't", 'pinot', 'noir', 'chardonnay', 'cabernet', 'sauvignon', 'red', 'blend', 'bordeaux-style', 'red', 'blend']


In [69]:
def cleanText(text):
    # Remove new lines from the text
    text = text.replace("\n", " ")
    text = text.lower()
    text = text.split(' ')
    text = [w for w in text if not w in stops] 
    text = ' '.join(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([i for i in text if not i.isdigit()])
    return text

In [None]:
train_df['description'] = train_df['description'].apply(lambda x : cleanText(x))
test_df['description'] = test_df['description'].apply(lambda x : cleanText(x))

In [71]:
train_df

Unnamed: 0,description,variety,variety_num
18076,jumps immediately bright spicy scent forward f...,Pinot Noir,3
97344,released october would benefited additional t...,Pinot Noir,3
94841,rounded mouthfilling wine beginning seems soft...,Pinot Noir,3
21224,darcie kent really nails cab john maddens ranc...,Cabernet Sauvignon,1
72916,raspberry dark slate gravel bit shiitake mushr...,Pinot Noir,3
...,...,...,...
38648,lush body ripe concentrated plum currant flavo...,Bordeaux-style Red Blend,0
21537,miss presence oak chardonnay brilliantly forwa...,Chardonnay,2
107953,wine little candied stewed cherry brown sugar ...,Pinot Noir,3
115981,raspberry ripeness soft tannins lend warmth st...,Pinot Noir,3


In [72]:
X_train = train_df['description']
y_train = train_df['variety_num']
X_test = test_df['description']
y_test = test_df['variety_num']

In [73]:
vocab_size = 20000
max_seq_length = 400

In [74]:
# TfidfVectorizer converts text to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()

In [None]:
''' 
TO DO: Define our X and y from the training set
(Re)load the (cleaned) training and test datasets as X_train, y_train, X_test, y_test
'''
# TO DO: Load our pre-processed data into a dataframe
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [76]:
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

In [77]:
# Import classifiers from Scikit-learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [78]:
'''
TO DO: Define a list of models that we will be comparing. Choose from the ones
that were imported from above.
'''
NB = MultinomialNB()
SVC = LinearSVC()
RFC = RandomForestClassifier(max_depth=10)
models = [NB, SVC, RFC]

In [79]:
# Create a dictionary for storing the results of each run for each model
results = {}

# For each model, create a dictionary within results that stores the accuracy,
# f1 score and confusion matrix values
for model in models:
    results[model.__class__.__name__] = {'accuracy': [], 
                                       'f1_score': [], 
                                       'confusion_matrix': []}

In [80]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [81]:
for model in models:
    '''
    TO DO: For each model in our models list, we want to:

    1. Fit the model on X_train_vect and y_train

    2. Generate y_pred by calling model.predict
    '''
    model.fit(X_train_vect, y_train)
    y_pred = model.predict(X_test_vect)
    '''
    TO DO: Compute the following so we can add it to our results:
    acc - accuracy score
    f1  - f1 score
    cm - confusion matrix
    '''
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    print("done")
    results[model.__class__.__name__]['accuracy'] = acc
    results[model.__class__.__name__]['f1_score'] = f1
    results[model.__class__.__name__]['confusion_matrix'] = cm

done
done
done


In [82]:
'''
TO DO: Display the results by printing out the accuracy, F1 score and confusion
matrix for each of the models.
'''
for model in results:
  print(results[model])

{'accuracy': 0.7973590150913423, 'f1_score': 0.7953736384895485, 'confusion_matrix': array([[ 831,  220,    6,  117,  148],
       [  12, 1263,    7,  440,  160],
       [  15,    4, 2286,   69,    1],
       [ 181,   82,   42, 2366,   28],
       [  44,  240,   13,  212, 1285]])}
{'accuracy': 0.8565329626687848, 'f1_score': 0.8566095762345158, 'confusion_matrix': array([[1078,  110,    6,   76,   52],
       [  70, 1494,    6,  183,  129],
       [   7,    8, 2304,   51,    5],
       [ 116,  150,   33, 2356,   44],
       [  84,  194,   15,  106, 1395]])}
{'accuracy': 0.6167593328038126, 'f1_score': 0.5764948901841165, 'confusion_matrix': array([[ 187,   44,   19,  942,  130],
       [   4,  363,   17, 1429,   69],
       [   0,    0, 2225,  150,    0],
       [  16,    1,   63, 2619,    0],
       [   4,   44,   25,  903,  818]])}
