# Wine Classification

In [None]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style to the plots drawn in this notebook.
plt.style.use('fivethirtyeight')
# Show which TensorFlow version the runtime provides.
print(tf.__version__)

In [None]:
# Mount your Google Drive at /content/gdrive/ (requires the Colab runtime).
# force_remount=True asks Colab to redo the mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Make the shared-drive project folder the working directory so that the
# relative file names used below (CSV, GloVe vectors, saved model) resolve there.
import os
os.chdir('/content/gdrive/Shared drives/AI4ALL SFU NLP GROUP 3/WINE')

In [None]:
# Load the wine reviews CSV from the current working directory into a DataFrame.
df = pd.read_csv('winemag-data-130k-v2.csv')

In [None]:
# Preview the first 10 rows of the raw data.
df.head(10)

In [None]:
# Keep just the review text and its grape variety; .copy() detaches the
# result from `df` so later edits do not touch the raw data.
wine_df = df.loc[:, ['description', 'variety']].copy()

In [None]:
# Display the two-column frame (description, variety).
wine_df

In [None]:
# Show the five most frequent varieties together with their review counts.
wine_df['variety'].value_counts().head(5)

In [None]:
# Names of the five most frequent varieties, most common first.
topVarieties = list(wine_df['variety'].value_counts().index[:5])

In [None]:
# Restrict the data to reviews of the top varieties; copy so the new frame
# can be modified independently of wine_df.
wine_df_short = wine_df.loc[wine_df['variety'].isin(topVarieties)].copy()

In [None]:
# Display the filtered frame (top varieties only).
wine_df_short

In [None]:
# Encode each variety name as an integer class id using pandas category codes.
# The same category ordering is used when building `wineNames` for decoding.
wine_df_short['variety_num'] = wine_df_short['variety'].astype('category').cat.codes


In [None]:
# Lookup from integer class id back to the variety name (id -> name), built
# from the same category list that produced the `variety_num` codes.
wineNames = dict( enumerate(wine_df_short['variety'].astype('category').cat.categories ) )
# Display the mapping.
wineNames

In [None]:
# Display the frame again, now including the `variety_num` column.
wine_df_short

In [None]:
# Split our data into training and test sets (80/20).
# The rows are shuffled once and then cut at the 80% mark with .iloc.
# This avoids np.split on a DataFrame, which goes through the deprecated
# DataFrame.swapaxes, and .copy() makes both frames independent so that the
# later column assignments do not write into a slice of the shuffled frame.
shuffled_df = wine_df_short.sample(frac=1)
split_at = int(.8 * len(shuffled_df))
train_df = shuffled_df.iloc[:split_at].copy()
test_df = shuffled_df.iloc[split_at:].copy()

In [None]:
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (needed before stopwords.words can be used).
nltk.download('stopwords')

# Start from the English stop words, then add every word that appears in a
# target variety name so the label text itself is filtered out of the reviews.
stops = stopwords.words('english')
stops.extend(
    part
    for variety in topVarieties
    for part in variety.lower().split(' ')
)

# Show the tail of the list, where the variety words were appended.
print(stops[-15:])


In [None]:
def cleanText(text, stop_words=None):
    """Normalise a wine review before tokenisation.

    Newlines become spaces, the text is lower-cased and split on single
    spaces. Stop words are dropped, then punctuation and digits are removed
    from the surviving tokens.

    A token is compared against the stop words with its leading and trailing
    punctuation stripped, so "noir," or "(pinot" are filtered just like
    "noir" and "pinot". Without this, a stop word followed by a comma or
    full stop would pass the filter and end up in the output once the
    punctuation is removed.

    Args:
        text: Raw review text.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, the notebook-level ``stops`` list is used.

    Returns:
        The cleaned text: surviving tokens joined by single spaces. Tokens
        that are empty after cleaning (e.g. pure numbers) are omitted.
    """
    if stop_words is None:
        stop_words = stops
    # Set membership is O(1); the list lookup was a linear scan per token.
    blocked = set(stop_words)
    no_punct = str.maketrans('', '', string.punctuation)

    kept = []
    for raw in text.replace("\n", " ").lower().split(' '):
        if raw.strip(string.punctuation) in blocked:
            continue
        bare = raw.translate(no_punct)
        bare = ''.join(ch for ch in bare if not ch.isdigit())
        if bare:
            kept.append(bare)
    return ' '.join(kept)

In [None]:
# Run every review in both splits through cleanText, replacing the raw text.
train_df['description'] = train_df['description'].map(cleanText)
test_df['description'] = test_df['description'].map(cleanText)

In [None]:
# Display the training frame after text cleaning.
train_df

In [None]:
# Features: the cleaned review text. Labels: the integer variety codes.
X = train_df['description']
y = train_df['variety_num']

In [None]:
# Vocabulary limit passed to the tokenizer; also the row count of the embedding matrix.
vocab_size = 20000
# Length that every tokenized review is padded or truncated to.
max_seq_length = 400

In [None]:
# Construct a tokenizer using Keras
# Keras tokenizer limited to `vocab_size` words when encoding text.
X_tokenizer = text.Tokenizer(num_words=vocab_size)

In [None]:
# Fit the tokenizer on our text
# Learn the word -> index vocabulary from the training reviews only.
X_tokenizer.fit_on_texts(X.tolist())

In [None]:
# Encode words in sentences as a list of integer sequences
# Encode each training review as a list of integer word indices.
X_tokenized = X_tokenizer.texts_to_sequences(X)

In [None]:
# Pad sentences to maximum sequence length
# Bring every sequence to exactly max_seq_length entries; the result is
# split into training and validation parts further down.
X_train_val = sequence.pad_sequences(X_tokenized, maxlen=max_seq_length)

In [None]:
# # Save our fitted tokenizer for future use
# with open('wine_tokenizer.pkl', 'wb') as f:
#     pickle.dump(X_tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# # Download the GloVe embeddings and unzip the file
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [None]:
# Load the pre-trained GloVe word vectors into a {word: vector} lookup.
embedding_dim = 100
embeddings_index = dict()
# Each line is "<word> <v1> ... <vN>". The file is opened explicitly as UTF-8
# (it contains non-ASCII tokens, so the platform default encoding is not
# safe) and inside `with` so the handle is closed even if parsing fails.
# The file name is derived from embedding_dim so the two cannot disagree.
with open(f'glove.6B.{embedding_dim}d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f'Found {len(embeddings_index)} word vectors')

In [None]:
# Build the (vocab_size, embedding_dim) weight matrix for the Embedding layer.
# Row i holds the GloVe vector of the word with tokenizer index i; words
# without a GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in X_tokenizer.word_index.items():
    # Stop at the first index that does not fit into the matrix.
    if row >= vocab_size:
        break
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids so they match the softmax output and
# the categorical cross-entropy loss used by the model.
y = to_categorical(np.array(y))

# The bare string below is free-form notes contrasting a sigmoid output
# (one score) with a softmax output (one probability per class, summing
# to 1); it does not affect `y`.
'''
sigmoid:
0.88
1

softmax:
[0.10, 0.35, 0.55] = 1
2
[0, 0, 1]

'''


In [None]:
# Hyperparameters
# Number of filters in each Conv1D layer.
filters = 250
# Width of the convolution window (in tokens).
kernel_size = 3
# Units in the dense layer that follows the global pooling.
hidden_dims = 300

In [None]:
# 1D-CNN text classifier on top of frozen, pre-trained word embeddings.
# The output width is derived from the label mapping instead of being
# hard-coded, so changing the number of varieties needs no edit here.
num_classes = len(wineNames)

model = Sequential()
# Embedding weights come from embedding_matrix and are not updated in training.
model.add(Embedding(vocab_size,
                    embedding_dim,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=False))
model.add(Dropout(0.1))
# First convolution, then halve the sequence length.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='same',
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2,
                       strides=None,
                       padding='same',))
# Second convolution, then keep the maximum of each filter over the sequence.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='same',
                 activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.1))
# One softmax probability per wine variety.
model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output;
# train with Adam and report accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Number of samples processed per weight update
batch_size= 16
# Number of full passes over the training data
epochs = 3

In [None]:
# Hold out 20% of the padded training sequences (and their one-hot labels)
# as a validation set. No random_state is given, so the split differs per run.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y, test_size=0.20)

In [None]:
# Fit the model on the training part and evaluate on the validation part
# after each epoch; `hist` keeps the object returned by fit.
hist = model.fit(X_train, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(X_val, y_val)
                 )

In [None]:
# Replace the in-memory `model` with the one stored in wine_cnn_5_wines.h5.
# NOTE(review): if the model was just trained in this session, running this
# cell discards that training in favour of the file's contents — presumably
# meant for sessions that skip training; confirm the intended cell order.
model = tf.keras.models.load_model('wine_cnn_5_wines.h5')

In [None]:
# Write the current in-memory model to wine_cnn_5_wines.h5 in the working directory.
model.save('wine_cnn_5_wines.h5')


In [None]:
# Take the cleaned test descriptions and their integer labels as arrays.
X_test = test_df['description'].values
y_test = test_df['variety_num'].values
# Quick look at the first label.
print(y_test[0:1])

In [None]:
# Encode words in sentences as a list of integer sequences
X_test_tokenized = X_tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad sentences to maximum sequence length
X_test_val = sequence.pad_sequences(X_test_tokenized, maxlen=max_seq_length)

In [None]:
# y_pred = model.predict(X_test_val)
# Take the index of the highest score on the last axis to get one predicted
# class id per review, comparable with y_test.
y_pred = np.argmax(model.predict(X_test_val), axis=-1)


In [None]:
# Show the first ten cleaned test descriptions as a plain list.
test_df['description'].head(10).tolist()

In [None]:

# Compare the first few true and predicted class ids.
print(y_test[:10])
print(y_pred[:10])

# Pin the label order so rows and columns always line up with wineNames,
# even if a class happens to be missing from this test sample.
class_ids = sorted(wineNames)
class_names = [wineNames[class_id] for class_id in class_ids]
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# sns.heatmap returns the Axes it drew on; label it with the variety names.
ax = sns.heatmap(cm, annot=True, fmt="d",
                 xticklabels=class_names, yticklabels=class_names)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("Actual Label")
# plt.show() takes no figure/axes argument; it renders the open figures.
plt.show()


In [None]:
# Fraction of test reviews whose predicted class id equals the true one.
accuracy_score(y_test, y_pred)

In [None]:
# NOTE(review): installs the nightly TensorFlow build from inside the
# notebook, after the model has already been trained and evaluated above.
# Presumably a leftover experiment — confirm whether it is still needed.
!pip install tf-nightly

In [None]:
# Print the first ten cleaned test descriptions, then display the matching
# rows (handy as sample inputs for the interactive cell below).
print((test_df.head(10).description).tolist())
test_df.head(10)

In [None]:
# Interactive demo: read a free-text description, predict the wine variety,
# and repeat until the user types 'exit'.
prompt = "Enter a description of what you want your wine to taste like: "
while True:
    wineReview = input(prompt)
    if wineReview == 'exit':
        break
    # Apply the same cleaning, tokenizing and padding as for the training data.
    wineReview = cleanText(wineReview)
    wine_tokenized = X_tokenizer.texts_to_sequences([wineReview])
    wine_padded = sequence.pad_sequences(wine_tokenized[0:1], maxlen=max_seq_length)
    # predict returns one row of class scores; argmax gives a 1-element array.
    # Index it explicitly, because int() on a non-scalar array is deprecated.
    predictedWine = int(np.argmax(model.predict(wine_padded), axis=-1)[0])
    print(wineNames[predictedWine])