<a href="https://colab.research.google.com/github/Ashutowsh/Wines-Dataset---Text-Classification/blob/main/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using Wines Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

In [2]:
df = pd.read_csv("wine-reviews.csv", usecols = ['country', 'description', 'points', 'price', 'variety', 'winery'])

In [3]:
df = df.dropna(subset=["description", "points"])

In [4]:
df.head()

Unnamed: 0,country,description,points,price,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,,White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Pinot Noir,Sweet Cheeks


In [5]:
df["label"] = (df.points >= 90).astype(int)
df = df[["description", "label"]]

In [6]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])

In [7]:
# Directly copied from tensorflow (made some changes)
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
  df = dataframe.copy()
  labels = df.pop('label')
  df = df["description"]
  ds = tf.data.Dataset.from_tensor_slices((df, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [8]:
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

### Embedding and Training the model.

In [9]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)
# TensorFLow hub - repository for trained models.

In [10]:
hub_layer(list(train_data)[0][0])

<tf.Tensor: shape=(1024, 50), dtype=float32, numpy=
array([[ 0.43572155, -0.12857345,  0.24460793, ...,  0.32117817,
         0.15438409,  0.05400416],
       [ 0.17023821, -0.26612547,  0.01714339, ..., -0.00515128,
        -0.02984395,  0.05329841],
       [ 0.5538189 , -0.17118831, -0.06063344, ...,  0.16640645,
        -0.02392466,  0.01637364],
       ...,
       [ 0.44069508, -0.26134527, -0.04948728, ..., -0.09422014,
        -0.15437178,  0.00856982],
       [ 0.10422249, -0.03484819,  0.39258265, ..., -0.16278188,
         0.02741763,  0.14139651],
       [ 0.5611988 , -0.22352731, -0.04765796, ..., -0.20175932,
        -0.21689312,  0.11405749]], dtype=float32)>

In [11]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [12]:
model.evaluate(train_data)



[0.7434207797050476, 0.36054667830467224]

In [13]:
model.evaluate(valid_data)



[0.7363486886024475, 0.38974854350090027]

In [14]:
history = model.fit(train_data, epochs=5, validation_data=valid_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
model.evaluate(test_data)



[0.6442040205001831, 0.6508703827857971]

### Using LSTM

In [16]:
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
encoder.adapt(train_data.map(lambda text, label: text))

In [17]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'and', 'the', 'a', 'of', 'with', 'this', 'is', 'wine',
       'in', 'flavors', 'to', 'it', 'its', 'on', 'fruit', 'aromas',
       'palate', 'that'], dtype='<U17')

In [18]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        mask_zero=True
    ),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [19]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [20]:
model.evaluate(train_data)
model.evaluate(valid_data)



[0.693300187587738, 0.4719535708427429]

In [21]:
history = model.fit(train_data, epochs=5, validation_data=valid_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
model.evaluate(test_data)



[0.5772401690483093, 0.6682785153388977]