<a href="https://colab.research.google.com/github/Bryan-Az/Adv-Keras-NNs/blob/main/Part1/NLP-Model/NLP_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keras NLP: Text Classification
In this notebook, I will use text classification techniques using the Keras machine learning library to classify text review data for Amazon fashion products.

## Imports

In [1]:
# loads the libraries used in this notebook
import tensorflow
from tensorflow import feature_column
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
tensorflow.__version__

'2.16.1'

In [3]:
# importing pre-trained models for finetuning
import keras
import keras_nlp

In [41]:
# imported to augment the text data using AugLy
!pip install AugLy



In [45]:
! pip install nlpaug
import augly.text as txtaugs

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/410.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m409.6/410.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


## The Data
The dataset being used is the Amazon fashion reviews dataset from Julian McAuley's database from UC San Diego.

In [5]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# importing the file into the environment using google drive

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

fname_link_gdrive = {'combined_preprocessed.csv':'https://drive.google.com/file/d/1rckKZE69WtBwZtI1DfsMXQMHe2gtJ3df/view?usp=sharing'}



In [6]:
csv_name = list(fname_link_gdrive.keys())[0]
id = fname_link_gdrive[csv_name].split("/")[-2]
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile(csv_name)

In [74]:
fashion_reviews = pd.read_csv('combined_preprocessed.csv')

In [75]:
fashion_reviews.fillna('', inplace=True)

## Augmenting the Text Data with FastAI

In [76]:
# we're going to augment the 'reviewText' feature in the fashion_reviews table (our x) using back translation to support more languages
# using AugLy
#first we inserted punctuation in random places
df_augmented = txtaugs.insert_punctuation_chars(texts=fashion_reviews['reviewText'].to_list())
# lets also apply word replacement to add variety
df_augmented = txtaugs.replace_words(df_augmented)
# and finally, word merging
df_augmented = txtaugs.merge_words(df_augmented)

In [77]:
df_augmented = pd.Series(df_augmented, name='reviewText')

In [78]:
fashion_reviews.shape

(3079, 20)

## The Keras Text Classification Model

In [17]:
reviewText_pretrained_classifier = keras_nlp.models.BertClassifier.from_preset("bert_tiny_en_uncased", num_classes=2)

In [18]:
def convert_prediction(prediction):
  """
  Converts a model prediction to a binary 0/1 if index 1 is greater.

  Args:
    prediction: A NumPy array containing the model prediction.

  Returns:
    A binary value (0 or 1) based on the prediction.
  """

  if prediction[1] > prediction[0]:
    return 1
  else:
    return 0

In [84]:
# sentiment is going to be used (the bert model is pretrained to predict sentiment)
# we can simulate a sentiment value from our review stars data
# assuming anything >= 4 is a good review sentiment
fashion_reviews['sentiment'] = fashion_reviews['overall'].copy().astype(int)
fashion_reviews['sentiment'] = fashion_reviews.sentiment.apply(lambda x: 1 if x >= 4 else 0)

text_vars = ['reviewText', 'sentiment']

fashion_reviews['reviewText'] = fashion_reviews['reviewText'].fillna(' ')

fashion_reviews['sentiment'] = fashion_reviews['sentiment'].fillna(0)

In [85]:
df_augmented = pd.DataFrame({'reviewText': df_augmented, 'sentiment': fashion_reviews['sentiment']})

In [91]:
fashion_reviews_aug = pd.concat([fashion_reviews.loc[:,text_vars], df_augmented])

In [92]:
fashion_reviews_aug.sample(5)

Unnamed: 0,reviewText,sentiment
829,Excellent,1
1190,I... ...l...o...v...e... ...m...y... ...t...e....,1
2064,L...o...v...e... ...t...h...e...s...e... ...s....,1
1614,"These shoes are extremely comfortable, and fit...",1
1321,"B,E,S,T, ,s,n,e,a,k,e,r,s, ,I,',v,e, ,e,v,e,r,...",1


In [93]:
X = fashion_reviews_aug.loc[:, 'reviewText'].astype(str)
y = fashion_reviews_aug.loc[:, 'sentiment']
X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20)

X_train = X_train.apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
X_test = X_test.apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

X_train_tfdata = tensorflow.data.Dataset.from_tensor_slices(X_train.values)
X_test_tfdata = tensorflow.data.Dataset.from_tensor_slices(X_test.values)

# the labels must be transformed to a categorical format using tf.keras.utils
train_ds = tensorflow.data.Dataset.zip((X_train_tfdata, tensorflow.data.Dataset.from_tensor_slices(y_train)))
test_ds = tensorflow.data.Dataset.zip((X_test_tfdata, tensorflow.data.Dataset.from_tensor_slices(y_test)))

In [94]:
train_ds.batch(1).take(1).get_single_element()

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'V;e;r;y; ;c;o;m;f;o;r;t;a;b;l;e;.'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>)

In [95]:
reviewText_pretrained_classifier.fit(
    train_ds.batch(batch_size=10),
    validation_data=test_ds.batch(batch_size=10),
    epochs=1,
)

[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m508s[0m 1s/step - loss: 0.3623 - sparse_categorical_accuracy: 0.8612 - val_loss: 0.2460 - val_sparse_categorical_accuracy: 0.9148


<keras.src.callbacks.history.History at 0x7fb35934b7c0>

In [96]:
sentiment_preds = reviewText_pretrained_classifier.predict(test_ds.batch(batch_size=10))

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 348ms/step


In [97]:
X_test_eval = X_test.copy()

In [98]:
sentiment_preds = [convert_prediction(pred) for pred in sentiment_preds]

In [99]:
test_predictions = pd.DataFrame({'reviewText': X_test, 'pretrained_finetuned_sentiment_inference': sentiment_preds})

In [102]:
test_predictions.sample(5)

Unnamed: 0,reviewText,pretrained_finetuned_sentiment_inference
2845,I love these and have them in a light blue too...,1
2405,A-b-s-o-l-u-t-e-l-y- -l-o-v-e- -t-h-e-s-e- -s-...,1
2783,E!x!t!r!e!m!e!l!y! !c!o!m!f!o!r!t!a!b!l!e! !s!...,1
2653,I: :l:o:v:e: :m:y: :t:e:n:n:i:s: :s:h:o:e:s,1
1402,Ordered 9(m) received 9 Wide for the second ti...,0
