#### Amazon's food product dataset (Sentiment analysis using transformers)

In [1]:
## libraries
import pandas as pd
import numpy as np
import transformers
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## load data
## Data Loading
df = pd.read_csv('./validatedReviews.csv')

In [3]:
df = df.head(20000)

In [4]:
## independent and dependent features
X = df[['Text']]
y = df[['SentimentScore']]

In [5]:
## train test split
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.20)

In [29]:
X_test

Unnamed: 0,Text
18218,This is awesome stuff. I'm a weight watcher an...
14777,I bought this for the first time on Amazon a f...
18435,Ordered this because I've enjoyed Japanese-sty...
18948,My husband and I absolutely love this tea! It ...
14476,After getting used to using product where I li...
...,...
11341,Woooowwww ! The ORGANIC BLEND is probably the ...
13932,These taste amazing! Best jerky I've ever had....
7873,These chips are my family's favorite. They ar...
6737,"This one tasted pretty good. Very tangy, but ..."


In [6]:
## text cleaning
import re
def clean(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text

In [30]:

def cleaned_text(val):
    corpus = []
    for sentence in val['Text']:
        corpus.append(clean(sentence))
    return corpus

In [31]:
X_cleaned_train = cleaned_text(X_train)
X_cleaned_test = cleaned_text(X_test)

In [34]:
len(X_cleaned_test)

4000

In [21]:
## maxlength
max_length = 0
for sentence in X_cleaned_train:
    max_length = max(len(sentence),max_length)

In [28]:
len(X_cleaned_test)

20000

In [35]:


tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")

Tokenized_Train_data = tokenizer(X_cleaned_train, truncation=True, padding="max_length", max_length=500)
Tokenized_Test_data = tokenizer(X_cleaned_test, truncation=True, padding="max_length", max_length=500)


In [37]:
len(Tokenized_Test_data['input_ids'])

4000

In [36]:


# Convert tokenized data to tensors
train_inputs = {
    "input_ids": tf.convert_to_tensor(Tokenized_Train_data["input_ids"]),
    "attention_mask": tf.convert_to_tensor(Tokenized_Train_data["attention_mask"])
}

test_inputs = {
    "input_ids": tf.convert_to_tensor(Tokenized_Test_data["input_ids"]),
    "attention_mask": tf.convert_to_tensor(Tokenized_Test_data["attention_mask"])
}

# Convert labels to tensors
Y_train = tf.convert_to_tensor(Y_train)
Y_test= tf.convert_to_tensor(Y_test)


In [38]:
len(Y_train)

16000

In [14]:
%pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 419.4 kB/s eta 0:00:03
   ------------ --------------------------- 0.5/1.7 MB 419.4 kB/s eta 0:00:03
   ------------ --------------------------- 0.5/1.7 MB 419.4 kB/s eta 0:00:03
   ------------------ --------------------- 0.8/1.7 MB 466.0 kB/s eta 0:00:03
   ------------------ --------------------- 0.8/1.7 MB 466.0 kB/s eta 0:00:03
   ------------------------ --------------- 1.0/1.7 MB 47

In [15]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)






To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [17]:
from tensorflow.keras.optimizers import Adam

# Compile the model
model.compile(
    optimizer='Adam',
    loss="sparse_categorical_crossentropy",  # Use appropriate loss for your task
    metrics=["accuracy"]
)


TensorShape([4000, 1])

In [39]:
# Train the model
history = model.fit(
    train_inputs,
    Y_train,
    validation_data=(test_inputs, Y_test),
    epochs=3,  # Adjust based on performance
    batch_size=10  # Adjust based on memory constraints
)


Epoch 1/3


  21/1600 [..............................] - ETA: 14:52:43 - loss: 3.9514 - accuracy: 0.7238

KeyboardInterrupt: 

In [40]:
# Evaluate the model
results = model.evaluate(test_inputs, Y_test)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")


Test Loss: 3.6225430965423584, Test Accuracy: 0.7752500176429749


In [41]:
# Make predictions
predictions = model.predict(test_inputs)
predicted_labels = tf.argmax(predictions.logits, axis=1)


  1/125 [..............................] - ETA: 35:25

KeyboardInterrupt: 