## Import & Install Statements

In [None]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [None]:
# Load the preprocessed review data (with VADER scores and imputations) into a dataframe.
dataset_csv_path = '/content/preprocessed_amazon_df_with_vader_scores_and_score_classification_and_imputations_fixed.csv'
amazon_imputed_df = pd.read_csv(dataset_csv_path)

In [None]:
# Sanity check: list the distinct values found in the 'text_vader_csat_classification' column.
# NOTE(review): the regression target used for training below is 'text_vader_sentiment',
# not this column - this cell is only an inspection of the categorical labels.
amazon_imputed_df['text_vader_csat_classification'].unique()

array(['Satisfied', 'Neutral', 'Very Satisfied', 'Dissatisfied'],
      dtype=object)

## Data Division into Training & Evaluation Subsets

In [None]:
# Let's divide the data into training & evaluation subsets for the training process.
#
# A few notes on the train-test split:
#     --> We went with an 80-20 split, as it felt like a good split for the data size - 19,991 rows.
#     --> We also ensured that the sentiment score column is of the float type, to avoid any potential issues.
#
# (These notes are kept as comments rather than a bare triple-quoted string: a string
# expression at the end of a cell is echoed back as the cell's output.)

train_text, eval_text, train_labels, eval_labels = train_test_split(
    amazon_imputed_df['cleaned_text'].tolist(),
    amazon_imputed_df['text_vader_sentiment'].astype(float).tolist(),
    test_size = 0.2,
    random_state = 42  # Fixed seed so the split is reproducible across runs.
)

'\nA few notes on the train-test split:\n    --> We went with an 80-20 split, as it felt like a good split for the data size - 19,991 rows.\n    --> We also ensured that the sentiment score column is of the float type, to avoid any potential issues.\n'

## Model Loading

In [None]:
# Load the pretrained RoBERTa tokenizer and model from the 'roberta-base' checkpoint.
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# num_labels = 1 gives the sequence-classification head a single output. With one label,
# Hugging Face treats the task as regression (MSE loss), which matches the float labels.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels = 1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze the first 3 encoder layers so their weights receive no gradient updates,
# which makes fine-tuning more computationally efficient.
frozen_layers = model.roberta.encoder.layer[:3]
for weight in frozen_layers.parameters():
  weight.requires_grad = False

# Display the dataframe's column dtypes as this cell's output.
amazon_imputed_df.dtypes

Unnamed: 0,0
title,object
text,object
asin,object
parent_asin,object
user_id,object
timestamp,int64
helpful_vote,int64
verified_purchase,bool
csat_rating,object
num_images,int64


## Text Preparation for Model Training

In [None]:
# Tokenize both splits with one shared set of options so they are encoded identically:
# pad each split to its longest sequence, truncate at 512 tokens, and return PyTorch tensors.
tokenizer_options = dict(padding = True, truncation = True, max_length = 512, return_tensors = "pt")
train_encodings = tokenizer(train_text, **tokenizer_options)
eval_encodings = tokenizer(eval_text, **tokenizer_options)

In [None]:
# Check the encoding shapes to make sure that the preparation we've done so far has taken effect, and won't cause any issues during training.
print('Shape of the train encodings & input ids:', train_encodings['input_ids'].shape)
print('Shape of the train labels:', len(train_labels))

# Fail fast instead of relying on a visual comparison of the printed shapes:
# every encoded row must have exactly one label, for both splits.
assert train_encodings['input_ids'].shape[0] == len(train_labels), 'Train encodings and labels are misaligned.'
assert eval_encodings['input_ids'].shape[0] == len(eval_labels), 'Eval encodings and labels are misaligned.'

Shape of the train encodings & input ids: torch.Size([15992, 512])
Shape of the train labels: 15992


In [None]:
# Convert the labels to 1-dimensional float32 torch tensors.
# torch.as_tensor is used instead of torch.tensor so that re-running this cell (when the
# names already hold tensors) is a no-op rather than a copy that triggers a UserWarning.
train_labels = torch.as_tensor(train_labels, dtype = torch.float32)
eval_labels = torch.as_tensor(eval_labels, dtype = torch.float32)

In [None]:
# Check the label shapes to make sure that the preparation we've done so far has taken effect, and won't cause any issues during training.
print('Shape of the train_labels tensor:', train_labels.shape)
print('Shape of the eval_labels tensor:', eval_labels.shape)

Shape of the train_labels tensor: torch.Size([15992])
Shape of the eval_labels tensor: torch.Size([3999])


In [None]:
# Define a custom dataset class
class CSATDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs tokenizer encodings with one regression label per row.

  Args:
    encodings: Mapping of field name (e.g. 'input_ids') to an indexable batch of values;
      every field must have the same number of rows as `labels`.
    labels: Indexable sequence/tensor holding one label per row.

  Raises:
    ValueError: If any encoding field's length differs from the number of labels.
  """

  def __init__(self, encodings, labels):
    # Validate alignment up front; a mismatch would otherwise only surface as an
    # IndexError (or silently dropped rows) part-way through training.
    n_labels = len(labels)
    for key, values in encodings.items():
      if len(values) != n_labels:
        raise ValueError(
          f"Encoding field '{key}' has {len(values)} rows but {n_labels} labels were given."
        )
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    """Return a dict with every encoding field at row `idx`, plus its label under 'labels'."""
    item = {key: val[idx] for key, val in self.encodings.items()}
    item['labels'] = self.labels[idx]  # Indexing a 1-D label tensor yields a scalar, with no extra dimension.
    return item

  def __len__(self):
    """Return the number of examples, defined by the number of labels."""
    return len(self.labels)

In [None]:
# Check the layers' gradient flags one last time to make sure that the layer freezing took effect.

def inspect_model_layers(model):
  """Print the freeze status of each encoder layer and the trainability of the head.

  Args:
    model: Object exposing `roberta.encoder.layer` (an iterable of modules) and
      `classifier` (a module), each providing `.parameters()`.

  A layer is reported as frozen only when none of its parameters requires gradients;
  the output layer is reported as trainable when at least one of its parameters does.
  """
  print('\nInspecing the RoBERTa layers:')
  for layer_number, encoder_layer in enumerate(model.roberta.encoder.layer, start = 1):
    is_frozen = not any(weight.requires_grad for weight in encoder_layer.parameters())
    print(f'Layer {layer_number} - Frozen: {is_frozen}')

  print('\nInspecting the Output Layer:')
  head_is_trainable = any(weight.requires_grad for weight in model.classifier.parameters())
  print(f'Output Layer = Trainable: {head_is_trainable}')

inspect_model_layers(model)


Inspecing the RoBERTa layers:
Layer 1 - Frozen: True
Layer 2 - Frozen: True
Layer 3 - Frozen: True
Layer 4 - Frozen: False
Layer 5 - Frozen: False
Layer 6 - Frozen: False
Layer 7 - Frozen: False
Layer 8 - Frozen: False
Layer 9 - Frozen: False
Layer 10 - Frozen: False
Layer 11 - Frozen: False
Layer 12 - Frozen: False

Inspecting the Output Layer:
Output Layer = Trainable: True


In [None]:
# Create the training & evaluation datasets
train_dataset = CSATDataset(train_encodings, train_labels)
eval_dataset = CSATDataset(eval_encodings, eval_labels)

## Training Arguments Setup & Model Training

In [None]:
# Training configuration for the RoBERTa regression fine-tuning run.
training_args = TrainingArguments(
    output_dir = './roberta_csat_regression',  # Checkpoints are written here.
    eval_strategy = 'steps',
    # Spell out the evaluation cadence. When eval_steps is omitted it silently falls back
    # to logging_steps, so changing the logging frequency would also change how often the
    # model is evaluated. 1000 keeps the previous behaviour.
    eval_steps = 1000,
    save_steps = 2000,       # Checkpoint every 2000 optimizer steps...
    save_total_limit = 5,    # ...and keep at most 5 checkpoints on disk.
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 3,
    logging_dir = './logs',
    logging_steps = 1000
)

In [None]:
# Bundle the model, the training configuration and both datasets into a Trainer.
# NOTE(review): no `compute_metrics` is passed, so evaluation reports only the loss
# (see the 'Validation Loss' column in the training log).
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset
)

In [None]:
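# Weights & Biases authentication: supply the API key via the WANDB_API_KEY environment variable or paste it at the `wandb` prompt during training - never store the key in the notebook itself.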

In [None]:
# Train the model!
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
1000,0.1106,0.079825


Step,Training Loss,Validation Loss
1000,0.1106,0.079825
2000,0.0725,0.063962
3000,0.0512,0.058187
4000,0.0422,0.045123
5000,0.0324,0.042266


TrainOutput(global_step=5997, training_loss=0.05605305252978458, metrics={'train_runtime': 1082.4791, 'train_samples_per_second': 44.32, 'train_steps_per_second': 5.54, 'total_flos': 1.2622902654984192e+16, 'train_loss': 0.05605305252978458, 'epoch': 3.0})

In [None]:
model.save_pretrained('./results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3')
tokenizer.save_pretrained('./results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3')
!zip -r BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3.zip ./results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3

  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/ (stored 0%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/config.json (deflated 48%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/model.safetensors (deflated 19%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/tokenizer_config.json (deflated 76%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/vocab.json (deflated 68%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/special_tokens_map.json (deflated 84%)
  adding: results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3/merges.txt (deflated 53%)


## Model Reloading & Testing

In [None]:
# Reload the fine-tuned artifacts from the directory they were saved to.
# The same relative path as the save step is used (rather than an absolute '/content/...'
# path) so the reload targets the saved directory regardless of the working directory.
model_name = './results/BDAE_Codebase_2_CSAT_Score_Prediction_Model_fixed_3'

# Reload the tokenizer from the saved directory as well, so the testing section does not
# depend on a tokenizer object left over in the kernel from the training section.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Switch to evaluation mode (disables dropout) before running inference.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
test_reviews = [
    "The product is amazing! It exceeded all my expectations, and the quality is top-notch. I couldn't be happier with my purchase.",
    "The product works well and is good value for the money. There were a few minor issues, but overall I am satisfied.",
    "The product is okay. It does what it is supposed to, but there is nothing exceptional about it. It met my expectations but didn't exceed them.",
    "I am disappointed with the product. It does not work as advertised, and the quality is not worth the money spent. I would not recommend this.",
    "Terrible experience. The product broke within a week, and customer support was not helpful. I regret buying this."
]

In [None]:
# Tokenize the sample reviews with the same padding/truncation settings used for the training data.
inputs = tokenizer(test_reviews, padding = True, truncation = True, max_length = 512, return_tensors = 'pt')

# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
  outputs = model(**inputs)
  # squeeze(-1) drops the last dimension when it has size 1, leaving one raw score per review.
  # NOTE(review): presumably logits are (n_reviews, 1) because the model was trained with num_labels = 1 - confirm.
  predictions = outputs.logits.squeeze(-1)

def scale_score_to_1_10(score):
  """Linearly map a sentiment score from [-1, 1] onto the CSAT scale [1, 10].

  The regression head's output is unbounded, so a raw prediction can fall slightly outside
  [-1, 1]. The score is clamped to that interval first, so the result always stays on the
  advertised 1-10 scale.

  Args:
    score: Scalar raw model output (expected to lie roughly within [-1, 1]).

  Returns:
    float: -1 maps to 1, 0 maps to 5.5, 1 maps to 10; out-of-range inputs are clamped.
  """
  # Argument order keeps `score` first so that a NaN input propagates instead of being
  # silently replaced by one of the bounds.
  bounded_score = max(min(score, 1.0), -1.0)
  return 1 + 4.5 * (bounded_score + 1)

# Report each sample review alongside its predicted score, rescaled to the 1-10 CSAT range.
for review_number, (review, prediction) in enumerate(zip(test_reviews, predictions), start = 1):
  scaled_score = scale_score_to_1_10(prediction.item())
  print(f'CSAT Score for Review #{review_number} (on a scale from 1-10): {scaled_score:.2f}')
  print(f'Review Text: {review}\n')

CSAT Score for Review #1 (on a scale from 1-10): 8.59
Review Text: The product is amazing! It exceeded all my expectations, and the quality is top-notch. I couldn't be happier with my purchase.

CSAT Score for Review #2 (on a scale from 1-10): 9.28
Review Text: The product works well and is good value for the money. There were a few minor issues, but overall I am satisfied.

CSAT Score for Review #3 (on a scale from 1-10): 5.40
Review Text: The product is okay. It does what it is supposed to, but there is nothing exceptional about it. It met my expectations but didn't exceed them.

CSAT Score for Review #4 (on a scale from 1-10): 2.43
Review Text: I am disappointed with the product. It does not work as advertised, and the quality is not worth the money spent. I would not recommend this.

CSAT Score for Review #5 (on a scale from 1-10): 2.23
Review Text: Terrible experience. The product broke within a week, and customer support was not helpful. I regret buying this.

