'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JoengYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

In [1]:
import sys
path_append = "../"
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if path_append not in sys.path:
    sys.path.append(path_append)  # Go up one directory from where you are.

# Project-local import; it resolves only after the path entry above is in place.
from nn.utils.init import set_random_seed
# Call the project's seeding helper with seed 0.
# NOTE(review): which RNGs it seeds (torch / numpy / random) is not visible here - confirm in nn.utils.init.
set_random_seed(0)

In [2]:
import pandas as pd

# Location of the review export, relative to the notebook's working directory.
reviews_csv = '../../data/Amazon reviews/amazon_reviews.csv'

# Read the whole CSV into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,0899edc2-6dd0-4e40-8471-6836dfc52b00,Quintasha Jackson,I love Amazon ❤️,5,0,28.9.2.100,2024-05-14 23:17:13,28.9.2.100
1,dc8496a1-bb8f-40cd-9ac7-5dc2ba1a6703,Tiffany Boisvert,difficult to figure out,1,0,28.7.0.100,2024-05-14 23:16:52,28.7.0.100
2,3492103d-2761-4385-b764-d7d2351d6996,Kim Hilliker,"wonderful and fast, efficient a d great custom...",5,0,28.9.2.100,2024-05-14 22:41:20,28.9.2.100
3,b49415d0-0f8e-48c9-bf29-718be6cc8b67,Joshua Dickenson,"""Your orders"" screen keeps flashing, can't see...",1,0,28.9.2.100,2024-05-14 22:39:32,28.9.2.100
4,bee4d6f7-dba0-4895-946e-80432f769eb5,Mohammed Abdalla,💙💙💙,5,0,28.7.0.100,2024-05-14 22:34:00,28.7.0.100


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class PretrainedModelDataset(Dataset):
    """Map-style dataset that serves pretrained-model hidden states for review text.

    Rows of ``df`` are visited in a random order fixed at construction time
    (``batch_indices``). Texts are tokenized and passed through ``model`` in
    chunks of ``precompute_batches`` rows; only the most recently encoded chunk
    is kept in ``X_cache`` / ``y_cache``.

    Sequential access (idx 0, 1, 2, ...) encodes each chunk once per pass.
    Non-sequential access is still correct, but every switch to a different
    chunk triggers a re-encode of that chunk, so a shuffling sampler will be
    much slower than a sequential one.

    Args:
        df: DataFrame with a ``content`` column (text) and a ``score`` column.
            ``score`` is shifted down by 1 on an internal copy, so values are
            expected to start at 1.
        tokenizer: Callable used as ``tokenizer(texts, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        model: Callable used as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_classes: Number of classes for the one-hot labels.
        device: Device the tokenized inputs are moved to before the forward pass.
        max_length: Padded/truncated token length passed to the tokenizer.
        precompute_batches: Number of rows encoded (and cached) at a time.
        **kwargs: Accepted for call-site compatibility and ignored.

    Raises:
        ValueError: If ``precompute_batches`` is smaller than 1.
    """

    def __init__(self, df, tokenizer, model, num_classes, device, max_length=512, precompute_batches=64, **kwargs):
        if precompute_batches < 1:
            raise ValueError("precompute_batches must be a positive integer")

        # Work on a copy so the caller's frame keeps its original scores.
        self.df = df.copy()
        self.df['score'] = self.df['score'] - 1
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes
        self.device = device
        self.precompute_batches = precompute_batches
        self.X_cache = None
        self.y_cache = None
        # Start offset (into batch_indices) of the chunk currently held in the
        # cache; None means nothing has been encoded yet.
        self._cached_start = None
        self.dataset_length = len(self.df)
        # Random permutation of all row positions: fixes the visiting order.
        self.batch_indices = torch.randperm(len(self.df))
        # Nothing to encode for an empty frame.
        if self.dataset_length > 0:
            self._precompute_batches(0)

    def _precompute_batches(self, start_idx):
        """Encode the chunk starting at ``start_idx`` and store it in the cache.

        The chunk covers ``batch_indices[start_idx : start_idx + precompute_batches]``,
        clipped to the dataset length, so the final chunk may be shorter.
        """
        end_idx = min(start_idx + self.precompute_batches, self.dataset_length)
        batch_indices = self.batch_indices[start_idx:end_idx].tolist()

        # Gather batch data with one positional selection per column.
        X_batch = self.df["content"].iloc[batch_indices].tolist()
        y_batch = self.df["score"].iloc[batch_indices].tolist()

        # Tokenize the batch
        X = self.tokenizer(X_batch, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')

        # Move inputs to the correct device
        input_ids = X['input_ids'].to(self.device)
        attention_mask = X['attention_mask'].to(self.device)

        # Get the last hidden state from the pretrained model without tracking gradients
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state

        # One-hot encode the labels and repeat them along dimension 1 of the hidden states
        y_one_hot = torch.nn.functional.one_hot(torch.tensor(y_batch, dtype=torch.long), num_classes=self.num_classes)
        y_one_hot = y_one_hot.unsqueeze(1).repeat(1, last_hidden_state.size(1), 1)  # Shape: (batch_size, sequence_length, num_classes)

        # Store precomputed batch in cache.
        # NOTE: labels are built on the CPU and are not moved to self.device here.
        self.X_cache = last_hidden_state
        self.y_cache = y_one_hot
        self._cached_start = start_idx

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return self.dataset_length

    def __getitem__(self, idx):
        """Return ``(hidden_states, one_hot_labels)`` for position ``idx``.

        ``idx`` addresses the shuffled order (``batch_indices``), not the
        original row order. Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        if idx < 0:
            idx += self.dataset_length
        if not 0 <= idx < self.dataset_length:
            raise IndexError("index out of range for PretrainedModelDataset")

        # Re-encode whenever the requested item is not in the cached chunk.
        # Refreshing only at chunk boundaries would hand back samples from a
        # stale chunk as soon as access is not strictly sequential.
        chunk_start = (idx // self.precompute_batches) * self.precompute_batches
        if chunk_start != self._cached_start:
            self._precompute_batches(chunk_start)

        offset = idx - chunk_start
        X = self.X_cache[offset]
        y = self.y_cache[offset]
        return X, y

In [4]:
from sklearn.model_selection import train_test_split

# Initialize tokenizer and model
TARGET_MODEL = "cardiffnlp/twitter-roberta-base-irony"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=True)
# NOTE(review): this replaces whatever pad token the tokenizer ships with by its
# EOS token - confirm that is intended for this checkpoint.
tokenizer.pad_token = tokenizer.eos_token
# The model is only used as a frozen feature extractor, so switch it to eval mode.
pretrained_model = AutoModel.from_pretrained(TARGET_MODEL).to(device)
pretrained_model.eval()

# Stratified 80/20 split on the review score. random_state is pinned so the
# split does not depend on whatever global RNG state happens to be active.
train_df, test_df = train_test_split(df, stratify=df["score"], test_size=0.2, random_state=0)
num_classes = 5
# Create datasets (both share the same tokenizer and frozen model)
trainset = PretrainedModelDataset(train_df, tokenizer, pretrained_model, num_classes, device, max_length=128)
testset = PretrainedModelDataset(test_df, tokenizer, pretrained_model, num_classes, device, max_length=128)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-irony and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub
import torch

# Describe the dataset/task: each observation has the pretrained model's
# hidden size, and there is one label slot per class.
data_config = DataConfig(
    dataset_name='amazon_reviews',
    task_type='multi_class_classification',
    obs_shape=[pretrained_model.config.hidden_size],
    label_size=num_classes,
)

# Model selection via MLParameters: 'gpt' as the core model, no encoder model.
ml_params = MLParameters(core_model='gpt', encoder_model='none')

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the trainer from the parameters, data description and device;
# console printing is enabled and wandb logging is disabled.
trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=True,
    use_wandb=False,
)

In [6]:
# Start training on `trainset`; `testset` is passed alongside it, and the
# captured output below reports training and test metrics periodically.
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/633 [00:00<?, ?it/s]

[0/100][50/633][Time 24.71]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0742	Gen: 0.3940	Rec: 0.3872	E: 0.0810	R: 0.0673	P: 0.7070
--------------------Test Metrics------------------------
accuracy: 0.6438
precision: 0.2733
recall: 0.3588
f1_score: 0.3077

[0/100][100/633][Time 22.86]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0280	Gen: 0.3219	Rec: 0.3188	E: 0.0311	R: 0.0249	P: 0.6127
--------------------Test Metrics------------------------
accuracy: 0.6562
precision: 0.2766
recall: 0.3608
f1_score: 0.3110

[0/100][150/633][Time 24.42]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0221	Gen: 0.3011	Rec: 0.2993	E: 0.0240	R: 0.0203	P: 0.5783
--------------------Test Metrics------------------------
accuracy: 0.6672
precision

Iterations:   0%|          | 0/633 [00:00<?, ?it/s]

[1/100][17/633][Time 25.03]
Unified LR across all optimizers: 0.00019409305492778308
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0159	Gen: 0.2517	Rec: 0.2505	E: 0.0171	R: 0.0147	P: 0.4864
--------------------Test Metrics------------------------
accuracy: 0.6953
precision: 0.3069
recall: 0.3588
f1_score: 0.3251

[1/100][67/633][Time 23.82]
Unified LR across all optimizers: 0.00019364665328896346
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0162	Gen: 0.2485	Rec: 0.2472	E: 0.0175	R: 0.0149	P: 0.4795
--------------------Test Metrics------------------------
accuracy: 0.6875
precision: 0.2975
recall: 0.3753
f1_score: 0.3285

[1/100][117/633][Time 24.28]
Unified LR across all optimizers: 0.00019320127834542263
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0172	Gen: 0.2468	Rec: 0.2454	E: 0.0187	R: 0.0158	P: 0.4750
--------------------Test Metrics------------------------
accuracy: 0.6766
precision