In [37]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning
from torch import nn
import torch.utils.checkpoint as checkpoint
from transformers import BertModel
from sklearn.model_selection import train_test_split

2023-05-30 18:49:24.925384: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-30 18:49:27.247260: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from datasets import load_dataset

dataset = load_dataset("amazon_us_reviews", "Apparel_v1_00")
train_data = dataset['train']

# Limit the dataset to the first 100,000 rows
train_data = train_data.select(range(100000))

df = train_data.to_pandas()  # Convert the dataset to a Pandas DataFrame
df = df[['customer_id', 'review_headline', 'review_body', 'star_rating']]  # Select specific columns
df.columns = ['customer_id', 'review_headline', 'review_body', 'star_rating']  # Rename the selected columns
df.set_index('customer_id', inplace=True)
df.head()  # Display the first few rows of the DataFrame

Found cached dataset amazon_us_reviews (/home/z123010/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0_level_0,review_headline,review_body,star_rating
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,"These Really Do Work Great, But You Do Need To...",4
2714559,Favorite for winter. Very warm!,I love this dress. Absolute favorite for winte...,5
12608825,Great Socks for the money.,"Nice socks, great colors, just enough support ...",5
25482800,Slick hat!,"I bought this for my husband and WOW, this is ...",5
9310286,I would do it again!,Perfect dress and the customer service was awe...,5


In [7]:
df.star_rating.value_counts()

star_rating
5    53374
4    17763
1    11741
3    10431
2     6691
Name: count, dtype: int64

In [8]:
df['sentiment'] = df['star_rating'].map({5: 'good', 4: 'good', 3: 'neutral', 2: 'bad', 1: 'bad'})

In [9]:
df['sentiment'].value_counts()

sentiment
good       71137
bad        18432
neutral    10431
Name: count, dtype: int64

In [10]:
possible_labels = df.sentiment.unique() #Get unique category labels from the DataFrame column 'category'

In [11]:
label_dict = {} #Create a dictionary to map each possible label to a unique index
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [12]:
label_dict

{'good': 0, 'neutral': 1, 'bad': 2}

In [13]:
df['label'] = df.sentiment.replace(label_dict)
df.head(10)

Unnamed: 0_level_0,review_headline,review_body,star_rating,sentiment,label
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,"These Really Do Work Great, But You Do Need To...",4,good,0
2714559,Favorite for winter. Very warm!,I love this dress. Absolute favorite for winte...,5,good,0
12608825,Great Socks for the money.,"Nice socks, great colors, just enough support ...",5,good,0
25482800,Slick hat!,"I bought this for my husband and WOW, this is ...",5,good,0
9310286,I would do it again!,Perfect dress and the customer service was awe...,5,good,0
26631939,Five Stars,Excellent for my 6 feet skinny 15 years old boy.,5,good,0
48785098,Love it!,Raw is the only way to go! Absolutely love thi...,5,good,0
39548589,Three Stars,A bit large.,4,good,0
29355866,Five Stars,Great fit!,5,good,0
27477484,Not my favorite.,"Shirt a bit too long, with heavy hem, which in...",3,neutral,1


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
#Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values
)

In [16]:
df['data_type'] = ['not_set']*df.shape[0] #Set a new column 'data_type' for later data split

In [17]:
df.head()

Unnamed: 0_level_0,review_headline,review_body,star_rating,sentiment,label,data_type
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,"These Really Do Work Great, But You Do Need To...",4,good,0,not_set
2714559,Favorite for winter. Very warm!,I love this dress. Absolute favorite for winte...,5,good,0,not_set
12608825,Great Socks for the money.,"Nice socks, great colors, just enough support ...",5,good,0,not_set
25482800,Slick hat!,"I bought this for my husband and WOW, this is ...",5,good,0,not_set
9310286,I would do it again!,Perfect dress and the customer service was awe...,5,good,0,not_set


In [18]:
#Set the 'data_type' column of the dataframe for training and validation data
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [19]:
df.groupby(['star_rating', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review_headline,review_body,sentiment
star_rating,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,train,9581,9581,9581
1,2,val,2160,2160,2160
2,2,train,5463,5463,5463
2,2,val,1228,1228,1228
3,1,train,8439,8439,8439
3,1,val,1992,1992,1992
4,0,train,14202,14202,14202
4,0,val,3561,3561,3561
5,0,train,41279,41279,41279
5,0,val,12095,12095,12095


In [20]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [21]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [22]:
encoded_data_train_headline = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_train_body = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_body.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_train_headline = encoded_data_train_headline['input_ids']
attention_masks_train_headline = encoded_data_train_headline['attention_mask']

input_ids_train_body = encoded_data_train_body['input_ids']
attention_masks_train_body = encoded_data_train_body['attention_mask']

input_ids_train = torch.cat((input_ids_train_headline, input_ids_train_body), dim=1)
attention_masks_train = torch.cat((attention_masks_train_headline, attention_masks_train_body), dim=1)

labels_train = torch.tensor(df[df.data_type=='train'].label.values)


encoded_data_val_headline = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_val_body = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_body.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_val_headline = encoded_data_val_headline['input_ids']
attention_masks_val_headline = encoded_data_val_headline['attention_mask']

input_ids_val_body = encoded_data_val_body['input_ids']
attention_masks_val_body = encoded_data_val_body['attention_mask']

input_ids_val = torch.cat((input_ids_val_headline, input_ids_val_body), dim=1)
attention_masks_val = torch.cat((attention_masks_val_headline, attention_masks_val_body), dim=1)

labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [23]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [24]:
len(dataset_train)

78964

In [25]:
len(dataset_val)

21036

In [26]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [27]:
# Set the batch size and create data loaders for training and validation sets

batch_size = 4 #32

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

In [33]:
from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [34]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [35]:
# Instantiate a BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
# Define the RCNN model
class RCNN(nn.Module):
    def __init__(self, bert_model, hidden_size, num_classes, num_layers=1):
        super(RCNN, self).__init__()
        self.bert = bert_model
        self.lstm = nn.LSTM(bert_model.config.hidden_size, hidden_size, num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = outputs.last_hidden_state
        lstm_out, _ = self.lstm(bert_embeddings.permute(1, 0, 2))
        features = torch.cat([lstm_out[:, -1, :self.lstm.hidden_size], lstm_out[:, 0, self.lstm.hidden_size:]], dim=1)
        logits = self.fc(features)
        return logits


In [51]:
# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [52]:
# Initialize the RCNN model
hidden_size = 256
num_classes = len(label_dict)
model = RCNN(bert_model, hidden_size, num_classes)
model = model.to(device)

In [53]:
# Set the optimizer and scheduler
optimizer = AdamW(
    model.parameters(),  # Passes the model parameters to the optimizer
    lr=1e-5,             # Sets the learning rate for the optimizer to 1e-5
    eps=1e-8             # Sets the epsilon value for numerical stability to 1e-8
)
epochs= 1 #This sets the number of epochs or the number of times the model will iterate over the entire dataset during training to 10.

#This creates a linear learning rate scheduler that increases the learning rate linearly over the course of training and uses the specified number of warm-up steps and total training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, #This sets the number of warm-up steps during training to 0. Warm-up steps gradually increase the learning rate from an initial low value to the target learning rate.
    num_training_steps=len(dataloader_train)*epochs #This sets the number of total training steps to the number of batches per epoch times the number of epochs.
)

In [56]:
import numpy as np
from sklearn.metrics import f1_score #F1 score is a measure of a model's accuracy, combining both precision and recall, used to evaluate binary classification models.

In [57]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten() #This line finds the index with the highest probability in each prediction, effectively giving the predicted class for each input.
    labels_flat = labels.flatten()  #This line flattens the labels array into a 1D vector, as required by the f1_score function.
    return f1_score(labels_flat, preds_flat, average='weighted') #This line computes the F1 score using the true labels and the predicted labels, with the weighted averaging scheme. The result is returned.

In [58]:
def accuracy_per_class(preds, labels):
    # Create a dictionary with keys and values reversed for easy lookup.
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    # Get the predicted labels and flatten them.
    preds_flat = np.argmax(preds, axis=1).flatten()
    
    # Get the actual labels and flatten them.
    labels_flat = labels.flatten()

    # Iterate over the unique labels in the actual labels.
    for label in np.unique(labels_flat):
        # Get the predicted labels for this class.
        y_preds = preds_flat[labels_flat==label]
        
        # Get the actual labels for this class.
        y_true = labels_flat[labels_flat==label]
        
        # Print the class name, accuracy numerator and denominator.
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [59]:
import random

seed_val = 17
random.seed(seed_val) #sets the seed value for the Python built-in pseudo-random generator.
np.random.seed(seed_val) #sets the seed value for the NumPy pseudo-random number generator.
torch.manual_seed(seed_val) #sets the seed value for the random number generator in PyTorch on the CPU.
torch.cuda.manual_seed_all(seed_val) #sets the seed value for the random number generator in PyTorch on the GPU.

In [60]:
#This code evaluates the performance of a trained model on a validation dataset by computing its loss and predictions for each batch in the dataset.
def evaluate(dataloader_val):

    model.eval() # setting the model to evaluation mode to disable dropout and other regularization techniques that are useful during training but not during evaluation.
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
    
        batch = tuple(b.to(device) for b in batch) # moving the input batch to the GPU for faster computation.
   
        #  creating a dictionary of inputs that will be passed to the model. The input IDs and attention mask are for the BERT model, and the labels are the true labels for each input.
        inputs = {'input_ids':  	batch[0],
                'attention_mask': batch[1],
                'labels':     	batch[2],
                } 

        with torch.no_grad():   
            outputs = model(**inputs)
        
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val)
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
       	 
    return loss_val_avg, predictions, true_vals




In [67]:
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning

total_training_time = 0

accuracy_list = []
precision_list = []

for epoch in tqdm(range(1, epochs + 1)):
    start_time = time.time()

    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
        }
        labels = batch[2]
        logits = model(**inputs)
        logits = logits.view(-1, model.num_classes)  # Reshape logits
        labels = labels.view(-1)  # Reshape labels
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        loss_train_total += loss.item()  # Use item() instead of mean().item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    end_time = time.time()
    epoch_training_time = end_time - start_time
    total_training_time += epoch_training_time

    torch.save(model.state_dict(), f'Models/Body/finetuned_bert_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)

    # Convert predictions to discrete labels
    predictions = np.argmax(predictions, axis=1)

    val_f1 = f1_score(true_vals, predictions, average='weighted')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

    val_accuracy = accuracy_score(true_vals, predictions)
    val_precision = precision_score(true_vals, predictions, average='weighted', zero_division=1)

    accuracy_list.append(val_accuracy)
    precision_list.append(val_precision)

total_time_minutes = total_training_time / 60
tqdm.write(f'\nTotal training time: {total_time_minutes} minutes')

final_accuracy = accuracy_list[-1]
final_precision = precision_list[-1]
tqdm.write(f'Final Accuracy: {final_accuracy}')
tqdm.write(f'Final Precision: {final_precision}')


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/19741 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB (GPU 0; 7.80 GiB total capacity; 1.98 GiB already allocated; 12.69 MiB free; 2.09 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF