In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from datasets import load_dataset

dataset = load_dataset("amazon_us_reviews", "Apparel_v1_00")
train_data = dataset['train']

# Limit the dataset to the first 100,000 rows
train_data = train_data.select(range(100000))

df = train_data.to_pandas()  # Convert the dataset to a Pandas DataFrame
df = df[['customer_id', 'review_headline', 'star_rating']]  # Select specific columns
df.columns = ['customer_id', 'review_headline', 'star_rating']  # Rename the selected columns
df.set_index('customer_id', inplace=True)
df.head()  # Display the first few rows of the DataFrame

Found cached dataset amazon_us_reviews (/home/z123010/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0_level_0,review_headline,star_rating
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,4
2714559,Favorite for winter. Very warm!,5
12608825,Great Socks for the money.,5
25482800,Slick hat!,5
9310286,I would do it again!,5


In [3]:
df.star_rating.value_counts()

star_rating
5    53374
4    17763
1    11741
3    10431
2     6691
Name: count, dtype: int64

In [4]:
df['sentiment'] = df['star_rating'].map({5: 'good', 4: 'good', 3: 'neutral', 2: 'bad', 1: 'bad'})

In [5]:
df['sentiment'].value_counts()

sentiment
good       71137
bad        18432
neutral    10431
Name: count, dtype: int64

In [6]:
possible_labels = df.sentiment.unique() #Get unique category labels from the DataFrame column 'category'

In [7]:
label_dict = {} #Create a dictionary to map each possible label to a unique index
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [8]:
label_dict

{'good': 0, 'neutral': 1, 'bad': 2}

In [9]:
df['label'] = df.sentiment.replace(label_dict)
df.head(10)

Unnamed: 0_level_0,review_headline,star_rating,sentiment,label
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,4,good,0
2714559,Favorite for winter. Very warm!,5,good,0
12608825,Great Socks for the money.,5,good,0
25482800,Slick hat!,5,good,0
9310286,I would do it again!,5,good,0
26631939,Five Stars,5,good,0
48785098,Love it!,5,good,0
39548589,Three Stars,4,good,0
29355866,Five Stars,5,good,0
27477484,Not my favorite.,3,neutral,1


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values
)

In [12]:
df['data_type'] = ['not_set']*df.shape[0] #Set a new column 'data_type' for later data split

In [13]:
df.head()

Unnamed: 0_level_0,review_headline,star_rating,sentiment,label,data_type
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32158956,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,4,good,0,not_set
2714559,Favorite for winter. Very warm!,5,good,0,not_set
12608825,Great Socks for the money.,5,good,0,not_set
25482800,Slick hat!,5,good,0,not_set
9310286,I would do it again!,5,good,0,not_set


In [14]:
#Set the 'data_type' column of the dataframe for training and validation data
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [15]:
df.groupby(['star_rating', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review_headline,sentiment
star_rating,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,train,9581,9581
1,2,val,2160,2160
2,2,train,5463,5463
2,2,val,1228,1228
3,1,train,8439,8439
3,1,val,1992,1992
4,0,train,14202,14202
4,0,val,3561,3561
5,0,train,41279,41279
5,0,val,12095,12095


In [16]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [17]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [18]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [19]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [20]:
len(dataset_train)

78964

In [21]:
len(dataset_val)

21036

In [22]:
from transformers import BertForSequenceClassification

In [23]:
#Define a BERT model for sequence classification task
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

"""
Here, I use the BERTForSequenceClassification model, which is a BERT model for sequence classification
task such as sentiment analysis. The pre-trained BERT model is loaded from 'bert-base-uncased', and we set the number of labels to be the length of unique labels in the dataset.

I also set output_attentions and output_hidden_states to False, which means I only get the output
from the last layer of BERT.
"""

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

"\nHere, I use the BERTForSequenceClassification model, which is a BERT model for sequence classification\ntask such as sentiment analysis. The pre-trained BERT model is loaded from 'bert-base-uncased', and we set the number of labels to be the length of unique labels in the dataset.\n\nI also set output_attentions and output_hidden_states to False, which means I only get the output\nfrom the last layer of BERT.\n"

In [24]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [25]:
# Set the batch size and create data loaders for training and validation sets

batch_size = 4 #32

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

In [26]:
from transformers import AdamW, get_linear_schedule_with_warmup

2023-05-24 17:02:50.775617: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-24 17:02:53.230317: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
optimizer = AdamW(
    model.parameters(),  # Passes the model parameters to the optimizer
    lr=1e-5,             # Sets the learning rate for the optimizer to 1e-5
    eps=1e-8             # Sets the epsilon value for numerical stability to 1e-8
)



In [28]:
epochs= 100 #This sets the number of epochs or the number of times the model will iterate over the entire dataset during training to 10.

#This creates a linear learning rate scheduler that increases the learning rate linearly over the course of training and uses the specified number of warm-up steps and total training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, #This sets the number of warm-up steps during training to 0. Warm-up steps gradually increase the learning rate from an initial low value to the target learning rate.
    num_training_steps=len(dataloader_train)*epochs #This sets the number of total training steps to the number of batches per epoch times the number of epochs.
)

In [29]:
import numpy as np

In [30]:
from sklearn.metrics import f1_score #F1 score is a measure of a model's accuracy, combining both precision and recall, used to evaluate binary classification models.

In [31]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten() #This line finds the index with the highest probability in each prediction, effectively giving the predicted class for each input.
    labels_flat = labels.flatten()  #This line flattens the labels array into a 1D vector, as required by the f1_score function.
    return f1_score(labels_flat, preds_flat, average='weighted') #This line computes the F1 score using the true labels and the predicted labels, with the weighted averaging scheme. The result is returned.

In [32]:
def accuracy_per_class(preds, labels):
    # Create a dictionary with keys and values reversed for easy lookup.
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    # Get the predicted labels and flatten them.
    preds_flat = np.argmax(preds, axis=1).flatten()
    
    # Get the actual labels and flatten them.
    labels_flat = labels.flatten()

    # Iterate over the unique labels in the actual labels.
    for label in np.unique(labels_flat):
        # Get the predicted labels for this class.
        y_preds = preds_flat[labels_flat==label]
        
        # Get the actual labels for this class.
        y_true = labels_flat[labels_flat==label]
        
        # Print the class name, accuracy numerator and denominator.
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [33]:
import random

seed_val = 17
random.seed(seed_val) #sets the seed value for the Python built-in pseudo-random generator.
np.random.seed(seed_val) #sets the seed value for the NumPy pseudo-random number generator.
torch.manual_seed(seed_val) #sets the seed value for the random number generator in PyTorch on the CPU.
torch.cuda.manual_seed_all(seed_val) #sets the seed value for the random number generator in PyTorch on the GPU.

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [35]:
#This code evaluates the performance of a trained model on a validation dataset by computing its loss and predictions for each batch in the dataset.
def evaluate(dataloader_val):

    model.eval() # setting the model to evaluation mode to disable dropout and other regularization techniques that are useful during training but not during evaluation.
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
    
        batch = tuple(b.to(device) for b in batch) # moving the input batch to the GPU for faster computation.
   
        #  creating a dictionary of inputs that will be passed to the model. The input IDs and attention mask are for the BERT model, and the labels are the true labels for each input.
        inputs = {'input_ids':  	batch[0],
                'attention_mask': batch[1],
                'labels':     	batch[2],
                } 

        with torch.no_grad():   
            outputs = model(**inputs)
        
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val)
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
       	 
    return loss_val_avg, predictions, true_vals




In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning

total_training_time = 0

accuracy_list = []
precision_list = []

for epoch in tqdm(range(1, epochs + 1)):
    start_time = time.time()

    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }
        output = model(**inputs)
        loss = output[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    end_time = time.time()
    epoch_training_time = end_time - start_time
    total_training_time += epoch_training_time

    torch.save(model.state_dict(), f'Models/finetuned_bert_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)

    # Convert predictions to discrete labels
    predictions = np.argmax(predictions, axis=1)

    val_f1 = f1_score(true_vals, predictions, average='weighted')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

    val_accuracy = accuracy_score(true_vals, predictions)
    val_precision = precision_score(true_vals, predictions, average='weighted', zero_division=1)

    accuracy_list.append(val_accuracy)
    precision_list.append(val_precision)

total_time_minutes = total_training_time / 60
tqdm.write(f'\nTotal training time: {total_time_minutes} minutes')

final_accuracy = accuracy_list[-1]
final_precision = precision_list[-1]
tqdm.write(f'Final Accuracy: {final_accuracy}')
tqdm.write(f'Final Precision: {final_precision}')

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.40248726489438374
Validation loss: 0.3321993896528218
F1 Score (weighted): 0.8901567870812485


Epoch 2:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.35022265693117743
Validation loss: 0.4034658418143859
F1 Score (weighted): 0.8938177193971061


Epoch 3:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.319708567156445
Validation loss: 0.4410126832318082
F1 Score (weighted): 0.8902160438122877


Epoch 4:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.2819755646092962
Validation loss: 0.49994679924141683
F1 Score (weighted): 0.8915033688008531


Epoch 5:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.25000644682730166
Validation loss: 0.5611874821183971
F1 Score (weighted): 0.8861358326580606


Epoch 6:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.21996074552570985
Validation loss: 0.6465885732866969
F1 Score (weighted): 0.8888244372681046


Epoch 7:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.1932183188081572
Validation loss: 0.7111834480458521
F1 Score (weighted): 0.8846131178971764


Epoch 8:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.16993525075758156
Validation loss: 0.7070091919393008
F1 Score (weighted): 0.8840988899714673


Epoch 9:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.15511732886070392
Validation loss: 0.7736253415823187
F1 Score (weighted): 0.8851343010812257


Epoch 10:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.14313933776828563
Validation loss: 0.7867593841984647
F1 Score (weighted): 0.8870719896636882


Epoch 11:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 11
Training loss: 0.13560018132663088
Validation loss: 0.8105328124453346
F1 Score (weighted): 0.8851364938190902


Epoch 12:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 12
Training loss: 0.12743154419397756
Validation loss: 0.7974027424748109
F1 Score (weighted): 0.8841967963657925


Epoch 13:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 13
Training loss: 0.12163421532711331
Validation loss: 0.8360806921973839
F1 Score (weighted): 0.8878325599851258


Epoch 14:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 14
Training loss: 0.11357953972485223
Validation loss: 0.8169919040251484
F1 Score (weighted): 0.886572268419865


Epoch 15:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 15
Training loss: 0.11393318447521875
Validation loss: 0.8244268729934953
F1 Score (weighted): 0.8829868315797385


Epoch 16:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 16
Training loss: 0.10389587747292145
Validation loss: 0.8973579606451154
F1 Score (weighted): 0.8814510908390131


Epoch 17:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 17
Training loss: 0.10218857722457428
Validation loss: 0.8798774946358028
F1 Score (weighted): 0.8772473156337927


Epoch 18:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 18
Training loss: 0.10015661057683817
Validation loss: 0.856372763103915
F1 Score (weighted): 0.8842792533060341


Epoch 19:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 19
Training loss: 0.10022137789721351
Validation loss: 0.895490861637677
F1 Score (weighted): 0.8819772293339296


Epoch 20:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 20
Training loss: 0.09524072345710369
Validation loss: 0.8881859842599361
F1 Score (weighted): 0.8835319396255483


Epoch 21:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 21
Training loss: 0.09183675723717453
Validation loss: 0.8522554514133499
F1 Score (weighted): 0.886887275978614


Epoch 22:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 22
Training loss: 0.092028173371378
Validation loss: 0.8592299591618054
F1 Score (weighted): 0.8847870048906411


Epoch 23:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 23
Training loss: 0.08658097033984284
Validation loss: 0.9110322710295865
F1 Score (weighted): 0.8821733867379784


Epoch 24:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 24
Training loss: 0.08823470360568339
Validation loss: 0.9075680251897316
F1 Score (weighted): 0.8803686612313097


Epoch 25:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 25
Training loss: 0.08416038199785951
Validation loss: 0.9490826073966745
F1 Score (weighted): 0.8781832930400377


Epoch 26:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 26
Training loss: 0.08432001497978281
Validation loss: 0.8897870918019545
F1 Score (weighted): 0.8826917834930439


Epoch 27:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 27
Training loss: 0.08032894053522285
Validation loss: 0.9142411797572418
F1 Score (weighted): 0.8837197760918803


Epoch 28:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 28
Training loss: 0.07995465715691367
Validation loss: 0.9454307280464098
F1 Score (weighted): 0.8816864170718424


Epoch 29:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 29
Training loss: 0.08130397118719583
Validation loss: 0.9402518472549997
F1 Score (weighted): 0.8827551005507239


Epoch 30:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 30
Training loss: 0.07974988833138581
Validation loss: 0.9172742213772739
F1 Score (weighted): 0.8833119110645938


Epoch 31:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 31
Training loss: 0.07822999343918965
Validation loss: 0.8964759115084859
F1 Score (weighted): 0.8841866882820139


Epoch 32:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 32
Training loss: 0.0752516511519344
Validation loss: 0.938049729444196
F1 Score (weighted): 0.882386141408038


Epoch 33:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 33
Training loss: 0.07476936006658436
Validation loss: 0.938407412336172
F1 Score (weighted): 0.8830554041956031


Epoch 34:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 34
Training loss: 0.0752971777609822
Validation loss: 0.8955335325643423
F1 Score (weighted): 0.8836214031578413


Epoch 35:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 35
Training loss: 0.07558250480613295
Validation loss: 0.9196050693683384
F1 Score (weighted): 0.8823620709525523


Epoch 36:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 36
Training loss: 0.06972563476378743
Validation loss: 0.9507474251883697
F1 Score (weighted): 0.8825773164472211


Epoch 37:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 37
Training loss: 0.07335253117847153
Validation loss: 0.9207226504185999
F1 Score (weighted): 0.882436969815787


Epoch 38:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 38
Training loss: 0.07049286229575506
Validation loss: 0.9767937019736962
F1 Score (weighted): 0.8806845228054841


Epoch 39:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 39
Training loss: 0.07116919876168924
Validation loss: 0.9067756109203086
F1 Score (weighted): 0.8826479689359275


Epoch 40:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 40
Training loss: 0.06708578217750516
Validation loss: 0.9729548663686772
F1 Score (weighted): 0.8819292967817939


Epoch 41:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 41
Training loss: 0.06829642347939817
Validation loss: 0.9698231922221127
F1 Score (weighted): 0.8795081669369181


Epoch 42:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 42
Training loss: 0.06841140700228912
Validation loss: 0.9786070038620985
F1 Score (weighted): 0.8791591263770563


Epoch 43:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 43
Training loss: 0.06612032316920355
Validation loss: 0.9648005935836476
F1 Score (weighted): 0.8831912441945651


Epoch 44:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 44
Training loss: 0.06619705244845286
Validation loss: 0.9563861321215921
F1 Score (weighted): 0.8841235538588859


Epoch 45:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 45
Training loss: 0.06649216452507178
Validation loss: 0.9676619000468133
F1 Score (weighted): 0.8823181811992863


Epoch 46:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 46
Training loss: 0.06478667205665024
Validation loss: 0.9695219613124518
F1 Score (weighted): 0.8836622747120046


Epoch 47:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 47
Training loss: 0.06557764698539216
Validation loss: 0.9214003849083272
F1 Score (weighted): 0.8843731730468738


Epoch 48:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 48
Training loss: 0.0648024273094572
Validation loss: 0.9445848072795022
F1 Score (weighted): 0.8848849872501509


Epoch 49:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 49
Training loss: 0.06422928448448749
Validation loss: 0.9546126622577188
F1 Score (weighted): 0.8832197648334251


Epoch 50:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 50
Training loss: 0.06126587570066656
Validation loss: 0.9717373750988171
F1 Score (weighted): 0.8812297210076003


Epoch 51:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 51
Training loss: 0.06261773711946568
Validation loss: 0.9549680918833827
F1 Score (weighted): 0.8804551680018872


Epoch 52:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 52
Training loss: 0.06350096311788807
Validation loss: 0.9835256397888292
F1 Score (weighted): 0.881416996220458


Epoch 53:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 53
Training loss: 0.061807250518922344
Validation loss: 0.9825684533992854
F1 Score (weighted): 0.8812389049922066


Epoch 54:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 54
Training loss: 0.06201600671597905
Validation loss: 0.983243716805708
F1 Score (weighted): 0.8836498421868303


Epoch 55:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 55
Training loss: 0.061111368540663485
Validation loss: 0.9780646223128544
F1 Score (weighted): 0.8831616756746614


Epoch 56:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 56
Training loss: 0.06201208908170244
Validation loss: 0.9875650537186959
F1 Score (weighted): 0.8830355528734218


Epoch 57:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 57
Training loss: 0.06130377791820635
Validation loss: 0.9581942146305499
F1 Score (weighted): 0.8841290143201608


Epoch 58:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 58
Training loss: 0.059681220678837604
Validation loss: 0.9801816611820909
F1 Score (weighted): 0.882059852045514


Epoch 59:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 59
Training loss: 0.05887511537175541
Validation loss: 0.9911259225131052
F1 Score (weighted): 0.8829516782630128


Epoch 60:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 60
Training loss: 0.05923197151090053
Validation loss: 0.9785439742780906
F1 Score (weighted): 0.8830099424613046


Epoch 61:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 61
Training loss: 0.05926607447252387
Validation loss: 0.9904958782117707
F1 Score (weighted): 0.8819930065136963


Epoch 62:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 62
Training loss: 0.05814884647902838
Validation loss: 0.9691837468313419
F1 Score (weighted): 0.8828963023180578


Epoch 63:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 63
Training loss: 0.058468233945310595
Validation loss: 0.9925575348381885
F1 Score (weighted): 0.8848190116419552


Epoch 64:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 64
Training loss: 0.058100257701319984
Validation loss: 0.9874093850748563
F1 Score (weighted): 0.8835838893316756


Epoch 65:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 65
Training loss: 0.05661319663919274
Validation loss: 0.9913407509659872
F1 Score (weighted): 0.8837693608978052


Epoch 66:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 66
Training loss: 0.05695688753413871
Validation loss: 1.0224131537800125
F1 Score (weighted): 0.8818263815547381


Epoch 67:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 67
Training loss: 0.05651503713207077
Validation loss: 1.0280049172569137
F1 Score (weighted): 0.8799684163917268


Epoch 68:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 68
Training loss: 0.0570712956536378
Validation loss: 0.9728618741115983
F1 Score (weighted): 0.8851447900638507


Epoch 69:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 69
Training loss: 0.05710264110366265
Validation loss: 0.9965815016759711
F1 Score (weighted): 0.8839249149101682


Epoch 70:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 70
Training loss: 0.055373200305525165
Validation loss: 0.9915326310393873
F1 Score (weighted): 0.8821876647938235


Epoch 71:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 71
Training loss: 0.05475921211346666
Validation loss: 0.9985221735087263
F1 Score (weighted): 0.8855460351426884


Epoch 72:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 72
Training loss: 0.05581781667493296
Validation loss: 1.0098001688790914
F1 Score (weighted): 0.8832809429349819


Epoch 73:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 73
Training loss: 0.056024007330740554
Validation loss: 1.011221915228208
F1 Score (weighted): 0.883586864541925


Epoch 74:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 74
Training loss: 0.05575656185461493
Validation loss: 1.044988492791616
F1 Score (weighted): 0.8808083058228474


Epoch 75:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 75
Training loss: 0.05520967582424527
Validation loss: 0.9823881524464848
F1 Score (weighted): 0.8850902625273295


Epoch 76:   0%|          | 0/19741 [00:00<?, ?it/s]


Epoch 76
Training loss: 0.05545012734384326
Validation loss: 1.0119410346076365
F1 Score (weighted): 0.8835460207086884


Epoch 77:   0%|          | 0/19741 [00:00<?, ?it/s]

In [37]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [41]:
model.load_state_dict(
    torch.load(
        "Models/finetuned_bert_ft_epoch1.model", 
        map_location = torch.device('cuda')
    )
)

<All keys matched successfully>

In [42]:
_, predictions, true_vals = evaluate(dataloader_val)

In [43]:
accuracy_per_class(predictions, true_vals)

Class: good
Accuracy: 15040/15656

Class: neutral
Accuracy: 983/1992

Class: bad
Accuracy: 2781/3388



In [44]:
model.load_state_dict(
    torch.load(
        "Models/finetuned_bert_ft_epoch75.model", 
        map_location = torch.device('cuda')
    )
)

<All keys matched successfully>

In [45]:
_, predictions, true_vals = evaluate(dataloader_val)

In [46]:
accuracy_per_class(predictions, true_vals)

Class: good
Accuracy: 14963/15656

Class: neutral
Accuracy: 955/1992

Class: bad
Accuracy: 2772/3388



In [47]:
model.load_state_dict(
    torch.load(
        "Models/Body/finetuned_bert_ft_epoch3.model", 
        map_location = torch.device('cuda')
    )
)

<All keys matched successfully>

In [48]:
_, predictions, true_vals = evaluate(dataloader_val)

In [49]:
accuracy_per_class(predictions, true_vals)

Class: good
Accuracy: 14818/15656

Class: neutral
Accuracy: 1103/1992

Class: bad
Accuracy: 2743/3388

