In [2]:
HOME = "./TextData/"

Mounted at /content/drive


# Install dependencies

In [3]:

!pip install datasets==2.16.0
!pip install huggingface-hub==0.20.1


!pip install python-dateutil==2.8.2
!pip install pytz==2023.3.post1
!pip install PyYAML==6.0.1
!pip install regex==2023.12.25

!pip install safetensors==0.4.1
!pip install scikit-learn==1.3.2
#!pip install setuptools==68.2.2

!pip install tokenizers==0.13.3
!pip install torch==2.1.0
!pip install tqdm==4.66.1
!pip install transformers==4.28.1


Collecting datasets==2.16.0
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.0)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.16.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets==2.16.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to deter

# Imports

In [4]:
import numpy as np
import torch
import torch.distributed as dist
from packaging import version
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from torch import nn
from torch.func import functional_call, jvp
from torch.optim import SGD, Adam, Adagrad
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.auto import tqdm
from transformers import Trainer
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
from transformers.dependency_versions_check import dep_version_check
# Integrations must be imported before ML frameworks:
from transformers.integrations import (  # isort: split
    hp_params,
    is_fairscale_available,
)
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10, is_torch_less_than_1_11
from transformers.trainer_callback import (
    DefaultFlowCallback,
    ProgressCallback,
    TrainerState,
)
from transformers.trainer_pt_utils import (
    IterableDatasetShard,
)
from transformers.trainer_utils import (
    HPSearchBackend,
    ShardedDDPOption,
    TrainOutput,
    has_length,
    speed_metrics,
)
from transformers.utils import (
    WEIGHTS_NAME,
    is_apex_available,
    is_in_notebook,
    is_sagemaker_mp_enabled,
    is_torch_tpu_available,
    logging,
)

from torch.nn import CrossEntropyLoss

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    HfArgumentParser,
    TrainingArguments,
    DataCollatorForTokenClassification,
    AutoModelForSequenceClassification
)

from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# from torch.optim.optimizer import StateDict, params_t

from torch.func import functional_call, jvp

import torch.nn.functional as F
import math
import time
from functools import partial

# Utility Functions

In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

In [6]:
def exponential_lr_decay(step: int, k: float=1e-4):
    return math.e ** (-step * k)

# Models

In [7]:
def load_model(args):
  config = AutoConfig.from_pretrained(args.model_name)
  model = AutoModelForSequenceClassification.from_pretrained(args.model_name, config=config, torch_dtype=torch.float32)
  tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)

  return model, tokenizer

# DataSet

In [8]:
class MovieReviewDataset(Dataset):
  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len



  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
    review,
    add_special_tokens=True,
    max_length=self.max_len,
    return_token_type_ids=False,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
    truncation = True
  )
    return {
    'input_ids': encoding['input_ids'].flatten(),
    'attention_mask': encoding['attention_mask'].flatten(),
    'labels': torch.tensor(target, dtype=torch.long)
  }







In [9]:
def to_sentiment(rating):
  rating = str(rating)
  if rating == 'positive':
    return 0
  else:
    return 1

def get_data(path: str, tokenizer, args):
  data = pd.read_csv(path)
  data = data[:4000]
  data['sentiment_score'] = data.sentiment.apply(to_sentiment)

  df_train, df_test = train_test_split(data, test_size=0.3, random_state=123)
  df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=123)

  df_train_ds = MovieReviewDataset(
    reviews=df_train.review.to_numpy(),
    targets=df_train.sentiment_score.to_numpy(),
    tokenizer=tokenizer,
    max_len=args.max_len
  )


  df_test_ds = MovieReviewDataset(
    reviews=df_test.review.to_numpy(),
    targets=df_test.sentiment_score.to_numpy(),
    tokenizer=tokenizer,
    max_len=args.max_len
  )

  df_val_ds = MovieReviewDataset(
    reviews=df_val.review.to_numpy(),
    targets=df_val.sentiment_score.to_numpy(),
    tokenizer=tokenizer,
    max_len=args.max_len
  )

  train_data_loader = DataLoader(
    df_train_ds,
    batch_size=args.batch_size,
    num_workers=4
  )
  val_data_loader = DataLoader(
    df_val_ds,
    batch_size=args.batch_size,
    num_workers=4
  )
  test_data_loader = DataLoader(
    df_test_ds,
    batch_size=args.batch_size,
    num_workers=4
  )

  return train_data_loader, test_data_loader, val_data_loader

# Train Functions

In [10]:
class MyArguments():
  def __init__(self, epochs=4, max_len=400, batch_size=16, optimizer="adam", learning_rate=1e-5, trainer='forward_grad', model_name='', max_length=2048, max_grad_norm=0, momentum = 0.0):
    self.epochs = epochs
    self.max_len = max_len
    self.batch_size = batch_size
    self.optimizer= optimizer
    self.learning_rate = learning_rate
    self.trainer=trainer
    self.model_name = model_name
    self.max_length = max_length
    self.weight_decay = 0
    self.max_grad_norm = max_grad_norm
    self.momentum = momentum


In [11]:
class MyTrainer():
  def __init__(self, model, args, train_loader, test_loader, val_loader):
    self.model = model
    self.args = args
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.val_loader = val_loader

    use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda:0" if use_cuda else "cpu")

  def backward_grad_step(self, inputs):
    inputs = {k: v.to(device=self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    if "labels" in inputs:
        labels = inputs.pop("labels")
    else:
        labels = None

    y = self.model(**inputs).logits
    loss = F.cross_entropy(y, labels)
    loss.backward()

    return loss

  def _get_learning_rate(self):
    for param_group in self.optimizer.param_groups:
      return param_group['lr']

  @staticmethod
  @torch.no_grad()
  def functional_call_loss(params, names, buffers, model, batch, t: torch.Tensor):
        model_params = {k: v for k, v in zip(names, params)}

        outputs = functional_call(model, (model_params, buffers), tuple(), kwargs=batch)
        logits = outputs.logits

        return F.cross_entropy(logits, t)


  def forward_grad_step(self, inputs):
    with torch.no_grad():
      named_buffers = dict(self.model.named_buffers())
      named_params = dict(self.model.named_parameters())
      names = named_params.keys()
      params = named_params.values()

      v_params = tuple([torch.randn_like(p) for p in params])
      inputs = {
        k: v.to(device=self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()
      }

      if "labels" in inputs:
          labels = inputs.pop("labels")
      else:
          labels = None
      f = partial(
                self.functional_call_loss,
                names=names,
                buffers=named_buffers,
                model=self.model,
                batch=inputs,
                t=labels
      )

      # Forward AD
      loss, jvp_ = jvp(f, (tuple(params),), (v_params,))

      # Setting gradients
      for v, p in zip(v_params, params):
          p.grad = v * jvp_


      return loss




  def train(self, name):
    rows_list = []
    validation_row_list = []
    performance_row_list = []


    self.model.to(self.device)
    self.model.float()
    self.model.train()

    if self.args.optimizer == "adam":
      self.optimizer = Adam(self.model.parameters(), lr=self.args.learning_rate)
    elif self.args.optimizer == "sgd":
      self.optimizer = SGD(self.model.parameters(), lr=self.args.learning_rate)
    elif self.args.optimizer == "adagrad":
      self.optimizer = torch.optim.Adagrad(self.model.parameters(), lr=args.learning_rate)
    elif self.args.optimizer == "momentumSGD":
      self.optimizer = SGD(self.model.parameters(), lr=self.args.learning_rate, momentum=self.args.momentum)

    self.optimizer.zero_grad(set_to_none=True)
    self.scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=self.optimizer, lr_lambda=exponential_lr_decay)



    # Train
    steps = 0
    t_total = 0.0
    validation_steps = 0

    for epoch in range(self.args.epochs):
        t0 = time.perf_counter()
        loss = 0
        with tqdm(total=len(self.train_loader)) as pbar:
          for batch in self.train_loader:
              pbar.update(1)
              steps += 1

              if self.args.trainer == "forward_grad":
                loss_step = self.forward_grad_step(batch)
              else:
                loss_step = self.backward_grad_step(batch)
              loss += loss_step

              rows_list.append({
                  "step": steps,
                  "loss_step": loss_step.item()
              })

              if self.args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

              # Optimizer step
              self.optimizer.step()

              # Lr scaling
              self.scheduler.step()

              # Zero out grads
              self.optimizer.zero_grad(set_to_none=True)

        t1 = time.perf_counter()
        t_total += t1 - t0
        print("Time/batch_time", t1 - t0, steps)
        print("Time/sps", steps / t_total, steps)

        acc = 0
        all_pred = []
        all_true = []
        with torch.no_grad():
          for batch in self.val_loader:
              inputs = {
                k: v.to(device=self.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()
              }

              validation_steps += 1


              if "labels" in inputs:
                labels = batch.pop("labels")
              else:
                labels = None

              out = self.model(**inputs)
              logits = out.logits
              pred = F.softmax(logits, dim=-1).argmax(dim=-1)



              labels=labels.to(device=self.device)

              validation_row_list.append({
                  "step": validation_steps,
                  "loss_step": F.cross_entropy(logits, labels).item()
              })

              acc += (pred == labels).sum()
              all_pred.extend(pred.cpu())
              all_true.extend(labels.cpu())

        f1 = f1_score(all_true, all_pred)
        performance_row_list.append({
            "accuracy": (acc / len(self.val_loader.dataset)).item(),
            "f1_score": f1,
            "epoch": epoch+1

        })
        print(f"Epoch [{epoch+1}/{self.args.epochs}], Loss: {loss.item():.4f}, Time (s): {t1 - t0:.4f}, Test accuracy: {(acc / len(self.val_loader.dataset)).item():.4f} F1: {f1}")
    print(f"Mean time: {t_total / self.args.epochs:.4f}")

    # Test
    acc = 0
    all_pred = []
    all_true = []
    with torch.no_grad():
      for batch in self.test_loader:
          inputs = {
                  k: v.to(device=self.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()
          }
          if "labels" in inputs:
            labels = batch.pop("labels")
          else:
            labels = None
          labels=labels.to(device=self.device)

          out = self.model(**inputs).logits
          pred = F.softmax(out, dim=-1).argmax(dim=-1)

          acc += (pred == labels).sum()
          all_pred.extend(pred.cpu())
          all_true.extend(labels.cpu())
    f1 = f1_score(all_true, all_pred)
    print()
    print( "------- Test/accuracy", acc / len(self.test_loader.dataset), steps)
    print(f"------- Test accuracy: {(acc / len(self.test_loader.dataset)).item():.4f}")
    print(f"------- Test F1: {f1}")

    train_pd = pd.DataFrame(rows_list)
    validation_pd = pd.DataFrame(validation_row_list)
    performance_pd = pd.DataFrame(performance_row_list)

    train_pd.to_csv(HOME + name + "_train_loss.csv")
    validation_pd.to_csv(HOME + name + "_validation_loss.csv")
    performance_pd.to_csv(HOME + name + "_performance_metrics.csv")

# Train

In [12]:
set_seed(32)

## Forward Gradiend

### Adam

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="adam",
    learning_rate=1e-5,
    trainer='forward_grad',
    model_name="FacebookAI/roberta-base"
)

In [None]:
model, tokenizer = load_model(args)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("adam_forward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 245.80276309699912 175
Time/sps 0.71195294062232 175
Epoch [1/10], Loss: 121.5476, Time (s): 245.8028, Test accuracy: 0.4783 F1: 0.31808278867102396


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.8693999370007 350
Time/sps 0.7133072270410166 350
Epoch [2/10], Loss: 121.9106, Time (s): 244.8694, Test accuracy: 0.4883 F1: 0.2909930715935335


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.90035735299898 525
Time/sps 0.7137297621229616 525
Epoch [3/10], Loss: 121.7487, Time (s): 244.9004, Test accuracy: 0.4700 F1: 0.3026315789473684


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 245.19669411300129 700
Time/sps 0.713725502035525 700
Epoch [4/10], Loss: 121.5672, Time (s): 245.1967, Test accuracy: 0.4917 F1: 0.3468950749464668


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.84002047699687 875
Time/sps 0.7139306518169493 875
Epoch [5/10], Loss: 121.6289, Time (s): 244.8400, Test accuracy: 0.5167 F1: 0.36123348017621143


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.29132495100203 1050
Time/sps 0.714334036345583 1050
Epoch [6/10], Loss: 121.6762, Time (s): 244.2913, Test accuracy: 0.4717 F1: 0.3123644251626898


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.6288727430001 1225
Time/sps 0.7144817561350463 1225
Epoch [7/10], Loss: 121.6443, Time (s): 244.6289, Test accuracy: 0.4783 F1: 0.32104121475054226


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.9331345419996 1400
Time/sps 0.714481625434295 1400
Epoch [8/10], Loss: 121.3631, Time (s): 244.9331, Test accuracy: 0.4933 F1: 0.36134453781512604


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.48785964899798 1575
Time/sps 0.7146258739778001 1575
Epoch [9/10], Loss: 121.5044, Time (s): 244.4879, Test accuracy: 0.4683 F1: 0.2926829268292683


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 244.87855190000118 1750
Time/sps 0.7146272831533993 1750
Epoch [10/10], Loss: 121.7100, Time (s): 244.8786, Test accuracy: 0.4583 F1: 0.3243243243243243
Mean time: 244.8829


  self.pid = os.fork()



------- Test/accuracy tensor(0.4683, device='cuda:0') 1750
------- Test accuracy: 0.4683
------- Test F1: 0.2989010989010989


### Adagrad

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="adagrad",
    learning_rate=1e-5,
    trainer='forward_grad',
    model_name="FacebookAI/roberta-base"
)

In [None]:
model, tokenizer = load_model(args)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("adagrad_forward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.11861269000292 175
Time/sps 0.7198132551995927 175
Epoch [1/10], Loss: 121.7590, Time (s): 243.1186, Test accuracy: 0.4833 F1: 0.07185628742514971


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 242.82690615499814 350
Time/sps 0.7202453493796642 350
Epoch [2/10], Loss: 121.8614, Time (s): 242.8269, Test accuracy: 0.4717 F1: 0.04804804804804805


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.00710062600047 525
Time/sps 0.7202114183785919 525
Epoch [3/10], Loss: 121.8867, Time (s): 243.0071, Test accuracy: 0.4733 F1: 0.04819277108433735


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.32270348599923 700
Time/sps 0.7199606772607122 700
Epoch [4/10], Loss: 121.8360, Time (s): 243.3227, Test accuracy: 0.4683 F1: 0.03625377643504532


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.3467810229995 875
Time/sps 0.7197960592648089 875
Epoch [5/10], Loss: 121.9242, Time (s): 243.3468, Test accuracy: 0.4733 F1: 0.03658536585365854


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.5215811679991 1050
Time/sps 0.7196001399228203 1050
Epoch [6/10], Loss: 121.8328, Time (s): 243.5216, Test accuracy: 0.4700 F1: 0.06470588235294117


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 242.37367509400065 1225
Time/sps 0.7199456371257789 1225
Epoch [7/10], Loss: 122.0228, Time (s): 242.3737, Test accuracy: 0.4783 F1: 0.05438066465256797


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 242.73050763900028 1400
Time/sps 0.7200727968527151 1400
Epoch [8/10], Loss: 121.8491, Time (s): 242.7305, Test accuracy: 0.4717 F1: 0.05373134328358208


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 242.8617412700005 1575
Time/sps 0.7201285172952024 1575
Epoch [9/10], Loss: 121.9099, Time (s): 242.8617, Test accuracy: 0.4717 F1: 0.030581039755351678


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 243.24333950800064 1750
Time/sps 0.7200600229549372 1750
Epoch [10/10], Loss: 121.9514, Time (s): 243.2433, Test accuracy: 0.4750 F1: 0.059701492537313425
Mean time: 243.0353


  self.pid = os.fork()



------- Test/accuracy tensor(0.5050, device='cuda:0') 1750
------- Test accuracy: 0.5050
------- Test F1: 0.06309148264984227


### Clipped SGD

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="sgd",
    learning_rate=1e-5,
    trainer='forward_grad',
    model_name="FacebookAI/roberta-base",
    max_grad_norm=1
)
model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("sgd_forward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.0406420979998 175
Time/sps 0.7351685764986032 175
Epoch [1/10], Loss: 121.6390, Time (s): 238.0406, Test accuracy: 0.4867 F1: 0.2631578947368421


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.11305655899923 350
Time/sps 0.7350567705074684 350
Epoch [2/10], Loss: 121.7994, Time (s): 238.1131, Test accuracy: 0.5000 F1: 0.2857142857142857


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 237.77359376700042 525
Time/sps 0.735369001257069 525
Epoch [3/10], Loss: 122.0877, Time (s): 237.7736, Test accuracy: 0.4900 F1: 0.25728155339805825


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.52832632799982 700
Time/sps 0.7349423807454762 700
Epoch [4/10], Loss: 121.9055, Time (s): 238.5283, Test accuracy: 0.5150 F1: 0.3087885985748218


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.1130170360011 875
Time/sps 0.7349429286963071 875
Epoch [5/10], Loss: 122.0102, Time (s): 238.1130, Test accuracy: 0.4650 F1: 0.2552204176334107


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.23395592900124 1050
Time/sps 0.7348810858036084 1050
Epoch [6/10], Loss: 121.9903, Time (s): 238.2340, Test accuracy: 0.4867 F1: 0.2701421800947867


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.16403804899892 1225
Time/sps 0.7348677400770516 1225
Epoch [7/10], Loss: 122.0537, Time (s): 238.1640, Test accuracy: 0.4817 F1: 0.2577565632458234


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 237.86306768500071 1400
Time/sps 0.7349738414271089 1400
Epoch [8/10], Loss: 122.0670, Time (s): 237.8631, Test accuracy: 0.4650 F1: 0.20740740740740743


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 238.77723354699992 1575
Time/sps 0.73474291262285 1575
Epoch [9/10], Loss: 122.2287, Time (s): 238.7772, Test accuracy: 0.4717 F1: 0.21339950372208433


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 237.7288161960023 1750
Time/sps 0.7348816738933497 1750
Epoch [10/10], Loss: 121.9710, Time (s): 237.7288, Test accuracy: 0.5183 F1: 0.3167848699763593
Mean time: 238.1336


  self.pid = os.fork()



------- Test/accuracy tensor(0.4867, device='cuda:0') 1750
------- Test accuracy: 0.4867
------- Test F1: 0.23


### Momentum SGD

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="momentumSGD",
    learning_rate=1e-5,
    trainer='forward_grad',
    model_name="FacebookAI/roberta-base",
    momentum=0.9
)
model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("momentumSGD_forward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 229.33362633500656 175
Time/sps 0.7630804204193024 175
Epoch [1/10], Loss: 121.5540, Time (s): 229.3336, Test accuracy: 0.4783 F1: 0.31509846827133475


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 235.86590655300097 350
Time/sps 0.7523653298341967 350
Epoch [2/10], Loss: 121.9162, Time (s): 235.8659, Test accuracy: 0.4867 F1: 0.27358490566037735


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 235.57153677500173 525
Time/sps 0.7491747629542769 525
Epoch [3/10], Loss: 121.7679, Time (s): 235.5715, Test accuracy: 0.4600 F1: 0.273542600896861


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 237.47217727800307 700
Time/sps 0.7460751806977932 700
Epoch [4/10], Loss: 121.5886, Time (s): 237.4722, Test accuracy: 0.4883 F1: 0.3252747252747253


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 240.73260482300248 875
Time/sps 0.7421695691992349 875
Epoch [5/10], Loss: 121.6368, Time (s): 240.7326, Test accuracy: 0.5017 F1: 0.3189066059225512


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 240.85001474599994 1050
Time/sps 0.7395273073739242 1050
Epoch [6/10], Loss: 121.6881, Time (s): 240.8500, Test accuracy: 0.4683 F1: 0.28953229398663693


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 240.87174157200207 1225
Time/sps 0.7376418163296957 1225
Epoch [7/10], Loss: 121.6613, Time (s): 240.8717, Test accuracy: 0.4700 F1: 0.28054298642533937


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 241.02551645899803 1400
Time/sps 0.7361744630085902 1400
Epoch [8/10], Loss: 121.3847, Time (s): 241.0255, Test accuracy: 0.4933 F1: 0.3183856502242152


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 241.4307431240013 1575
Time/sps 0.7348982374821158 1575
Epoch [9/10], Loss: 121.5249, Time (s): 241.4307, Test accuracy: 0.4600 F1: 0.2429906542056075


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 241.25841976200172 1750
Time/sps 0.7339334767010439 1750
Epoch [10/10], Loss: 121.7400, Time (s): 241.2584, Test accuracy: 0.4500 F1: 0.2600896860986547
Mean time: 238.4412


  self.pid = os.fork()



------- Test/accuracy tensor(0.4817, device='cuda:0') 1750
------- Test accuracy: 0.4817
------- Test F1: 0.2817551963048499


### SGD

In [26]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="sgd",
    learning_rate=1e-3,
    trainer='forward_grad',
    model_name="FacebookAI/roberta-base",
    max_grad_norm=1
)
model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [27]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("sgd_forward_lr3")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 249.53872042700004 175
Time/sps 0.7012939703327301 175
Epoch [1/10], Loss: 121.8505, Time (s): 249.5387, Test accuracy: 0.4667 F1: 0.2488262910798122


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.63740686500023 350
Time/sps 0.7025627701241723 350
Epoch [2/10], Loss: 121.8094, Time (s): 248.6374, Test accuracy: 0.4900 F1: 0.29816513761467894


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 249.02597444799994 525
Time/sps 0.7026211499906639 525
Epoch [3/10], Loss: 121.6285, Time (s): 249.0260, Test accuracy: 0.4817 F1: 0.2647754137115839


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.45404948299984 700
Time/sps 0.7030539600846788 700
Epoch [4/10], Loss: 121.8197, Time (s): 248.4540, Test accuracy: 0.4717 F1: 0.22871046228710462


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 249.376190684 875
Time/sps 0.7027929882205098 875
Epoch [5/10], Loss: 122.0891, Time (s): 249.3762, Test accuracy: 0.4750 F1: 0.2517814726840855


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.42881360400133 1050
Time/sps 0.7030648210202246 1050
Epoch [6/10], Loss: 121.9290, Time (s): 248.4288, Test accuracy: 0.4800 F1: 0.2941176470588235


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.9280688430008 1225
Time/sps 0.7030576078397028 1225
Epoch [7/10], Loss: 122.0259, Time (s): 248.9281, Test accuracy: 0.4467 F1: 0.23853211009174316


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.5495512189991 1400
Time/sps 0.7031858624568069 1400
Epoch [8/10], Loss: 121.6762, Time (s): 248.5496, Test accuracy: 0.4700 F1: 0.27397260273972596


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 248.6223428960002 1575
Time/sps 0.7032627897544026 1575
Epoch [9/10], Loss: 121.6504, Time (s): 248.6223, Test accuracy: 0.5000 F1: 0.3449781659388646


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 249.06689562800057 1750
Time/sps 0.7031987063100658 1750
Epoch [10/10], Loss: 121.7894, Time (s): 249.0669, Test accuracy: 0.4883 F1: 0.2974828375286041
Mean time: 248.8628


  self.pid = os.fork()



------- Test/accuracy tensor(0.4750, device='cuda:0') 1750
------- Test accuracy: 0.4750
------- Test F1: 0.2622950819672131


## Backward

### Adam

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="adam",
    learning_rate=1e-5,
    trainer='backward',
    model_name="FacebookAI/roberta-base"
)

In [None]:
model, tokenizer = load_model(args)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment_score'] = data.sentiment.apply(to_sentiment)


In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("adam_backward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 196.28936996899938 175
Time/sps 0.8915408920393311 175
Epoch [1/10], Loss: 68.2536, Time (s): 196.2894, Test accuracy: 0.8933 F1: 0.8926174496644296


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.54503728099735 350
Time/sps 0.8932344723282412 350
Epoch [2/10], Loss: 38.0617, Time (s): 195.5450, Test accuracy: 0.9517 F1: 0.9543307086614173


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.58918044600068 525
Time/sps 0.8937332633494746 525
Epoch [3/10], Loss: 25.8211, Time (s): 195.5892, Test accuracy: 0.9367 F1: 0.939297124600639


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.93968378200225 700
Time/sps 0.8935828695150397 700
Epoch [4/10], Loss: 18.4793, Time (s): 195.9397, Test accuracy: 0.9433 F1: 0.9465408805031447


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.65615585900014 875
Time/sps 0.8937514165373206 875
Epoch [5/10], Loss: 14.4880, Time (s): 195.6562, Test accuracy: 0.9383 F1: 0.9438543247344461


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.54792106899913 1050
Time/sps 0.8939461848866744 1050
Epoch [6/10], Loss: 9.2392, Time (s): 195.5479, Test accuracy: 0.9383 F1: 0.9420970266040689


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.76342298799864 1225
Time/sps 0.8939447508384003 1225
Epoch [7/10], Loss: 6.9413, Time (s): 195.7634, Test accuracy: 0.9300 F1: 0.9329073482428115


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 196.05094126900076 1400
Time/sps 0.8937795868542582 1400
Epoch [8/10], Loss: 4.6836, Time (s): 196.0509, Test accuracy: 0.9200 F1: 0.9210526315789473


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.70559520499955 1575
Time/sps 0.8938263121057489 1575
Epoch [9/10], Loss: 2.2572, Time (s): 195.7056, Test accuracy: 0.9483 F1: 0.9526717557251909


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.87933271099973 1750
Time/sps 0.8937843800456687 1750
Epoch [10/10], Loss: 0.5079, Time (s): 195.8793, Test accuracy: 0.9333 F1: 0.9395770392749246
Mean time: 195.7967


  self.pid = os.fork()



------- Test/accuracy tensor(0.9267, device='cuda:0') 1750
------- Test accuracy: 0.9267
------- Test F1: 0.9310344827586207


### Adagrad

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="adagrad",
    learning_rate=1e-5,
    trainer='backward',
    model_name="FacebookAI/roberta-base"
)

model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("adagrad_backward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.63843879399792 175
Time/sps 0.8945072403908843 175
Epoch [1/10], Loss: 120.7808, Time (s): 195.6384, Test accuracy: 0.6067 F1: 0.6143790849673202


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 193.97252176099937 350
Time/sps 0.8983320168955928 350
Epoch [2/10], Loss: 109.9563, Time (s): 193.9725, Test accuracy: 0.8783 F1: 0.8918518518518518


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.16143604199897 525
Time/sps 0.8993230975983103 525
Epoch [3/10], Loss: 55.7648, Time (s): 194.1614, Test accuracy: 0.9367 F1: 0.940625


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.06447411699992 700
Time/sps 0.8999316262257564 700
Epoch [4/10], Loss: 39.9682, Time (s): 194.0645, Test accuracy: 0.9283 F1: 0.9339477726574502


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.51587579900297 875
Time/sps 0.8998791880188332 875
Epoch [5/10], Loss: 36.3473, Time (s): 194.5159, Test accuracy: 0.9300 F1: 0.9345794392523364


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 193.94116705600027 1050
Time/sps 0.9002876442927443 1050
Epoch [6/10], Loss: 34.2574, Time (s): 193.9412, Test accuracy: 0.9367 F1: 0.9413580246913581


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.4102611599992 1225
Time/sps 0.9002691567724294 1225
Epoch [7/10], Loss: 33.8061, Time (s): 194.4103, Test accuracy: 0.9400 F1: 0.9444444444444444


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.09416747199793 1400
Time/sps 0.900438315375445 1400
Epoch [8/10], Loss: 32.4413, Time (s): 194.0942, Test accuracy: 0.9467 F1: 0.9504643962848296


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.1255715360021 1575
Time/sps 0.900553756300714 1575
Epoch [9/10], Loss: 32.8372, Time (s): 194.1256, Test accuracy: 0.9400 F1: 0.9444444444444444


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 193.8340595209993 1750
Time/sps 0.9007812728547223 1750
Epoch [10/10], Loss: 31.0501, Time (s): 193.8341, Test accuracy: 0.9450 F1: 0.9489953632148377
Mean time: 194.2758


  self.pid = os.fork()



------- Test/accuracy tensor(0.9417, device='cuda:0') 1750
------- Test accuracy: 0.9417
------- Test F1: 0.9430894308943089


### Clipped SGD

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="sgd",
    learning_rate=1e-5,
    trainer='backward',
    model_name="FacebookAI/roberta-base",
    max_grad_norm=1
)

model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("sgd_backward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.22871404200123 175
Time/sps 0.9199452400301732 175
Epoch [1/10], Loss: 121.8041, Time (s): 190.2287, Test accuracy: 0.4700 F1: 0.13114754098360656


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.282518222004 350
Time/sps 0.9246753336922928 350
Epoch [2/10], Loss: 121.9509, Time (s): 188.2825, Test accuracy: 0.4667 F1: 0.14438502673796794


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.89621968299616 525
Time/sps 0.9252610239758317 525
Epoch [3/10], Loss: 122.0193, Time (s): 188.8962, Test accuracy: 0.4867 F1: 0.20618556701030927


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.78181498800404 700
Time/sps 0.9256941755299538 700
Epoch [4/10], Loss: 121.8898, Time (s): 188.7818, Test accuracy: 0.4667 F1: 0.18367346938775514


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.93984418900072 875
Time/sps 0.9257994380888297 875
Epoch [5/10], Loss: 121.6317, Time (s): 188.9398, Test accuracy: 0.4667 F1: 0.17098445595854922


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 189.1919720760052 1050
Time/sps 0.9256638314769441 1050
Epoch [6/10], Loss: 121.8502, Time (s): 189.1920, Test accuracy: 0.4783 F1: 0.21158690176322423


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.96520895899448 1225
Time/sps 0.9257256024328289 1225
Epoch [7/10], Loss: 121.7495, Time (s): 188.9652, Test accuracy: 0.4700 F1: 0.20100502512562812


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.53616902100475 1400
Time/sps 0.9260346607810479 1400
Epoch [8/10], Loss: 121.7852, Time (s): 188.5362, Test accuracy: 0.4833 F1: 0.2091836734693878


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 188.87169008699857 1575
Time/sps 0.9260924422105129 1575
Epoch [9/10], Loss: 121.6214, Time (s): 188.8717, Test accuracy: 0.4933 F1: 0.2549019607843137


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 189.00686238599883 1750
Time/sps 0.9260724248737371 1750
Epoch [10/10], Loss: 121.5426, Time (s): 189.0069, Test accuracy: 0.4667 F1: 0.19999999999999998
Mean time: 188.9701


  self.pid = os.fork()



------- Test/accuracy tensor(0.4867, device='cuda:0') 1750
------- Test accuracy: 0.4867
------- Test F1: 0.23762376237623759


### MomentumSGD

In [None]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="momentumSGD",
    learning_rate=1e-5,
    trainer='backward',
    model_name="FacebookAI/roberta-base",
    momentum=0.9
)

model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the

In [None]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("momentumSGD_backward")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.34832497099706 175
Time/sps 0.9193671655721917 175
Epoch [1/10], Loss: 121.6712, Time (s): 190.3483, Test accuracy: 0.4867 F1: 0.1348314606741573


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.5480777210032 350
Time/sps 0.9188850236609253 350
Epoch [2/10], Loss: 121.5102, Time (s): 190.5481, Test accuracy: 0.4717 F1: 0.18508997429305915


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Time/batch_time 190.78617558800033 525
Time/sps 0.918341786065175 525
Epoch [3/10], Loss: 121.4291, Time (s): 190.7862, Test accuracy: 0.4933 F1: 0.2897196261682243


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.89187994499662 700
Time/sps 0.9179431496163032 700
Epoch [4/10], Loss: 121.4069, Time (s): 190.8919, Test accuracy: 0.5133 F1: 0.38396624472573837


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.50649240500206 875
Time/sps 0.9180752164038252 875
Epoch [5/10], Loss: 121.2710, Time (s): 190.5065, Test accuracy: 0.5000 F1: 0.42748091603053434


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.92578915100603 1050
Time/sps 0.9178267605320221 1050
Epoch [6/10], Loss: 121.0103, Time (s): 190.9258, Test accuracy: 0.5167 F1: 0.4746376811594203


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.6435871010035 1225
Time/sps 0.9178434046180678 1225
Epoch [7/10], Loss: 121.2909, Time (s): 190.6436, Test accuracy: 0.5417 F1: 0.5027124773960217


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.88271323799563 1400
Time/sps 0.9177120148704682 1400
Epoch [8/10], Loss: 121.0471, Time (s): 190.8827, Test accuracy: 0.4967 F1: 0.4448529411764706


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.76294739500008 1575
Time/sps 0.9176738811120893 1575
Epoch [9/10], Loss: 121.1425, Time (s): 190.7629, Test accuracy: 0.5367 F1: 0.5335570469798658


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 190.8938960919986 1750
Time/sps 0.9175803704926773 1750
Epoch [10/10], Loss: 121.1467, Time (s): 190.8939, Test accuracy: 0.4883 F1: 0.4752136752136752
Mean time: 190.7190


  self.pid = os.fork()



------- Test/accuracy tensor(0.5317, device='cuda:0') 1750
------- Test accuracy: 0.5317
------- Test F1: 0.5044091710758377


### SGD

In [13]:
args = MyArguments(
    epochs=10,
    max_len=400,
    batch_size=16,
    optimizer="sgd",
    learning_rate=1e-3,
    trainer='backward',
    model_name="FacebookAI/roberta-base"
)

model, tokenizer = load_model(args)
loaders = get_data(HOME +  "IMDB_Dataset.csv", tokenizer, args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_pro

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment_score'] = data.sentiment.apply(to_sentiment)


In [14]:
trainer = MyTrainer(model, args, loaders[0], loaders[1], loaders[2])
trainer.train("sgd_backward_lr3")

  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 189.583307129 175
Time/sps 0.9230770506652417 175
Epoch [1/10], Loss: 121.5022, Time (s): 189.5833, Test accuracy: 0.5117 F1: 0.665142857142857


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.97004769500006 350
Time/sps 0.9101467861597146 350
Epoch [2/10], Loss: 121.4245, Time (s): 194.9700, Test accuracy: 0.5350 F1: 0.6658682634730541


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Time/batch_time 195.13353359100006 525
Time/sps 0.9056613328541432 525
Epoch [3/10], Loss: 120.9270, Time (s): 195.1335, Test accuracy: 0.5417 F1: 0.6920492721164614


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.94635566500006 700
Time/sps 0.9036534454848514 700
Epoch [4/10], Loss: 120.7644, Time (s): 194.9464, Test accuracy: 0.5617 F1: 0.6623876765083441


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.10275948499998 875
Time/sps 0.9023074288087416 875
Epoch [5/10], Loss: 118.7973, Time (s): 195.1028, Test accuracy: 0.6550 F1: 0.7472527472527473


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.03167222800016 1050
Time/sps 0.9014673241898957 1050
Epoch [6/10], Loss: 71.8314, Time (s): 195.0317, Test accuracy: 0.9417 F1: 0.9457364341085271


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 194.69500007800002 1225
Time/sps 0.9010913074278771 1225
Epoch [7/10], Loss: 43.2941, Time (s): 194.6950, Test accuracy: 0.9283 F1: 0.9311999999999999


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.17392047600015 1400
Time/sps 0.9005319978248569 1400
Epoch [8/10], Loss: 35.8188, Time (s): 195.1739, Test accuracy: 0.9333 F1: 0.9395770392749246


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.2275287579996 1575
Time/sps 0.9000698839434134 1575
Epoch [9/10], Loss: 31.5080, Time (s): 195.2275, Test accuracy: 0.9383 F1: 0.9422776911076443


  0%|          | 0/175 [00:00<?, ?it/s]

  self.pid = os.fork()


Time/batch_time 195.000638128 1750
Time/sps 0.8998054944915187 1750
Epoch [10/10], Loss: 27.9675, Time (s): 195.0006, Test accuracy: 0.9383 F1: 0.9422776911076443
Mean time: 194.4865


  self.pid = os.fork()



------- Test/accuracy tensor(0.9250, device='cuda:0') 1750
------- Test accuracy: 0.9250
------- Test F1: 0.9258649093904449
