In [1]:
from dpm_preprocessing import DPMProprocessed
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
import torch.nn as nn

# Select the compute device once: the GPU when CUDA is usable, otherwise the CPU.
# `cuda_available` stays defined as a notebook-level flag.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

import os

# Set the WANDB_DISABLED switch for this process; presumably this keeps the
# Trainer from reporting to Weights & Biases (confirm for the installed
# transformers version).
os.environ.update({"WANDB_DISABLED": "true"})

Collecting contractions
  Using cached contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Using cached textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Using cached anyascii-0.3.0-py3-none-any.whl (284 kB)
Collecting pyahocorasick
  Using cached pyahocorasick-1.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (109 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions


ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: '/opt/miniconda3/lib/python3.9/site-packages/ahocorasick.cpython-39-x86_64-linux-gnu.so'
Consider using the `--user` option or check the permissions.



Collecting nlpaug
  Using cached nlpaug-1.1.10-py3-none-any.whl (410 kB)
Installing collected packages: nlpaug


ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: '/opt/miniconda3/lib/python3.9/site-packages/nlpaug'
Consider using the `--user` option or check the permissions.



Collecting google-cloud-translate==2.0.1
  Using cached google_cloud_translate-2.0.1-py2.py3-none-any.whl (90 kB)
Collecting google-api-core[grpc]<2.0.0dev,>=1.15.0
  Using cached google_api_core-1.31.5-py2.py3-none-any.whl (93 kB)
Collecting google-cloud-core<2.0dev,>=1.1.0
  Using cached google_cloud_core-1.7.2-py2.py3-none-any.whl (28 kB)
Collecting googleapis-common-protos<2.0dev,>=1.6.0
  Using cached googleapis_common_protos-1.55.0-py2.py3-none-any.whl (212 kB)
Collecting google-auth<2.0dev,>=1.25.0
  Using cached google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
Collecting cachetools<5.0,>=2.0.0
  Using cached cachetools-4.2.4-py3-none-any.whl (10 kB)
Installing collected packages: googleapis-common-protos, cachetools, google-auth, google-api-core, google-cloud-core, google-cloud-translate


ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: '/opt/miniconda3/lib/python3.9/site-packages/googleapis_common_protos-1.55.0-py3.10-nspkg.pth'
Consider using the `--user` option or check the permissions.



DistributionNotFound: The 'google-cloud-translate' distribution was not found and is required by the application

In [None]:
# Build the preprocessing helper and take its unbalanced train/validation split.
# NOTE(review): the two arguments look like (data directory, file name) -- confirm
# against dpm_preprocessing.
dpm_pp = DPMProprocessed('.', 'task4_test.tsv')
train_df, val_df = dpm_pp.get_unbalanced_split()

# Report how many rows ended up in each split.
for split_caption, split_df in (
    ("Training set length: ", train_df),
    ("Validation set length: ", val_df),
):
    print(split_caption, len(split_df))

In [None]:
MAX_SEQ_LEN = 256


class PCLDataset(torch.utils.data.Dataset):
    """Dataset of raw ``text``/``label`` pairs; tokenization is deferred to ``collate_fn``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, return_tensors='pt',
            padding=True, truncation=True, max_length=...)``; its return value
            must support item assignment (``result['labels'] = ...``).
        input_set: Object indexable by ``'text'`` and ``'label'`` whose entries
            can be converted to lists (e.g. a pandas DataFrame or a dict of lists).
        max_seq_len: Optional truncation length passed to the tokenizer. When
            ``None`` (the default) the module-level ``MAX_SEQ_LEN`` is looked up
            at collate time, which is the original behaviour.
    """

    def __init__(self, tokenizer, input_set, max_seq_len=None):
        self.tokenizer = tokenizer
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])
        self.max_seq_len = max_seq_len

    def collate_fn(self, batch):
        """Tokenize a list of ``__getitem__`` samples into one padded batch.

        Args:
            batch: Iterable of dicts, each with a ``'text'`` and a ``'label'`` key.

        Returns:
            The tokenizer output with an added ``'labels'`` entry holding the
            batch labels as an int64 tensor.
        """
        texts = [sample['text'] for sample in batch]
        labels = [sample['label'] for sample in batch]

        # Resolve the limit per call so a later change to MAX_SEQ_LEN is still honoured.
        max_length = MAX_SEQ_LEN if self.max_seq_len is None else self.max_seq_len

        encodings = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        # Class-index targets for cross-entropy must be int64; pin the dtype so
        # bool/float label columns do not produce an unusable tensor.
        encodings['labels'] = torch.tensor(labels, dtype=torch.long)
        return encodings

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the untokenized sample at ``idx`` as ``{'text': ..., 'label': ...}``."""
        return {'text': self.texts[idx], 'label': self.labels[idx]}


In [None]:
# Tokenizer loaded from the pretrained "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Wrap both splits with the same tokenizer.
# Note: `test_dataset` is built from the validation split (`val_df`).
train_dataset, test_dataset = (
    PCLDataset(tokenizer, split) for split in (train_df, val_df)
)


In [None]:
# Debug aid: materialize every training sample, then run collate_fn on the first 10 and print the resulting encodings
# batch = [sample for sample in train_dataset]

# encodings = train_dataset.collate_fn(batch[:10])

# for key, value in encodings.items():
#   print(f"{key}: {value.numpy().tolist()}")

In [None]:
# Load the pretrained "roberta-base" checkpoint into a sequence-classification
# model (no num_labels is passed, so the library default applies), then move
# it to the selected device.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")
model = model.to(device)

In [None]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")


# inputs['labels'] =  torch.tensor([1]).unsqueeze(0)
# inputs.to(device)

# outputs = model(**inputs)

# loss = outputs.loss

# logits = outputs.logits

In [None]:
class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy."""

    # Per-class loss weights indexed by label id: errors on label 1 cost 10x
    # as much as errors on label 0. Override on a subclass to change them.
    class_weights = (1.0, 10.0)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of keyword arguments for ``model``; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of just
                the loss. Trainer asks for this form during evaluation/prediction.
            **kwargs: Extra keyword arguments that newer Trainer versions pass
                (e.g. ``num_items_in_batch``); accepted and ignored.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Forward pass.
        outputs = model(**inputs)
        logits = outputs.logits
        # Build the weights on the logits' device so the loss works wherever the
        # model actually runs, not only on the notebook-level `device`.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Optimisation settings; anything not listed keeps the TrainingArguments default.
training_args = TrainingArguments(
    output_dir='./experiment/pcl',
    learning_rate=1e-4,
    logging_steps=100,
    per_device_train_batch_size=12,
    num_train_epochs=3,
    # The dataset yields raw {'text', 'label'} dicts that are tokenized inside
    # collate_fn. Keep every key: Trainer versions that filter sample keys
    # against the model's forward() signature would otherwise drop 'text'
    # before the collator sees it.
    remove_unused_columns=False,
)

# No eval_dataset is supplied, so this run only trains.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=train_dataset.collate_fn,
)
trainer.train()