### Import libraries

In [1]:
# Connect drive
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install simpletransformers
import torch
import pandas as pd
from simpletransformers.classification import ClassificationModel
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Select GPU

In [2]:
# Check if a GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


### Load datasets

In [3]:
train = pd.read_csv('/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/train.csv')
train = train.drop(train.columns[0], axis=1)

valid = pd.read_csv('/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/test.csv')
valid = valid.drop(valid.columns[0], axis=1)

In [4]:
train.head(), valid.head()

(                                                TEXT  LABEL
 0  director dirk shafer and co-writer greg hinton...      0
 1  a charming , quirky and leisurely paced scotti...      1
 2  the price was good ,  and came quickly though ...      1
 3  i was looking forward to this game for a coupl...      0
 4  arguably the year 's silliest and most incoher...      0,
                                                 TEXT
 0   good to know if you can t find these elsewhere .
 1  love it !  the grill plates come out and pop i...
 2  i m convinced this was a poorly executed refur...
 3  i would never have complained about that if it...
 4  the photo shows the same whole ,  large candie...)

## Soft Prompt tunning

### Set up soft prompts and add them to the datasets

In [5]:
# Set up the soft prompts
prompt1 = "This text is about a positive event"
prompt2 = "This text is about a negative event"

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Encode the soft prompts
prompt1_enc = tokenizer.encode(prompt1, add_special_tokens=False)
prompt2_enc = tokenizer.encode(prompt2, add_special_tokens=False)

# For the training set (encoding + soft_prompt)
def add_prompts(input_text, label):
    if label == 1:  # positive sentiment
        prompts = prompt1_enc
    else:  # negative sentiment
        prompts = prompt2_enc
    input_text = prompts + tokenizer.encode(input_text, add_special_tokens=False)
    return input_text

# For the validation set (only encoding)
def encode_text(input_text):
    input_text = tokenizer.encode(input_text, add_special_tokens=False)
    return input_text

In [6]:
# Encode + softprompt training set
train['TEXT'] = train.apply(lambda x: add_prompts(x['TEXT'], x['LABEL']), axis=1)

# Encode validation sentences
valid['TEXT'] = valid['TEXT'].apply(encode_text)

In [7]:
train.head(), valid.head()

(                                                TEXT  LABEL
 0  [713, 2788, 16, 59, 10, 2430, 515, 19709, 385,...      0
 1  [713, 2788, 16, 59, 10, 1313, 515, 102, 18452,...      1
 2  [713, 2788, 16, 59, 10, 1313, 515, 627, 425, 2...      1
 3  [713, 2788, 16, 59, 10, 2430, 515, 118, 21, 54...      0
 4  [713, 2788, 16, 59, 10, 2430, 515, 5384, 44527...      0,
                                                 TEXT
 0  [8396, 7, 216, 114, 47, 64, 326, 465, 209, 514...
 1  [17693, 24, 27785, 1437, 5, 20212, 12957, 283,...
 2  [118, 475, 7013, 42, 21, 10, 12101, 9390, 1788...
 3  [118, 74, 393, 33, 7311, 59, 14, 114, 24, 74, ...
 4  [627, 1345, 924, 5, 276, 1086, 2156, 1437, 739...)

### Set up model

In [8]:
# Set up the model configuration
config = RobertaConfig.from_pretrained('roberta-large', num_labels=2) # 0 and 1

# Initialize the model with the modified input layer
model = RobertaModel.from_pretrained('roberta-large', config=config)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Custom Class (not working)

In [14]:
class SPRoBERTa(torch.nn.Module):
    def __init__(self, model):
        super(SPRoBERTa, self).__init__()
        self.model = model
        self.dense = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        print("input_ids:", input_ids.shape)
        print("attention_mask:", attention_mask.shape if attention_mask is not None else None)
        print("token_type_ids:", token_type_ids.shape if token_type_ids is not None else None)
        print("position_ids:", position_ids.shape if position_ids is not None else None)
        print("head_mask:", head_mask.shape if head_mask is not None else None)
        print("inputs_embeds:", inputs_embeds.shape if inputs_embeds is not None else None)
        
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)
        if hasattr(outputs, 'hidden_states'):
            last_hidden_state = self.remove_prompts(outputs.hidden_states[-1])
            pooled_output = self.calculate_pooled_output(last_hidden_state)
            logits = self.classify(pooled_output)
            loss = self.calculate_loss(logits, labels) if labels is not None else None
            output = (logits,) + outputs[2:]
        else:
            logits = outputs.logits
            loss = self.calculate_loss(logits, labels) if labels is not None else None
            output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


    
    def remove_prompts(self, hidden_state):
        return hidden_state[:, 3:, :]
    
    def calculate_pooled_output(self, hidden_state):
        return hidden_state.mean(dim=1)
    
    def classify(self, pooled_output):
        return self.dense(pooled_output)
    
    def calculate_loss(self, logits, labels):
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        return loss

# Replace the original model with the soft prompting version
model = ClassificationModel('roberta', 'roberta-large', num_labels=2, use_cuda=False, args={'overwrite_output_dir': True})
model.model = SPRoBERTa(model.model)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

### Hyperparameters 

In [10]:
train_args = {
    'reprocess_input_data': True,
    'output_hidden_states': True,
    'overwrite_output_dir': True,
    'num_train_epochs': 3,
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 16,
    'gradient_accumulation_steps': 1,
    'warmup_steps': 200,
    'weight_decay': 0.01,
    'adam_epsilon': 1e-8,
    'max_grad_norm': 1.0,
    'max_steps': -1,
    'lr_scheduler_type': 'linear',
    'logging_dir': './logs',
    'logging_steps': 100,
    'eval_steps': 200,
    'save_steps': 500,
    'save_model_every_epoch': True,
    'evaluate_during_training': False,
    'evaluate_during_training_steps': 500,
    'evaluate_during_training_verbose': False,
    'output_dir': './results',
    'cache_dir': './cache',
    'no_cache': False,
    'use_early_stopping': True,
    'early_stopping_patience': 5,
    'early_stopping_delta': 0.01,
    'early_stopping_metric': 'eval_loss',
    'early_stopping_metric_minimize': True,
    'manual_seed': None,
    'encoding': None,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'fp16_backend': 'auto',
    'wandb_project': None,
    'wandb_kwargs': {},
    'args': {}
}


### Training

In [15]:
# Train the model
model.train_model(train, args=train_args)

# Evaluate the model on the validation set
result, model_outputs, wrong_predictions = model.eval_model(valid)

  0%|          | 0/2000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/250 [00:00<?, ?it/s]

input_ids: torch.Size([8, 128])
attention_mask: torch.Size([8, 128])
token_type_ids: None
position_ids: None
head_mask: None
inputs_embeds: None


TypeError: ignored

### Predictions

In [None]:
# Get the predicted labels and row_ids
predicted_labels = [int(round(out[1])) for out in model_outputs]
row_ids = valid['row_id'].tolist()

# Create a DataFrame with the predicted labels and row_ids
predictions_df = pd.DataFrame({'row_id': row_ids, 'label': predicted_labels})

# Save the DataFrame to a csv file
predictions_df.to_csv('predictions.csv', index=False)