In [95]:
import torch
import torch.nn as nn
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
import numpy as np
from huggingface_hub import HfFolder, notebook_login
import sys

import os

# Location of the project checkout. A CODE_REPO_PATH exported before the kernel
# started takes precedence; the literal below is only the fallback used on the
# original author's machine.
os.environ.setdefault("CODE_REPO_PATH", "/home/haokunliu/past-interaction-learning")

code_repo_path = os.environ.get("CODE_REPO_PATH")

# Put the checkout's `code/` directory on the import path so project-local
# modules can be imported. Guarded so that re-running this cell does not keep
# appending the same entry.
_code_dir = f'{code_repo_path}/code'
if _code_dir not in sys.path:
    sys.path.append(_code_dir)
import matplotlib.pyplot as plt


In [47]:
%load_ext autoreload
%autoreload 2
from data_loader import get_data
from dicts import LABEL_DICT, PROMPT_NAME_DICT, reverse_dict
from RoBERTa_trainer import prepare_trainer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [109]:
# Experiment grid: the training-set sizes to sweep, and the random seeds whose
# test accuracies are averaged for each size.
train_sizes, SEEDS = [1000], [49]

In [110]:
# Keyword arguments forwarded verbatim to `prepare_trainer` via `**args`.
# NOTE(review): most keys look like Hugging Face `TrainingArguments` fields and
# `use_ood_reviews` looks like a data-selection switch — confirm against
# RoBERTa_trainer.prepare_trainer.
args = dict(
    # optimisation schedule
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # logging / evaluation / checkpointing cadence
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    output_dir="/net/scratch/haokunliu/outputs",
    # which out-of-distribution reviews to use
    use_ood_reviews="all",
)

In [111]:
# Task identifier passed as `task_name` to `prepare_trainer` in the training loop.
TASK = "hotel_reviews"

In [112]:
# For every training-set size, train one model per seed and store the
# seed-averaged accuracy measured on the test split.
#
# NOTE: `trainer`, the three dataset splits, `results` and `train_size` from the
# final iteration stay bound after the loop; later cells rely on them.
test_accs = []
for train_size in train_sizes:
    seed_accs = []
    for SEED in SEEDS:
        trainer, train_dataset, test_dataset, val_dataset = prepare_trainer(
            model_id="roberta-base",
            task_name=TASK,
            num_train=train_size,
            num_test=300,
            num_val=300,
            seed=SEED,
            **args,
        )
        trainer.train()
        results = trainer.evaluate(test_dataset)
        seed_accs.append(results['eval_accuracy'])

    # Mean over seeds for this training-set size.
    test_acc = sum(seed_accs) / len(SEEDS)
    test_accs.append(test_acc)

Setting seed to 49


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


task_name: hotel_reviews
Loading all OOD hotel reviews.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.691,0.693039,0.5
2,0.6928,0.688943,0.5
3,0.6883,0.657952,0.7
4,0.4732,0.430357,0.806667
5,0.2879,0.7355,0.746667
6,0.1783,0.30413,0.896667
7,0.1445,0.283222,0.91
8,0.1448,0.671071,0.81
9,0.1174,0.636937,0.863333
10,0.13,0.542364,0.89


In [113]:
# Show the summary (features and row count) of each split returned by the last
# `prepare_trainer` call, in train / test / validation order.
for split in (train_dataset, test_dataset, val_dataset):
    print(split)

Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 800
})
Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 300
})
Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 300
})


In [114]:
# Display the seed-averaged test accuracy recorded for each entry of `train_sizes`.
test_accs

[0.7966666666666666]

In [115]:
# Rebind `args` without the `use_ood_reviews` entry so that the following
# `prepare_trainer` calls can pass `use_ood_reviews` explicitly alongside `**args`.
# NOTE(review): the remaining keys look like Hugging Face `TrainingArguments`
# fields — confirm against RoBERTa_trainer.prepare_trainer.
args = dict(
    # optimisation schedule
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # logging / evaluation / checkpointing cadence
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    output_dir="/net/scratch/haokunliu/outputs",
)

In [116]:
def load_test_split(use_ood_reviews):
    """Return only the test split built by `prepare_trainer` for one OOD setting.

    Every call uses the same fixed configuration (roberta-base, the
    'hotel_reviews' task, num_train=10000, num_test=300, num_val=100, seed=49)
    plus the shared `args`; only `use_ood_reviews` varies. The trainer and the
    train/validation splits that `prepare_trainer` also returns are discarded.

    Args:
        use_ood_reviews: value forwarded unchanged to `prepare_trainer`
            (this notebook uses 'all', 'Chicago', 'non-Chicago' and 'None').

    Returns:
        The third element of the tuple returned by `prepare_trainer`.
    """
    _, _, test_split, _ = prepare_trainer(
        model_id="roberta-base",
        task_name='hotel_reviews',
        num_train=10000,
        num_test=300,
        num_val=100,
        seed=49,
        use_ood_reviews=use_ood_reviews,
        **args,
    )
    return test_split


# Build the four evaluation sets in the same order as before, so seeding and
# log output are unchanged.
ood_all = load_test_split('all')
ood_chicago = load_test_split('Chicago')
ood_non_chicago = load_test_split('non-Chicago')
# NOTE(review): 'None' is passed as a string, not the None object — presumably
# the value that selects the regular (non-OOD) test set; confirm in prepare_trainer.
our_test_set = load_test_split('None')

Setting seed to 49


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


task_name: hotel_reviews
Loading all OOD hotel reviews.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Setting seed to 49


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


task_name: hotel_reviews
Loading Chicago OOD hotel reviews.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Setting seed to 49


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


task_name: hotel_reviews
Loading non-Chicago OOD hotel reviews.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Setting seed to 49


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


task_name: hotel_reviews


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [117]:
# Score the `trainer` left over from the training loop on each evaluation set:
# first collect the metric dicts, then the raw prediction outputs. The order of
# calls (all evaluates, then all predicts) matches the four sets below.
eval_sets = (ood_all, ood_chicago, ood_non_chicago, our_test_set)

(
    ood_results_all,
    ood_results_chicago,
    ood_results_non_chicago,
    results_our_test_set,
) = [trainer.evaluate(eval_set) for eval_set in eval_sets]

(
    ood_preds_all,
    ood_preds_chicago,
    ood_preds_non_chicago,
    preds_our_test_set,
) = [trainer.predict(eval_set) for eval_set in eval_sets]

In [118]:
# Print the metric dict of each evaluation set: all OOD, Chicago OOD,
# non-Chicago OOD, then our own test set.
for metrics in (
    ood_results_all,
    ood_results_chicago,
    ood_results_non_chicago,
    results_our_test_set,
):
    print(metrics)

{'eval_loss': 0.6060465574264526, 'eval_accuracy': 0.7966666666666666, 'eval_runtime': 1.8543, 'eval_samples_per_second': 161.786, 'eval_steps_per_second': 5.393, 'epoch': 20.0}
{'eval_loss': 0.522203803062439, 'eval_accuracy': 0.79375, 'eval_runtime': 1.312, 'eval_samples_per_second': 121.95, 'eval_steps_per_second': 3.811, 'epoch': 20.0}
{'eval_loss': 0.6615473031997681, 'eval_accuracy': 0.7933333333333333, 'eval_runtime': 2.1341, 'eval_samples_per_second': 140.575, 'eval_steps_per_second': 4.686, 'epoch': 20.0}
{'eval_loss': 0.24553726613521576, 'eval_accuracy': 0.9033333333333333, 'eval_runtime': 2.3908, 'eval_samples_per_second': 125.482, 'eval_steps_per_second': 4.183, 'epoch': 20.0}


In [119]:
# Recompute accuracy on the full OOD set by hand from the prediction output,
# as a cross-check of the `eval_accuracy` reported by `trainer.evaluate`.
# The predicted class is the arg-max over axis 1 of the first field of the output.
preds = np.argmax(ood_preds_all[0], axis=1)

# Fetch the label column once instead of indexing `ood_all['label']` on every
# iteration, and use the actual number of examples rather than a fixed 300.
labels = np.asarray(ood_all['label'])
ct = int((preds == labels).sum())
print(ct / len(labels))

0.7966666666666666


In [120]:
# Turn predicted class indices into label strings: index 0 becomes 'deceptive',
# any other index becomes 'truthful'. Iterates over every prediction instead of
# assuming exactly 300 of them.
preds_label = ['deceptive' if pred == 0 else 'truthful' for pred in preds]


In [121]:
import json

# Write the predicted label strings to a JSON file inside the repo checkout.
# NOTE: `train_size` is the value left behind by the training loop's last
# iteration, so the file name reflects the most recently trained model.
preds_path = f'{code_repo_path}/outputs/hotel_reviews/RoBERTa/train_{train_size}_preds.json'

# Create the output folder if needed so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(preds_path), exist_ok=True)

with open(preds_path, 'w') as f:
    json.dump(preds_label, f)

In [67]:
# Accuracy of the current `trainer` on the validation split.
# NOTE(review): this cell's execution counter is lower than that of the cells
# above, so the stored output may come from an earlier run — re-run to confirm.
results = trainer.evaluate(val_dataset)
val_accuracy = results["eval_accuracy"]
print(val_accuracy)

0.47


In [68]:
# Re-evaluate the current `trainer` on the test split; rebinds `results`,
# replacing the validation metrics stored by the previous cell.
results = trainer.evaluate(test_dataset)

In [69]:
# Display the accuracy entry of the most recent `trainer.evaluate` result.
results['eval_accuracy']

0.47