# Example Detection
This file provides an example of using the RoBERTa classifier. The demonstration performs binary classification on the G2D dataset.

In [1]:
from transformers.models.roberta.tokenization_roberta import RobertaTokenizer
from roberta.model import RoBERTClassifier
from roberta.dataloader import DataManager, en_binary_labels, id2label_binary
from roberta.trainer import SupervisedTrainer
from utils import append_single_utterances, append_progressing_utterances
import torch

# Pretrained checkpoint name; used below for both the tokenizer and the classifier.
model_name = "roberta-base"

### Prepare data

In [2]:
import pandas as pd
import os
import sys
import warnings
# Absolute location of the dataset directory, resolved relative to the
# current working directory.
dataset_path = os.path.abspath('../dataset')

# Register the dataset directory on sys.path, but only once, so re-running
# this cell does not pile up duplicate entries.
if dataset_path not in sys.path:
    sys.path.append(dataset_path)

# Suppress every warning for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

In [10]:
# Fixed seed for the per-dia_no draw of the human sample, so that the
# assembled dataset (and any split derived from it) is identical across runs.
HUMAN_SAMPLE_SEED = 42


def _read_sorted_csv(filename):
    """Load ``filename`` from ``dataset_path`` and return it ordered by ``dia_no``.

    The first CSV column is used as the index while reading; after sorting
    (ascending) the index is replaced by a fresh 0..n-1 range.
    """
    frame = pd.read_csv(os.path.join(dataset_path, filename), index_col=0)
    return frame.sort_values(by='dia_no', ascending=True, ignore_index=True)


# human dataset = Missing Sentence Completion dataset
human_dataset_gpt = _read_sorted_csv('Missing_Sentence_gpt.csv')
human_dataset_llama = _read_sorted_csv('Missing_Sentence_llama.csv')
# Pool both files and keep exactly one randomly chosen row per dia_no.
human_dataset = (
    pd.concat([human_dataset_gpt, human_dataset_llama], ignore_index=True)
    .groupby('dia_no')
    .sample(n=1, random_state=HUMAN_SAMPLE_SEED)
    .reset_index(drop=True)
)
human_dataset['label'] = 'human'

# ai dataset = llama and gpt G2D datasets
# Only the dia_no and dia columns are kept; every row is labelled 'ai'.
llama_dataset = _read_sorted_csv('G2D_llama.csv')[['dia_no', 'dia']].assign(label='ai')
gpt_dataset = _read_sorted_csv('G2D_gpt.csv')[['dia_no', 'dia']].assign(label='ai')

# full dataset = concatenation of all classes
dataset_df = pd.concat([human_dataset, llama_dataset, gpt_dataset], ignore_index=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name)


In [8]:
# Uncomment the following lines to expand the dataset with single and/or progressing utterances
# dataset_df = append_single_utterances(dataset_df).reset_index(drop=True)
# dataset_df = append_progressing_utterances(dataset_df).reset_index(drop=True)

In [None]:
# Note: for a next_response dataset, the name needs to include "next_response"
# to ensure correct padding.
dataset_name = 'g2d_binary'

print('Log INFO: initializing dataset...')
data = DataManager(
    dataset_df,
    id2label_binary,
    0.2,  # NOTE(review): presumably the held-out split fraction; confirm against DataManager
    tokenizer,
    dataset_name,
    16,  # NOTE(review): presumably the batch size; confirm against DataManager
    load_from_cache=False,
)
print("Log INFO: dataset initialization finished.")

### Run Model

In [None]:
# Build and train the model
print('-' * 32 + 'classify' + '-' * 32)
classifier = RoBERTClassifier(model_name, id2label_binary)

# Training hyperparameters handed to SupervisedTrainer.
args = {
    'num_train_epochs': 10,
    'weight_decay': 0.1,
    'lr': 1e-5,
    'warm_up_ratio': 0.1,
}
trainer = SupervisedTrainer(data, classifier, en_binary_labels, id2label_binary, args)

# Checkpoint file name; the test cell loads it back from the
# "trained_model" directory.
ckpt_name = 'g2d_binary_roberta.pt'
trainer.train(ckpt_name=ckpt_name)

In [None]:
# Load and test the model
# The checkpoint is read back as a whole pickled model object (its
# state_dict() is taken below), so weights_only must be disabled explicitly:
# PyTorch >= 2.6 defaults to weights_only=True, which rejects such files.
# SECURITY: this unpickles arbitrary Python objects -- only load checkpoints
# produced by your own training run, never files from an untrusted source.
saved_model = torch.load(os.path.join("trained_model", ckpt_name), weights_only=False)
trainer.model.load_state_dict(saved_model.state_dict())
trainer.test(data.test_dataloader, content_level_eval=True)