In [None]:
# -*- coding: utf-8 -*-
"""MultipleCorrect_CS542_CompetitionTesting.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R2uTvKyoEHEBKpk-mW63CM20oKcPV_BR
"""

!pip install transformers datasets evaluate

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from huggingface_hub import notebook_login
# from transformers import TFPegasusModel
from transformers import pipeline
import os
import json
import pickle
import numpy as np
import evaluate
import transformers

# Mount Google Drive at /content/drive; the question files loaded later in
# this cell are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')
# Interactive Hugging Face Hub login (huggingface_hub.notebook_login).
# NOTE(review): presumably required because the training arguments further
# down set push_to_hub=True -- confirm.
notebook_login()
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer


# Load the full Autocast question bank and the competition test set from the
# mounted shared drive. Context managers close the file handles right away
# instead of leaving them open until garbage collection.
with open('/content/drive/Shareddrives/Autocast Competition/autocast-master/autocast/autocast_questions.json') as f:
    autocast_questions = json.load(f)
with open('/content/drive/Shareddrives/Autocast Competition/autocast-master/competition/autocast_competition_test_set.json') as f:
    test_questions = json.load(f)
test_ids = [q['id'] for q in test_questions]

# Keep only multiple-choice questions that are NOT part of the competition
# test set. The membership test uses a set so each lookup is O(1) rather than
# a linear scan of `test_ids` for every question; `test_ids` itself stays a
# list for any later cell that uses it.
_test_id_lookup = set(test_ids)
mc_questions = [x for x in autocast_questions if x['qtype'] == 'mc' and x['id'] not in _test_id_lookup]


In [None]:
# Group the multiple-choice questions by how many answer options they have:
# divided_data maps  <number of choices>  ->  list of question dicts.
divided_data = {}
data = mc_questions
for entry in data:
    choices = entry.get("choices")
    # Skip questions that carry no choice list. The previous guard compared
    # len(...) against None, which can never be true, so a missing/null
    # "choices" field raised instead of being skipped.
    if choices is None:
        continue
    divided_data.setdefault(len(choices), []).append(entry)

# Save divided datasets into separate JSON files
for num_choices, dataset in divided_data.items():
    with open(f"dataset_choices_{num_choices}.json", "w") as f:
        json.dump(dataset, f)

In [None]:
from transformers import AutoModelForMultipleChoice

# Load the "roberta-base" checkpoint with a multiple-choice head, plus the
# matching tokenizer.
# NOTE(review): `tokenizer` is read as a module-level global by
# preprocess_data, and both `model` and `tokenizer` are re-assigned inside the
# training loop further down, so the `model` created here appears to be unused
# by the visible cells -- confirm before removing.
model = AutoModelForMultipleChoice.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
import json
import random
from datasets import Dataset

from transformers import RobertaTokenizer, RobertaForMultipleChoice, TrainingArguments, Trainer

def preprocess_data(num_choices, dataset):
    """Tokenize multiple-choice questions into a Hugging Face ``Dataset``.

    Every kept question is paired with each of its choices and encoded with
    the module-level ``tokenizer`` (padded/truncated to 512 tokens), so one
    row holds ``input_ids`` and ``attention_mask`` of shape
    ``(len(choices), 512)`` as plain nested lists, plus an integer ``label``.

    Questions are skipped when their ``"answer"`` is missing/empty, is not a
    single-character string, or does not map (``"A"`` -> 0, ``"B"`` -> 1, ...)
    to an index inside the question's own choice list.

    Args:
        num_choices: Number of options of the group being processed. Kept for
            interface compatibility; the function does not read it.
        dataset: Iterable of question dicts with ``"question"``, ``"choices"``
            and ``"answer"`` keys.

    Returns:
        A ``datasets.Dataset`` with columns ``input_ids``, ``attention_mask``
        and ``label``. It has zero rows when no question survives filtering.
    """
    columns = ("input_ids", "attention_mask", "label")

    def format_example(example):
        answer = example.get("answer")
        # Only a single letter can be turned into a class index; anything
        # else (None, "", multi-letter strings, lists) is dropped.
        if not isinstance(answer, str) or len(answer) != 1:
            return None

        question = example["question"]
        choices = example["choices"]
        label = ord(answer) - ord("A")
        # Reject letters outside the available options (e.g. lowercase or a
        # letter past the last choice); such a label would be out of range
        # for the model. This also rejects questions with no choices.
        if not 0 <= label < len(choices):
            return None

        # Encode all (question, choice) pairs in one call. No return_tensors:
        # plain lists avoid the extra singleton batch axis that per-pair
        # "pt" tensors would add to every row.
        encoded = tokenizer(
            [question] * len(choices),
            list(choices),
            add_special_tokens=True,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "label": label,
        }

    formatted_dataset = [format_example(example) for example in dataset]
    formatted_dataset = [example for example in formatted_dataset if example is not None]
    # Build from a fixed column list so an empty result yields an empty
    # Dataset instead of an IndexError on formatted_dataset[0].
    return Dataset.from_dict({k: [d[k] for d in formatted_dataset] for k in columns})


In [None]:
# Accuracy metric object from the `evaluate` library, loaded once and reused
# by compute_metrics on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for those predicted classes
        measured against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# newdataset = {}
# newdataset[9] = divided_data[9]
# newdataset[10] = divided_data[10]
# newdataset[11] = divided_data[11]
# newdataset[12] = divided_data[12]
# newdataset[13] = divided_data[13]
# newdataset[16] = divided_data[16]

# newdataset.keys()

In [None]:
# Fine-tune one RoBERTa multiple-choice model per choice-count group.

# Fixed seed so the 80/20 train/eval split is identical on every run.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# The tokenizer does not depend on the group, so load it once -- and before
# any preprocessing: preprocess_data reads the module-level `tokenizer`, so
# creating it only after preprocessing (as before) made the first iteration
# silently depend on a tokenizer left over from an earlier cell.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Iterate over divided datasets
for num_choices, dataset in divided_data.items():
    # Shuffle a copy so `divided_data` is not mutated and re-running this
    # cell starts from the same data.
    shuffled = list(dataset)
    split_rng.shuffle(shuffled)
    split_idx = int(0.8 * len(shuffled))
    train_examples, eval_examples = shuffled[:split_idx], shuffled[split_idx:]

    # Tiny groups (e.g. a single question) cannot be split; skip them rather
    # than crash while building an empty training set.
    if not train_examples or not eval_examples:
        print(f"Skipping {num_choices}-choice group: {len(shuffled)} question(s) is too few for a train/eval split.")
        continue

    # Preprocess the dataset
    train_dataset = preprocess_data(num_choices, train_examples)
    eval_dataset = preprocess_data(num_choices, eval_examples)

    # Preprocessing drops questions without a usable answer, so re-check.
    if len(train_dataset) == 0 or len(eval_dataset) == 0:
        print(f"Skipping {num_choices}-choice group: no usable examples left after preprocessing.")
        continue

    # Map class index <-> option letter ("A", "B", ...).
    id2label = {i: chr(ord("A") + i) for i in range(num_choices)}
    label2id = {v: k for k, v in id2label.items()}

    # Initialize a fresh model for this group.
    # NOTE(review): the multiple-choice head presumably scores each choice
    # independently, making num_labels/id2label metadata only -- confirm.
    model = RobertaForMultipleChoice.from_pretrained("roberta-base"
      , num_labels=num_choices, id2label=id2label, label2id=label2id)

    # Set up training arguments; checkpoints go to a per-group directory.
    training_args = TrainingArguments(
        output_dir=f"roberta_mc_model_{num_choices}",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        gradient_accumulation_steps=4,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )


    # Train the model
    trainer.train()

    # # Save the model
    # trainer.save_model(f"./model_choices_{num_choices}")