<a href="https://colab.research.google.com/github/Deezzznutz/AizenFormat/blob/main/aiken_format.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
!pip install transformers
!pip install torch



In [52]:
import json
from transformers import BertTokenizer, BertForMultipleChoice, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.utils.data import Dataset, random_split

In [53]:
# Load dataset
with open('sql_questions.json', 'r') as f:
    dataset = json.load(f)

print(dataset[0])

{'question': 'Which operator can be used for pattern matching in SQL?', 'options': {'A': 'LIKE', 'B': 'MATCH', 'C': 'EQUAL', 'D': 'IN'}, 'answer': 'A'}


In [54]:
# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [55]:
# Preprocessing function
def preprocess_data(dataset):
    inputs = {
        "input_ids": [],
        "attention_mask": []
    }
    labels = []
    for entry in dataset:
        question = entry["question"]
        options = entry["options"]
        answer = entry["answer"]

        # Tokenize question and each option
        encoded_options = [tokenizer.encode_plus(
            question, options[opt],
            max_length=128,
            padding="max_length",
            truncation=True
        ) for opt in options]

        input_ids = [opt["input_ids"] for opt in encoded_options]
        attention_mask = [opt["attention_mask"] for opt in encoded_options]

        inputs["input_ids"].append(input_ids)
        inputs["attention_mask"].append(attention_mask)
        labels.append(ord(answer) - ord('A'))  # Convert answer to index

    return inputs, labels

In [56]:
# Dataset class
class SQLQuestionsDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            "input_ids": torch.tensor(self.inputs["input_ids"][idx]),
            "attention_mask": torch.tensor(self.inputs["attention_mask"][idx])
        }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [57]:
# Split dataset into train and eval sets
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

train_inputs, train_labels = preprocess_data(train_dataset)
eval_inputs, eval_labels = preprocess_data(eval_dataset)

train_dataset = SQLQuestionsDataset(train_inputs, train_labels)
eval_dataset = SQLQuestionsDataset(eval_inputs, eval_labels)

In [58]:
# Load pre-trained model
model = BertForMultipleChoice.from_pretrained("bert-base-uncased")

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
# Training arguments with improvements
training_args = TrainingArguments(
    output_dir='./results',
    run_name='my_unique_run_name',
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=35,             # Increased number of epochs
    learning_rate=1e-5,  # Lower learning rate
    weight_decay=0.2,  # Slightly increased weight decay
    warmup_steps=1000,   # Increased warmup steps
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    eval_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,
)

In [92]:
# Early stopping callback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [93]:
for param in model.bert.parameters():
    param.requires_grad = False

In [94]:
# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping_callback],
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.291874
2,No log,1.291869
3,No log,1.291862
4,No log,1.291851
5,No log,1.291838
6,No log,1.291821
7,No log,1.291801
8,No log,1.291778
9,No log,1.291751
10,No log,1.291722


TrainOutput(global_step=105, training_loss=1.1570202509562175, metrics={'train_runtime': 165.4545, 'train_samples_per_second': 8.25, 'train_steps_per_second': 0.635, 'total_flos': 359143365934080.0, 'train_loss': 1.1570202509562175, 'epoch': 35.0})

In [95]:
# Evaluate the model on the eval dataset
eval_results = trainer.evaluate()

print(eval_results)  # Print out the evaluation metrics


{'eval_loss': 1.2900588512420654, 'eval_runtime': 0.2603, 'eval_samples_per_second': 38.41, 'eval_steps_per_second': 3.841, 'epoch': 35.0}


In [98]:
trainer.evaluate(eval_dataset)

{'eval_loss': 1.2900588512420654,
 'eval_runtime': 0.3079,
 'eval_samples_per_second': 32.476,
 'eval_steps_per_second': 3.248,
 'epoch': 35.0}

In [99]:
model.save_pretrained("path_to_save_model")
tokenizer.save_pretrained("path_to_save_model")


('path_to_save_model/tokenizer_config.json',
 'path_to_save_model/special_tokens_map.json',
 'path_to_save_model/vocab.txt',
 'path_to_save_model/added_tokens.json')