In [20]:
# !pip install auto-gptq
# !pip install optimum
# !pip install bitsandbytes
# !pip install --upgrade git+https://github.com/huggingface/transformers
# !pip install --upgrade torch torchvision
# !pip install --upgrade accelerate

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
import transformers
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
import torch
import json
import requests
from tqdm import tqdm
import evaluate

In [22]:
# Check the available device and use GPU if available, otherwise use CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Print the device being used
print(f'Working on {device}')

Working on cuda


In [23]:
# Create Bitsandbytes configuration
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [24]:
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased",quantization_config=bnb_config)
#model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [26]:
# enable gradient check pointing
model.gradient_checkpointing_enable()
# enable quantized training
model = prepare_model_for_kbit_training(model)

In [27]:
#model.modules

In [28]:
# LoRA config
config = LoraConfig(
    r=2,
    lora_alpha=8,
    target_modules=['q_proj',
        'k_proj',
        'v_proj',
        'dense'],
    lora_dropout=0.1,
    bias="none",
    task_type="QUESTION_ANS"
)

# LoRA trainable version of model
model = get_peft_model(model, config)

# trainable parameter count
#model.print_trainable_parameters()

In [29]:
def read_data(path):
    """
    Read SQuAD data from a JSON file.

    Parameters:
    - path: Path to the JSON file containing SQuAD data

    Returns:
    - contexts: List of contexts (passages)
    - questions: List of questions
    - answers: List of answers
    """
    # Open the JSON file and load the data
    with open(path, 'r', encoding='utf-8') as f:
        squad = json.load(f)

    # Initialize lists to store contexts, questions, and answers
    contexts = []
    questions = []
    answers = []

    # Iterate over groups in the SQuAD data
    for group in squad.get('data', []):
        # Iterate over paragraphs in the group
        for passage in group.get('paragraphs', []):
            # Get the context (passage)
            context = passage.get('context', '')
            # Iterate over questions and answers in the paragraph
            for qa in passage.get('qas', []):
                # Get the question
                question = qa.get('question', '')
                # Iterate over answers for the question
                for answer in qa.get('answers', []):
                    # Append context, question, and answer to their respective lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    # Return the lists of contexts, questions, and answers
    return contexts, questions, answers

def add_end_index(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Check if the answer is correctly positioned
        for offset in [0, -1, -2]:
            if context[start_idx + offset:end_idx + offset] == gold_text:
                # Update answer start and end indices
                answer['answer_start'] = start_idx + offset
                answer['answer_end'] = end_idx + offset
                break  # Break loop once correct offset is found

def add_token_positions(encodings, answers):
    """
    Adds token positions for answers to encodings.

    Parameters:
    - encodings: Encodings object containing tokenized inputs
    - answers: List of dictionaries containing answer positions

    Returns:
    None (modifies encodings in place)
    """
    start_positions = []
    end_positions = []

    # Loop through each answer
    for i, answer in enumerate(answers):
        # Convert character positions to token positions
        start_positions.append(encodings.char_to_token(i, answer['answer_start']))
        end_positions.append(encodings.char_to_token(i, answer['answer_end'] - 1))

        # Handle cases where answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    # Update encodings with start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

class SQuAD_Dataset(torch.utils.data.Dataset):
    """
    Custom dataset class for SQuAD.

    Parameters:
    - encodings: Encodings object containing tokenized inputs
    """
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset.

        Parameters:
        - idx: Index of the item to retrieve

        Returns:
        Dictionary containing tensors for each key in the encodings
        """
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
        Integer representing the length of the dataset
        """
        return len(self.encodings.input_ids)


In [30]:
# Read training data
contexts, questions, answers = read_data('/accounts/grad/fangyuan_li/259/data/train-v2.0.json')
# Read validation data
valid_contexts, valid_questions, valid_answers = read_data('/accounts/grad/fangyuan_li/259/data/val-v2.0.json')
# Split train-v2.0 into train and test sets
train_contexts = contexts[5000:]
train_questions = questions[5000:]
train_answers = answers[5000:]

test_contexts = contexts[:5000]
test_questions = questions[:5000]
test_answers = answers[:5000]

# Add indexes
add_end_index(train_answers, train_contexts)
add_end_index(valid_answers, valid_contexts)
add_end_index(test_answers, test_contexts)

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

# Add token positions for training data
add_token_positions(train_encodings, train_answers)
# Add token positions for validation data
add_token_positions(valid_encodings, valid_answers)
# Add token positions for test data
add_token_positions(test_encodings, test_answers)

# Create training dataset
train_dataset = SQuAD_Dataset(train_encodings)
# Create validation dataset
valid_dataset = SQuAD_Dataset(valid_encodings)
# Create test dataset
test_dataset = SQuAD_Dataset(test_encodings)

# Define the dataloaders
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# valid_loader = DataLoader(valid_dataset, batch_size=16)
# test_loader = DataLoader(test_dataset, batch_size=16)

In [31]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token

In [32]:
lr = 2e-4
batch_size = 16
num_epochs = 5

training_args = transformers.TrainingArguments(
    output_dir = "../test_trainer",
    num_train_epochs = num_epochs,
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    logging_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    gradient_accumulation_steps = 4,
    warmup_steps = 2,
    optim = "paged_adamw_8bit",
    eval_strategy = "epoch"
 )

In [33]:
trainer = transformers.Trainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    args = training_args
)

In [34]:
model.config.use_cache = False
trainer.train()



Epoch,Training Loss,Validation Loss
0,2.2897,1.462289
2,1.2643,1.236898
4,1.1596,1.198119




TrainOutput(global_step=6390, training_loss=1.464742597839642, metrics={'train_runtime': 13608.0216, 'train_samples_per_second': 30.064, 'train_steps_per_second': 0.47, 'total_flos': 1.0713658910662656e+17, 'train_loss': 1.464742597839642, 'epoch': 4.998044583496284})

In [35]:
trainer.save_model("../full_data/QLoRA2/")