## Fine-tuning Llama 3.1 (QLoRA) on LDC regeneration

In [1]:
import transformers
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sentence_transformers
import nltk
import glob
import datasets
import peft

In [2]:
cache_directory = '/scratch/alpine/anra7539' # For caching model init
checkpoint_directory = "/scratch/alpine/anra7539/llama3.1_qlora_finetuned" # For fine-tuned model checkpoint
inference_results = '/projects/anra7539/projects/representation_efficacy/ldc_reconstructions_llama3.1_qlora/reconstructed_text.json' # For storing inference output

## Preparing Data

In [3]:
folder_path_train = "/projects/anra7539/projects/representation_efficacy/ldc_data_train/"
folder_path_val = "/projects/anra7539/projects/representation_efficacy/ldc_data_val/"
folder_path_test = "/projects/anra7539/projects/representation_efficacy/ldc_data_test/"

In [4]:
train_data = []
val_data = []
test_data = []

for file_path in os.listdir(folder_path_train):  
    with open(folder_path_train+file_path, "r") as f:
        data = f.read()
    train_data.append(data)

for file_path in os.listdir(folder_path_val):  
    with open(folder_path_val+file_path, "r") as f:
        data = f.read()
    val_data.append(data)

for file_path in os.listdir(folder_path_test):  
    with open(folder_path_test+file_path, "r") as f:
        data = f.read()
    test_data.append(data)

In [5]:
text_amr_pairs_train = sum([a.split("::snt")[1:] for a in train_data], [])
text_amr_pairs_val = sum([a.split("::snt")[1:] for a in val_data], [])
text_amr_pairs_test = sum([a.split("::snt")[1:] for a in test_data], [])

In [6]:
text_amr_train = []

for i in tqdm(range(len(text_amr_pairs_train))):
    try:
        text_amr_train.append([text_amr_pairs_train[i].split("\n#")[0].strip(), text_amr_pairs_train[i].split("\n#")[1].split(".txt\n")[1].strip()])
    except:
        pass

texts_train = []
amrs_train = []
for text,amr in text_amr_train:
    texts_train.append(text)
    amrs_train.append(amr)

100%|██████████| 62238/62238 [00:00<00:00, 176098.87it/s]


In [7]:
text_amr_val = []

for i in tqdm(range(len(text_amr_pairs_val))):
    try:
        text_amr_val.append([text_amr_pairs_val[i].split("\n#")[0].strip(), text_amr_pairs_val[i].split("\n#")[1].split(".txt\n")[1].strip()])
    except:
        pass

texts_val = []
amrs_val = []
for text,amr in text_amr_val:
    texts_val.append(text)
    amrs_val.append(amr)

100%|██████████| 2548/2548 [00:00<00:00, 713328.43it/s]


In [8]:
text_amr_test = []

for i in tqdm(range(len(text_amr_pairs_test))):
    try:
        text_amr_test.append([text_amr_pairs_test[i].split("\n#")[0].strip(), text_amr_pairs_test[i].split("\n#")[1].split(".txt\n")[1].strip()])
    except:
        pass

texts_test = []
amrs_test = []
for text,amr in text_amr_test:
    texts_test.append(text)
    amrs_test.append(amr)

100%|██████████| 2721/2721 [00:00<00:00, 664185.60it/s]


In [9]:
train_df = datasets.Dataset.from_pandas(pd.DataFrame({'Text':texts_train, 'amr':amrs_train}))
test_df = datasets.Dataset.from_pandas(pd.DataFrame({'Text':texts_test, 'amr':amrs_test}))
val_df = datasets.Dataset.from_pandas(pd.DataFrame({'Text':texts_val, 'amr':amrs_val}))

## Initializing model and tokenizing data

In [10]:
name = "meta-llama/Llama-3.1-8B-Instruct"
device = "cuda:0"

model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                              device_map={"": torch.cuda.current_device()},
                                             cache_dir=cache_directory)

tokenizer = transformers.AutoTokenizer.from_pretrained(name, truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
def tokenize_function(example):
    start_prompt = 'Return the original text of the given Abstract Meaning Representation (AMR) structure.\n\nAMR:\n'
    end_prompt = '\n\nText: '
    
    full_texts = []
    prompt_texts = []
    
    for amr, text in zip(example["amr"], example["Text"]):
        prompt_text = start_prompt + amr + end_prompt
        full_text = prompt_text + text
        prompt_texts.append(prompt_text)
        full_texts.append(full_text)
    
    encoding = tokenizer(
        full_texts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=1024
    )
    
    input_ids = encoding.input_ids.to(device)
    labels = input_ids.clone()
    
    for i, prompt in enumerate(prompt_texts):
        prompt_encoding = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        )
        prompt_len = prompt_encoding.input_ids.shape[1]
        labels[i, :prompt_len] = -100
    
    example["input_ids"] = input_ids
    example["labels"] = labels
    return example


In [12]:
batch_size = 8

tokenized_datasets_train = train_df.map(tokenize_function, batched=True, batch_size = batch_size)
tokenized_datasets_val = val_df.map(tokenize_function, batched=True, batch_size = batch_size)
tokenized_datasets_test = test_df.map(tokenize_function, batched=True, batch_size = batch_size)

tokenized_datasets_train = tokenized_datasets_train.remove_columns(['Text', 'amr'])
tokenized_datasets_val = tokenized_datasets_val.remove_columns(['Text', 'amr'])
tokenized_datasets_test = tokenized_datasets_test.remove_columns(['Text', 'amr'])

Map:   0%|          | 0/55635 [00:00<?, ? examples/s]

Map:   0%|          | 0/1722 [00:00<?, ? examples/s]

Map:   0%|          | 0/1898 [00:00<?, ? examples/s]

## QLoRA

In [13]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [14]:
lora_config = peft.LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM 
)

In [15]:
peft_model = peft.get_peft_model(model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 13631488
all model parameters: 8043892736
percentage of trainable model parameters: 0.17%


In [20]:
peft_training_args = transformers.TrainingArguments(
    output_dir=checkpoint_directory,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-3,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    metric_for_best_model="eval_loss",
    load_best_model_at_end = True,
    greater_is_better=False,
    save_total_limit=1,
)


peft_trainer = transformers.Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=3)],
)

In [22]:
peft_trainer.train()

peft_model_path=checkpoint_directory

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

## Inference

In [11]:
base_model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                              device_map={"": torch.cuda.current_device()},
                                             cache_dir=cache_directory)
tokenizer = transformers.AutoTokenizer.from_pretrained(name, truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

peft_model = peft.PeftModel.from_pretrained(base_model, 
                                       checkpoint_directory,
                                       is_trainable=False)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [29]:
def reconstruct_text(amr, prompt, model, tokenizer):
    with torch.no_grad():
        input_text = f'''{prompt}\n\nAMR:\n{amr}\n\nText:'''
        input_tokens = tokenizer(input_text, return_tensors = "pt", truncation = True, 
                                 padding = 'max_length', max_length = 1024).to(device)
    
        outputs = model.generate(**input_tokens, max_new_tokens = 200, 
                                 pad_token_id = tokenizer.eos_token_id)
    
        reconstruction = tokenizer.decode(outputs[0], 
                                          skip_special_tokens = True).split("Text:\n")[1].split("\n")[0].strip()
    return reconstruction

In [30]:
prompt = 'Return the original text of the given Abstract Meaning Representation (AMR) structure.'

In [None]:
output_file = inference_results

if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        try:
            existing_data = [json.loads(line) for line in f]
        except json.JSONDecodeError:
            existing_data = []
else:
    existing_data = []

processed_indices = {item['index'] for item in existing_data}

In [None]:
with open(output_file, 'a') as f:
    for i in tqdm(range(len(amrs_test))):
        if i in processed_indices:
            continue 
        
        reconstructed_text = reconstruct_text(amrs_test[i], prompt, peft_model, tokenizer)
        
        result = {
            "index": i,
            "original_text": texts[i],
            "reconstructed_text": reconstructed_text
        }

        f.write(json.dumps(result) + "\n")

## Results

In [None]:
with open(inference_results, 'r') as f:
    data = [json.loads(line) for line in f]

full_reconstructions = pd.DataFrame(data)

### Average F1-score

In [None]:
def f1_score_strings(str1, str2):
    tokens1 = set(str1.lower().split())
    tokens2 = set(str2.lower().split())
    
    true_positives = len(tokens1 & tokens2)  
    false_positives = len(tokens1 - tokens2)  
    false_negatives = len(tokens2 - tokens1)  
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    
    return f1

In [None]:
full_reconstructions['f1_scores'] = full_reconstructions.apply(lambda x: f1_score_strings(x['original_text'], x['reconstructed_text']), axis = 1)

In [None]:
np.mean(full_reconstructions.f1_scores)

### Cosine Similarity

In [None]:
similarity_model = sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2')


def sent_similarity(str1, str2):
    embedding1 = similarity_model.encode(str1.lower())
    embedding2 = similarity_model.encode(str2.lower())
    
    return sentence_transformers.util.cos_sim(embedding1, embedding2)

In [None]:
full_reconstructions['cosine_similarity'] = full_reconstructions.apply(lambda x: sent_similarity(x['original_text'], x['reconstructed_text']), axis = 1)

In [None]:
np.mean(full_reconstructions.cosine_similarity)

### ROUGE scores

In [None]:
def compute_rouge_scores(reference_text, generated_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text.lower(), generated_text.lower())
    return scores

In [None]:
full_reconstructions['rouge_scores'] = full_reconstructions.apply(lambda x: compute_rouge_scores(x['original_text'], x['reconstructed_text']), axis = 1)
full_reconstructions['rouge_1'] = full_reconstructions.rouge_scores.map(lambda x: x['rouge1'].fmeasure)
full_reconstructions['rouge_2'] = full_reconstructions.rouge_scores.map(lambda x: x['rouge2'].fmeasure)
full_reconstructions['rouge_l'] = full_reconstructions.rouge_scores.map(lambda x: x['rougeL'].fmeasure)

In [None]:
print(f"ROUGE-1 score = {np.mean(full_reconstructions.rouge_1)}")
print(f"ROUGE-2 score = {np.mean(full_reconstructions.rouge_2)}")
print(f"ROUGE-L score = {np.mean(full_reconstructions.rouge_l)}")

### BLEU

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
full_reconstructions['bleu_scores'] = full_reconstructions.apply(lambda x: sentence_bleu([nltk.word_tokenize(x['original_text'].lower())], 
                                                                                         nltk.word_tokenize(x['reconstructed_text'].lower())), axis = 1)

In [None]:
print(np.mean(full_reconstructions.bleu_scores))