# Model Training Script

### Required Libraries

In [1]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from Llama3SP import LlamaForSequenceClassification as LLAMA3SP
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
    get_linear_schedule_with_warmup,
    XLNetTokenizer,
    BertTokenizer
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from huggingface_hub import login
from dotenv import load_dotenv
from tokenizers import Tokenizer

import torch.nn as nn
import os
import pandas as pd
import numpy as np
import torch
import time
import gc

### Log in to the Hugging Face Hub with your access token so the Llama 3.2 1B pre-trained model can be downloaded

In [2]:
# Load variables from a local .env file into the process environment, then
# log in to the Hugging Face Hub with the value of HUGGINGFACE_TOKEN.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")
login(token=token)

### Hyperparameters

In [3]:
# --- training hyperparameters -------------------------------------------
EPOCHS = 20
BATCH_SIZE_RATIO = 0.3  # within proj: 0.3 / cross proj: 0.4
SEQUENCE_LEN = 20  # token ids per example after truncation/padding
LEARNING_RATE = 5e-4
# tokenizers handled by tokenization(): wordpiece, sentencepiece, wordlevel, llama3
TOKENIZER = 'wordpiece'
# model families handled by main(): llama3 (llama3sp only in the within-project cell)
MODEL_NAME = 'llama3'
HF_MODEL_NAME = 'meta-llama/Llama-3.2-1B'

# --- device ---------------------------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    # start from an empty cache ...
    torch.cuda.empty_cache()
    # ... and cap this process at 80% of the GPU memory
    torch.cuda.set_per_process_memory_fraction(0.8)

# --- data -----------------------------------------------------------------
# directory holding the per-project CSV files
DATA_PATH = './sp_dataset/marked_data/'

### Configure dynamic memory allocation

In [4]:
# Environment switches for PyTorch's CUDA allocator and kernel launches.
# NOTE(review): settings like these are normally read when CUDA is first
# initialised; the hyperparameter cell already calls into torch.cuda before
# this point, so confirm they still take effect when set here.
os.environ.update({
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    'CUDA_LAUNCH_BLOCKING': '1',
})

### Helper Functions and Module-Level Variables

In [5]:
# Text report accumulated by train_eval_test() and written to disk by main()
OUTPUT = ''
# Model currently being trained; assigned in main() and read by tokenization()
MODEL = None
# When True, data_processing() derives BATCH_SIZE from the training-set size
DYNAMIC_BATCH = True
# Batch size passed to every DataLoader built by prepare_dataloader()
BATCH_SIZE = None
# True -> one project is split 60/20/20; False -> test data come from separate files
WITHIN_PROJECT = None
# Test-set errors collected by train_eval_test(); reset at the start of each call
MAE_RECORDS = []
MDAE_RECORDS = []


def optimize_memory():
    """Release memory that is no longer referenced.

    Does nothing when CUDA is unavailable. Otherwise the garbage collector
    runs first and PyTorch's CUDA cache is emptied afterwards, so memory
    freed by the collector is handed back within the same call.
    """
    if not torch.cuda.is_available():
        return
    # Collect before emptying the cache: tensors that only die during
    # collection would otherwise stay cached until the next call.
    gc.collect()
    torch.cuda.empty_cache()


def data_processing(file_pair):
    """Load, split and tokenize the datasets named in ``file_pair``.

    Args:
        file_pair: dict with ``'train'`` and ``'test'`` keys, each a list of
            CSV base names (without ``.csv``) found under ``DATA_PATH``.

    Returns:
        ``(file_pair, train_dataloader, val_dataloader, all_test_dataloader,
        test_file_names)``; the last two are parallel lists.

    Side effects:
        Sets the global ``BATCH_SIZE`` when ``DYNAMIC_BATCH`` is true.
    """
    global BATCH_SIZE

    optimize_memory()

    # DataFrame.append was removed in pandas 2.0: load every file first and
    # concatenate once.
    frames = [prepare_dataframe(DATA_PATH + name + '.csv') for name in file_pair['train']]
    if frames:
        train_data = pd.concat(frames, ignore_index=True)
    else:
        train_data = pd.DataFrame(columns=['text', 'label'])

    # data split
    if WITHIN_PROJECT:
        train_text, train_labels, val_text, val_labels, test_text, test_labels = within_project_split(train_data)
    else:
        train_text, train_labels, val_text, val_labels = train_val_split(train_data, 0.6)

    # define batch size dynamically based on training length (capped at 32);
    # never let it drop to 0, which DataLoader rejects
    if DYNAMIC_BATCH:
        BATCH_SIZE = max(1, min(int(len(train_text) * BATCH_SIZE_RATIO), 32))

    optimize_memory()

    def process_in_chunks(texts, chunk_size=1000):
        """Tokenize ``texts`` ``chunk_size`` rows at a time to bound peak memory."""
        all_tokens = {'input_ids': []}
        for start in range(0, len(texts), chunk_size):
            chunk = texts[start:start + chunk_size].tolist()
            all_tokens['input_ids'].extend(tokenization(chunk)['input_ids'])
            optimize_memory()
        return all_tokens

    def build_dataloader(texts, labels, sampler_type):
        """Tokenize ``texts`` and pair the ids with ``labels`` in a DataLoader."""
        input_ids = torch.tensor(process_in_chunks(texts)['input_ids'])
        # NOTE: labels are stored as integers, so fractional values are truncated
        targets = torch.tensor(labels.tolist()).type(torch.LongTensor)
        return prepare_dataloader(input_ids, targets, sampler_type=sampler_type)

    train_dataloader = build_dataloader(train_text, train_labels, 'random')
    val_dataloader = build_dataloader(val_text, val_labels, 'sequential')

    # prepare testing datasets
    all_test_dataloader = []
    test_file_names = []

    if WITHIN_PROJECT:
        # the test data are the tail of the training project itself
        all_test_dataloader.append(build_dataloader(test_text, test_labels, 'sequential'))
        test_file_names.append(file_pair['test'][0])
        return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names

    for test_file_name in file_pair['test']:
        test_data = prepare_dataframe(DATA_PATH + test_file_name + '.csv')
        all_test_dataloader.append(
            build_dataloader(test_data['text'], test_data['label'], 'sequential')
        )
        test_file_names.append(test_file_name)
        optimize_memory()
    print('cross project data processing!')
    return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names


def train_val_split(data, split_ratio):
    """Cut ``data`` into a leading training part and a trailing validation part.

    Args:
        data: DataFrame with ``text`` and ``label`` columns.
        split_ratio: fraction of rows (from the top) used for training.

    Returns:
        ``(train_text, train_labels, val_text, val_labels)``.
    """
    print('cross project split!')
    cut = int(len(data) * split_ratio)
    texts, labels = data['text'], data['label']
    return texts[:cut], labels[:cut], texts[cut:], labels[cut:]


# Tokenizers already constructed, so chunked tokenization does not rebuild them.
_TOKENIZER_CACHE = {}

# Message printed for each supported value of TOKENIZER.
_TOKENIZER_BANNERS = {
    'wordpiece': 'using wordpiece tokenizer!',
    'sentencepiece': 'using sentencepiece tokenizer!',
    'wordlevel': 'using wordlevel tokenizer!',
    'llama3': 'using pretrained llama3 tokenizer',
}

# Id used to right-pad word-level sequences (main() sets the same pad id on the model).
_WORDLEVEL_PAD_ID = 3


def _get_tokenizer(name):
    """Return the tokenizer selected by ``name``, building it on first use."""
    key = (name, HF_MODEL_NAME if name == 'llama3' else None)
    if key in _TOKENIZER_CACHE:
        return _TOKENIZER_CACHE[key]

    if name == 'wordpiece':
        tokenizer = BertTokenizer('all_tokenizers/sp_word_piece/vocab.txt')
    elif name == 'sentencepiece':
        tokenizer = XLNetTokenizer('all_tokenizers/sp_sentence_piece/spm_tokenizer.model', padding_side='right')
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
    elif name == 'wordlevel':
        tokenizer = Tokenizer.from_file('all_tokenizers/sp_word_level/wordlevel.json')
    else:  # 'llama3' -- the caller has already validated the name
        tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME, add_prefix_space=True)
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token

    _TOKENIZER_CACHE[key] = tokenizer
    return tokenizer


def tokenization(text_list):
    """Encode ``text_list`` into sequences of exactly ``SEQUENCE_LEN`` token ids.

    The tokenizer is chosen by the global ``TOKENIZER``. For every tokenizer
    except ``'wordlevel'`` the global ``MODEL``'s config is updated as well
    (pad token id, ``use_cache`` and ``pretraining_tp``), so ``MODEL`` must be
    set before this is called.

    Args:
        text_list: list of strings to encode.

    Returns:
        A mapping with an ``'input_ids'`` entry holding one list of ids per
        input string.

    Raises:
        ValueError: if ``TOKENIZER`` names an unsupported tokenizer.
    """
    if TOKENIZER not in _TOKENIZER_BANNERS:
        raise ValueError(
            'unsupported TOKENIZER %r; expected one of %s'
            % (TOKENIZER, sorted(_TOKENIZER_BANNERS))
        )
    print(_TOKENIZER_BANNERS[TOKENIZER])
    tokenizer = _get_tokenizer(TOKENIZER)

    if TOKENIZER == 'wordlevel':
        # this tokenizer is not asked to pad or truncate, so do both by hand
        input_ids = []
        for sentence in text_list:
            ids = tokenizer.encode(sentence).ids[:SEQUENCE_LEN]
            ids += [_WORDLEVEL_PAD_ID] * (SEQUENCE_LEN - len(ids))
            input_ids.append(ids)
        return {'input_ids': input_ids}

    # update some model configs
    # must use .cache = False as below or it crashes from my experience
    MODEL.config.pad_token_id = tokenizer.pad_token_id
    MODEL.config.use_cache = False
    MODEL.config.pretraining_tp = 1
    return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    """Read a CSV and reduce it to the two columns used for training.

    Missing cells are replaced by a single space, then ``title`` is exposed
    as ``text`` and ``storypoint`` as ``label``.

    Args:
        file_name: path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with ``text`` and ``label`` columns.
    """
    # blank out missing cells so no null reaches the tokenizer
    raw = pd.read_csv(file_name).fillna(' ')
    columns = {'text': raw['title'].tolist(), 'label': raw['storypoint']}
    return pd.DataFrame(data=columns)


def prepare_dataloader(seq, y, sampler_type):
    """Wrap token ids and labels in a ``DataLoader`` batched by ``BATCH_SIZE``.

    Args:
        seq: tensor of input ids, one row per example.
        y: tensor of labels aligned with ``seq``.
        sampler_type: ``'random'`` (shuffled) or ``'sequential'`` (in order).

    Returns:
        A ``DataLoader`` over ``TensorDataset(seq, y)``.

    Raises:
        ValueError: if ``sampler_type`` is neither ``'random'`` nor
            ``'sequential'``.
    """
    samplers = {'random': RandomSampler, 'sequential': SequentialSampler}
    # validate first: an unknown type used to surface as an UnboundLocalError
    if sampler_type not in samplers:
        raise ValueError(
            "sampler_type must be 'random' or 'sequential', got %r" % (sampler_type,)
        )
    tensor_dataset = TensorDataset(seq, y)
    sampler = samplers[sampler_type](tensor_dataset)
    return DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)


def within_project_split(data):
    """Split one project's rows, in order, into 60% train / 20% val / 20% test.

    Args:
        data: DataFrame with ``text`` and ``label`` columns.

    Returns:
        ``(train_text, train_labels, val_text, val_labels, test_text,
        test_labels)``.
    """
    print('within project split!')
    total = len(data)
    val_start, test_start = int(total * 0.6), int(total * 0.8)
    texts, labels = data['text'], data['label']
    return (
        texts[:val_start], labels[:val_start],
        texts[val_start:test_start], labels[val_start:test_start],
        texts[test_start:], labels[test_start:],
    )


def train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, model, test_file_names):
    global LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE
    optimizer = torch.optim.AdamW(MODEL.parameters(), lr=LEARNING_RATE)    
    # total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print("Start training for ", file_pair, ".....")
    start_time = time.time()
    
    # tensorboard writer
    writer_path = 'tb/' + str(file_pair['train'][0]) + '_' + str(file_pair['test'][0])
    writer = SummaryWriter(writer_path)
    
    # vars for model selection
    min_eval_loss_epoch = [10000, 0]
    
    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        optimize_memory()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):            
            b_input_ids = batch[0].to(torch.long).to(DEVICE)
            b_labels = batch[1].to(torch.float).to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids, 
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()
            loss.backward() 
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids, b_labels, result, loss, logits
            optimize_memory()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        writer.add_scalar('loss/train', avg_train_loss, e)
        # clean memory
        del avg_train_loss, total_train_loss
        optimize_memory()
        
        time_records.append(time.time() - start_time)
        
        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        for batch in val_dataloader:            
            b_input_ids = batch[0].to(torch.long).to(DEVICE)
            b_labels = batch[1].to(torch.float).to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids, 
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_eval_loss += loss.item()  
            # clean memory
            del b_input_ids, b_labels, batch, result, loss, logits
            optimize_memory()
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))
        
        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e

        optimize_memory()
        
        writer.add_scalar('loss/eval', avg_eval_loss, e)
        # clean memory
        del avg_eval_loss, total_eval_loss
        optimize_memory()
        # save model state to dict
        torch.save(model.state_dict(), './models/' + 'epo_' + str(e))
        
        print("===============================")
        
        # testing on holdout data
        index = 0
        for test_dataloader in all_test_dataloader:
            test_file_name = test_file_names[index]
            index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_labels = batch
                with torch.no_grad():
                    logits = model(b_input_ids)
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)
            # calculate errors
            distance_records = []
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    distance_records.append(distance)

            ## MAE = mean value of all absolute errors (stored in distance_records)
            MAE = np.mean(np.array(distance_records)) 
            ## MdAE = median value of all absolute errors (stored in distance_records)
            MdAE = np.median(np.array(distance_records)) 

            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)
            
            global OUTPUT
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
    writer.flush()
    writer.close()
    
    # select model
    os.rename('models/epo_' + str(min_eval_loss_epoch[1]), 
              'models/' + str(file_pair['train'][0]) + '_' 
              + str(file_pair['test'][0]) + '_epo_' + str(min_eval_loss_epoch[1]))
    
    # del unwanted models
    for i in range(20):
        try:
            os.remove("models/epo_" + str(i))
        except:
            continue
            
    OUTPUT += 'MAE: ' + str(MAE_RECORDS[min_eval_loss_epoch[1]]) \
                + '  MdAE: ' + str(MDAE_RECORDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'Epochs: ' + str(min_eval_loss_epoch[1]) +'\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE)
    print('all done for one project')

In [None]:
# Within-project experiments: every project is trained and tested on its own data.
WITHIN_PROJECT = True
BATCH_SIZE_RATIO = 0.3

TRAIN_TEST_FILE_PAIRS = [
    {'train': [project], 'test': [project]}
    for project in (
        'appceleratorstudio',
        'aptanastudio',
        'bamboo',
        'clover',
        'datamanagement',
        'duracloud',
        'jirasoftware',
        'mesos',
        'moodle',
        'mule',
        'mulestudio',
        'springxd',
        'talenddataquality',
        'talendesb',
        'titanium',
        'usergrid',
    )
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME, HF_MODEL_NAME

    # Load LLama model with 4 bit quantization as specified in bits and bytes and prepare model for peft training
    # Quantization Config (for QLORA)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16, # Changed to float16 for lower memory usage
    )
    # Lora Config
    lora_config = LoraConfig(
        r=8, # Reduced from 16 to 8 for lower memory usage
        lora_alpha=16,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.1,
        bias='none',
        task_type='SEQ_CLS'
    )

    for file in TRAIN_TEST_FILE_PAIRS:
        optimize_memory()

        if MODEL_NAME == 'llama3':
            # Config for LLama3 model
            config = AutoConfig.from_pretrained(HF_MODEL_NAME, num_labels=1)

            MODEL = AutoModelForSequenceClassification.from_pretrained(
                HF_MODEL_NAME,
                quantization_config=quantization_config,
                # num_labels=1, # For regression
                torch_dtype=torch.float16,
                device_map='auto',
                low_cpu_mem_usage=True,
                config=config,
            )
            # prepare_model_for_kbit_training() function to preprocess the quantized model for training.
            MODEL = prepare_model_for_kbit_training(MODEL)
            # get_peft_model prepares a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with get_peft_model
            MODEL = get_peft_model(MODEL, lora_config)

            # additional memory optimizations
            MODEL.gradient_checkpointing_enable()  # Reduce memory usage during training
            MODEL.enable_input_require_grads()
        
        elif MODEL_NAME == 'llama3sp':
            # First load the base model
            MODEL = LLAMA3SP.from_pretrained(
                HF_MODEL_NAME,
                quantization_config=quantization_config,
                num_labels=1, # For regression
                torch_dtype=torch.float16,
                device_map='auto',
                low_cpu_mem_usage=True,
            )

            # Prepare the model for training with LoRA
            MODEL = prepare_model_for_kbit_training(MODEL)
            # get_peft_model prepares a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with get_peft_model
            MODEL = get_peft_model(MODEL, lora_config)

            # Memory optimizations
            MODEL.gradient_checkpointing_enable()
            MODEL.enable_input_require_grads()

        if TOKENIZER == 'wordlevel':
            MODEL.config.pad_token_id = 3
        elif TOKENIZER == 'sentencepiece':
            MODEL.config.pad_token_id = 0
        elif TOKENIZER == 'wordpiece':
            MODEL.config.pad_token_id = 0
        
        MODEL.cuda()

        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        optimize_memory()
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

### Cross Project Training Script - Within Repository

In [None]:
# Cross-project experiments (within repository): train on one project, test on another.
WITHIN_PROJECT = False
BATCH_SIZE_RATIO = 0.4

TRAIN_TEST_FILE_PAIRS = [
    {'train': [source], 'test': [target]}
    for source, target in (
        ('mesos', 'usergrid'),
        ('usergrid', 'mesos'),
        ('appceleratorstudio', 'aptanastudio'),
        ('appceleratorstudio', 'titanium'),
        ('titanium', 'appceleratorstudio'),
        ('aptanastudio', 'titanium'),
        ('mule', 'mulestudio'),
        ('mulestudio', 'mule'),
    )
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME, HF_MODEL_NAME
    
    # Load LLama model with 4 bit quantization as specified in bits and bytes and prepare model for peft training
    # Quantization Config (for QLORA)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16, # Changed to float16 for lower memory usage
    )
    # Lora Config
    lora_config = LoraConfig(
        r=8, # Reduced from 16 to 8 for lower memory usage
        lora_alpha=16,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.1,
        bias='none',
        task_type='SEQ_CLS'
    )

    for file in TRAIN_TEST_FILE_PAIRS:
        optimize_memory()

        if MODEL_NAME == 'llama3':
            # Config for LLama3 model
            config = AutoConfig.from_pretrained(HF_MODEL_NAME, num_labels=1)

            MODEL = AutoModelForSequenceClassification.from_pretrained(
                HF_MODEL_NAME,
                quantization_config=quantization_config,
                # num_labels=1,  # Para regresión
                torch_dtype=torch.float16,
                device_map='auto',
                low_cpu_mem_usage=True,
                config=config,
            )
            # prepare_model_for_kbit_training() function to preprocess the quantized model for training.
            MODEL = prepare_model_for_kbit_training(MODEL)
            # get_peft_model prepares a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with get_peft_model
            MODEL = get_peft_model(MODEL, lora_config)

            # additional memory optimizations
            MODEL.gradient_checkpointing_enable()  # Reduce memory usage during training
            MODEL.enable_input_require_grads()

        if TOKENIZER == 'wordlevel':
            MODEL.config.pad_token_id = 3
        elif TOKENIZER == 'sentencepiece':
            MODEL.config.pad_token_id = 0
        elif TOKENIZER == 'wordpiece':
            MODEL.config.pad_token_id = 0

        MODEL.cuda()

        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        optimize_memory()
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""

                
if __name__ == "__main__":
    main()

### Cross Project Training Script - Cross Repository

In [None]:
# Cross-project experiments (cross repository): train on one project, test on another.
WITHIN_PROJECT = False
BATCH_SIZE_RATIO = 0.4

TRAIN_TEST_FILE_PAIRS = [
    {'train': [source], 'test': [target]}
    for source, target in (
        ('clover', 'usergrid'),
        ('talendesb', 'mesos'),
        ('talenddataquality', 'aptanastudio'),
        ('mule', 'titanium'),
        ('talenddataquality', 'appceleratorstudio'),
        ('mulestudio', 'titanium'),
        ('appceleratorstudio', 'mulestudio'),
        ('appceleratorstudio', 'mule'),
    )
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME, HF_MODEL_NAME
    
    # Load LLama model with 4 bit quantization as specified in bits and bytes and prepare model for peft training
    # Quantization Config (for QLORA)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,  # Cambiado a float16 para menor uso de memoria
    )
    # Lora Config
    lora_config = LoraConfig(
        r=8, # Reduced from 16 to 8 for lower memory usage
        lora_alpha=16,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.1,
        bias='none',
        task_type='SEQ_CLS'
    )

    for file in TRAIN_TEST_FILE_PAIRS:
        optimize_memory()

        if MODEL_NAME == 'llama3':
            # Config for LLama3 model
            config = AutoConfig.from_pretrained(HF_MODEL_NAME, num_labels=1)

            MODEL = AutoModelForSequenceClassification.from_pretrained(
                HF_MODEL_NAME,
                quantization_config=quantization_config,
                # num_labels=1,  # For regression
                torch_dtype=torch.float16,
                device_map='auto',
                low_cpu_mem_usage=True,
                config=config,
            )
            # prepare_model_for_kbit_training() function to preprocess the quantized model for training.
            MODEL = prepare_model_for_kbit_training(MODEL)
            # get_peft_model prepares a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with get_peft_model
            MODEL = get_peft_model(MODEL, lora_config)

            # additional memory optimizations
            MODEL.gradient_checkpointing_enable()  # Reduce memory usage during training
            MODEL.enable_input_require_grads()

        if TOKENIZER == 'wordlevel':
            MODEL.config.pad_token_id = 3
        elif TOKENIZER == 'sentencepiece':
            MODEL.config.pad_token_id = 0
        elif TOKENIZER == 'wordpiece':
            MODEL.config.pad_token_id = 0

        MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL 
        optimize_memory()
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""

                
if __name__ == "__main__":
    main()