# Installing Dependencies

# Import Libraries

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Load GPT-2 in 4-bit mode
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    load_in_4bit=True,
    device_map="auto"  # Automatically places the model on the available GPUs
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


# Import Model

In [4]:
# Prepare the model for 4-bit training (adds LoRA and disables weight decay on some params)
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,               # Rank of the low-rank matrix -> A (input_dim, r) * B (r, output_dim)
    lora_alpha=32,     # LoRA scaling factor
    target_modules=["c_attn", "q_proj", "v_proj"],  # Layers to apply LoRA to (GPT-2 uses 'c_attn')
    lora_dropout=0.1,  # Dropout probability for LoRA layers
    bias="none",       # Bias configuration: can be "none", "all", or "lora_only"
    task_type="CAUSAL_LM"  # Task type for causal language modeling
)

# Add LoRA to the model
model = get_peft_model(model, lora_config)

# Importing and Processing Dataset

In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./gpt2_qlora_python',
    overwrite_output_dir=True,
    num_train_epochs=3,  # Increase as needed; LoRA trains faster
    per_device_train_batch_size=4,  # You can increase this due to reduced memory usage
    gradient_accumulation_steps=16,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./logs',
    save_total_limit=2,
    learning_rate=2e-4,  # Often higher LR works for QLoRA
    fp16=True,  # Enable mixed precision
    optim="paged_adamw_32bit",  # Optimizer for quantized training
    lr_scheduler_type="cosine",
)




In [6]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import AutoTokenizer

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and set the padding token
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace with your model name
tokenizer.pad_token = tokenizer.eos_token

# Load your dataset from JSON
df_glaive = pd.read_json("hf://datasets/glaiveai/glaive-code-assistant/c9bc9129-eba0-4b10-8292-4ae70fc7fa0d.json")

# Convert the DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df_glaive)

# Split the dataset into training and validation sets (90% train, 10% validation)
train_data = dataset.select([i for i in range(len(dataset)) if i % 10 != 0])  # 90% for training
val_data = dataset.select([i for i in range(len(dataset)) if i % 10 == 0])  # 10% for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['question'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['answer'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Print a sample for debugging
print(train_data[0])
print(val_data[0])

Map:   0%|          | 0/122498 [00:00<?, ? examples/s]

Map:   0%|          | 0/13611 [00:00<?, ? examples/s]

{'question': "How can I install Python 3 on an AWS EC2 instance? I tried using the command `sudo yum install python3`, but I received an error message saying `No package python3 available.`. I searched online but didn't find a solution. Do I need to download and install it manually?", 'answer': "To install Python 3 on an AWS EC2 instance, you can use the Amazon Linux Extras Library. This library is a curated set of software that Amazon provides for the Amazon Linux 2 platform. It includes newer versions of software, like Python 3, that are not included in the default Amazon Linux 2 repositories. Here is a step by step process on how to do it:\n\n1. First, update your instance with the following command:\n\n```bash\nsudo yum update -y\n```\n\n2. Next, list available packages in the Amazon Linux Extras repository by typing:\n\n```bash\nsudo amazon-linux-extras list\n```\n\n3. You should see python3.8 available in the list. To install it, use the following command:\n\n```bash\nsudo amazon

# Train Model

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

trainer.train()

  0%|          | 0/5742 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


{'loss': 5.8871, 'grad_norm': 1.0586110353469849, 'learning_rate': 0.00019985036436402656, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.76005744934082, 'eval_runtime': 88.7801, 'eval_samples_per_second': 153.311, 'eval_steps_per_second': 19.171, 'epoch': 0.05}
{'loss': 4.6833, 'grad_norm': 1.3926193714141846, 'learning_rate': 0.00019940190527257726, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.530312538146973, 'eval_runtime': 88.7131, 'eval_samples_per_second': 153.427, 'eval_steps_per_second': 19.185, 'epoch': 0.1}
{'loss': 4.5375, 'grad_norm': 1.4135888814926147, 'learning_rate': 0.00019865596483487923, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.456700801849365, 'eval_runtime': 88.6586, 'eval_samples_per_second': 153.521, 'eval_steps_per_second': 19.197, 'epoch': 0.16}
{'loss': 4.4907, 'grad_norm': 1.0121597051620483, 'learning_rate': 0.00019761477543636836, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.418543815612793, 'eval_runtime': 88.6788, 'eval_samples_per_second': 153.486, 'eval_steps_per_second': 19.193, 'epoch': 0.21}
{'loss': 4.4606, 'grad_norm': 2.107551097869873, 'learning_rate': 0.00019628145305780097, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.391010284423828, 'eval_runtime': 88.6452, 'eval_samples_per_second': 153.545, 'eval_steps_per_second': 19.2, 'epoch': 0.26}


  return fn(*args, **kwargs)


{'loss': 4.4271, 'grad_norm': 1.0199593305587769, 'learning_rate': 0.00019465998795001853, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.374568939208984, 'eval_runtime': 88.6444, 'eval_samples_per_second': 153.546, 'eval_steps_per_second': 19.2, 'epoch': 0.31}
{'loss': 4.4198, 'grad_norm': 1.2131471633911133, 'learning_rate': 0.00019275523269227324, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.353372097015381, 'eval_runtime': 88.7034, 'eval_samples_per_second': 153.444, 'eval_steps_per_second': 19.188, 'epoch': 0.37}
{'loss': 4.3994, 'grad_norm': 2.9784295558929443, 'learning_rate': 0.0001905728876698525, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.351104259490967, 'eval_runtime': 88.5417, 'eval_samples_per_second': 153.724, 'eval_steps_per_second': 19.223, 'epoch': 0.42}
{'loss': 4.4222, 'grad_norm': 1.3362594842910767, 'learning_rate': 0.0001881194840144631, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.328802585601807, 'eval_runtime': 88.6594, 'eval_samples_per_second': 153.52, 'eval_steps_per_second': 19.197, 'epoch': 0.47}
{'loss': 4.4041, 'grad_norm': 2.4823038578033447, 'learning_rate': 0.00018540236405843058, 'epoch': 0.52}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.326116561889648, 'eval_runtime': 88.6583, 'eval_samples_per_second': 153.522, 'eval_steps_per_second': 19.197, 'epoch': 0.52}


  return fn(*args, **kwargs)


{'loss': 4.3774, 'grad_norm': 4.1523895263671875, 'learning_rate': 0.00018242965936120768, 'epoch': 0.57}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.3207879066467285, 'eval_runtime': 88.7325, 'eval_samples_per_second': 153.394, 'eval_steps_per_second': 19.181, 'epoch': 0.57}
{'loss': 4.34, 'grad_norm': 2.102656841278076, 'learning_rate': 0.00017921026637395294, 'epoch': 0.63}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.301670074462891, 'eval_runtime': 88.5477, 'eval_samples_per_second': 153.714, 'eval_steps_per_second': 19.221, 'epoch': 0.63}
{'loss': 4.4102, 'grad_norm': 0.6400576829910278, 'learning_rate': 0.00017575381981500837, 'epoch': 0.68}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.311867713928223, 'eval_runtime': 88.6976, 'eval_samples_per_second': 153.454, 'eval_steps_per_second': 19.189, 'epoch': 0.68}
{'loss': 4.3739, 'grad_norm': 1.2383931875228882, 'learning_rate': 0.00017207066383595503, 'epoch': 0.73}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.286099433898926, 'eval_runtime': 88.7065, 'eval_samples_per_second': 153.439, 'eval_steps_per_second': 19.187, 'epoch': 0.73}
{'loss': 4.3528, 'grad_norm': 0.8951903581619263, 'learning_rate': 0.00016817182106453927, 'epoch': 0.78}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.286120891571045, 'eval_runtime': 88.6292, 'eval_samples_per_second': 153.572, 'eval_steps_per_second': 19.204, 'epoch': 0.78}


  return fn(*args, **kwargs)


{'loss': 4.3369, 'grad_norm': 0.8129469156265259, 'learning_rate': 0.00016406895961711428, 'epoch': 0.84}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.285741329193115, 'eval_runtime': 88.8597, 'eval_samples_per_second': 153.174, 'eval_steps_per_second': 19.154, 'epoch': 0.84}
{'loss': 4.3465, 'grad_norm': 2.870114803314209, 'learning_rate': 0.00015977435817931998, 'epoch': 0.89}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.278467178344727, 'eval_runtime': 88.7873, 'eval_samples_per_second': 153.299, 'eval_steps_per_second': 19.169, 'epoch': 0.89}
{'loss': 4.3426, 'grad_norm': 1.2393348217010498, 'learning_rate': 0.00015530086925950434, 'epoch': 0.94}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.264331340789795, 'eval_runtime': 88.7814, 'eval_samples_per_second': 153.309, 'eval_steps_per_second': 19.171, 'epoch': 0.94}
{'loss': 4.348, 'grad_norm': 0.7168881893157959, 'learning_rate': 0.00015066188072485807, 'epoch': 0.99}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.258229732513428, 'eval_runtime': 88.6966, 'eval_samples_per_second': 153.456, 'eval_steps_per_second': 19.189, 'epoch': 0.99}
{'loss': 4.3193, 'grad_norm': 1.7671858072280884, 'learning_rate': 0.0001458712757353743, 'epoch': 1.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.255733966827393, 'eval_runtime': 88.6669, 'eval_samples_per_second': 153.507, 'eval_steps_per_second': 19.195, 'epoch': 1.04}


  return fn(*args, **kwargs)


{'loss': 4.3072, 'grad_norm': 0.8759745955467224, 'learning_rate': 0.000140943391195539, 'epoch': 1.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.259997844696045, 'eval_runtime': 88.6288, 'eval_samples_per_second': 153.573, 'eval_steps_per_second': 19.204, 'epoch': 1.1}
{'loss': 4.3285, 'grad_norm': 2.4710357189178467, 'learning_rate': 0.0001358929748480946, 'epoch': 1.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.269329071044922, 'eval_runtime': 89.2966, 'eval_samples_per_second': 152.425, 'eval_steps_per_second': 19.06, 'epoch': 1.15}
{'loss': 4.3374, 'grad_norm': 1.4372382164001465, 'learning_rate': 0.00013073514113828272, 'epoch': 1.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.251165866851807, 'eval_runtime': 88.6595, 'eval_samples_per_second': 153.52, 'eval_steps_per_second': 19.197, 'epoch': 1.2}
{'loss': 4.2831, 'grad_norm': 0.6965006589889526, 'learning_rate': 0.00012548532598065165, 'epoch': 1.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.24522066116333, 'eval_runtime': 88.7634, 'eval_samples_per_second': 153.34, 'eval_steps_per_second': 19.175, 'epoch': 1.25}
{'loss': 4.3126, 'grad_norm': 2.960050344467163, 'learning_rate': 0.00012015924056379844, 'epoch': 1.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.270421028137207, 'eval_runtime': 88.6614, 'eval_samples_per_second': 153.517, 'eval_steps_per_second': 19.197, 'epoch': 1.31}


  return fn(*args, **kwargs)


{'loss': 4.315, 'grad_norm': 0.8028329610824585, 'learning_rate': 0.00011477282433129508, 'epoch': 1.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.241755485534668, 'eval_runtime': 88.6569, 'eval_samples_per_second': 153.524, 'eval_steps_per_second': 19.198, 'epoch': 1.36}
{'loss': 4.3137, 'grad_norm': 0.8494104146957397, 'learning_rate': 0.00010934219727951301, 'epoch': 1.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.236919403076172, 'eval_runtime': 88.5841, 'eval_samples_per_second': 153.651, 'eval_steps_per_second': 19.213, 'epoch': 1.41}
{'loss': 4.3023, 'grad_norm': 0.7408472299575806, 'learning_rate': 0.00010388361171510474, 'epoch': 1.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.235468864440918, 'eval_runtime': 88.3551, 'eval_samples_per_second': 154.049, 'eval_steps_per_second': 19.263, 'epoch': 1.46}
{'loss': 4.2881, 'grad_norm': 1.2766470909118652, 'learning_rate': 9.84134036165192e-05, 'epoch': 1.52}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.24033260345459, 'eval_runtime': 88.6166, 'eval_samples_per_second': 153.594, 'eval_steps_per_second': 19.206, 'epoch': 1.52}
{'loss': 4.3218, 'grad_norm': 0.8968895673751831, 'learning_rate': 9.294794374511118e-05, 'epoch': 1.57}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.228937149047852, 'eval_runtime': 88.3627, 'eval_samples_per_second': 154.036, 'eval_steps_per_second': 19.262, 'epoch': 1.57}


  return fn(*args, **kwargs)


{'loss': 4.2826, 'grad_norm': 2.2062573432922363, 'learning_rate': 8.750358865215555e-05, 'epoch': 1.62}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.230471611022949, 'eval_runtime': 88.71, 'eval_samples_per_second': 153.433, 'eval_steps_per_second': 19.186, 'epoch': 1.62}
{'loss': 4.3065, 'grad_norm': 0.984140932559967, 'learning_rate': 8.209663172838837e-05, 'epoch': 1.67}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.237263202667236, 'eval_runtime': 88.6504, 'eval_samples_per_second': 153.536, 'eval_steps_per_second': 19.199, 'epoch': 1.67}
{'loss': 4.3014, 'grad_norm': 0.8321128487586975, 'learning_rate': 7.674325444256899e-05, 'epoch': 1.72}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.22365665435791, 'eval_runtime': 88.5436, 'eval_samples_per_second': 153.721, 'eval_steps_per_second': 19.222, 'epoch': 1.72}
{'loss': 4.3175, 'grad_norm': 0.7514380216598511, 'learning_rate': 7.145947791499274e-05, 'epoch': 1.78}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.2230448722839355, 'eval_runtime': 88.6971, 'eval_samples_per_second': 153.455, 'eval_steps_per_second': 19.189, 'epoch': 1.78}
{'loss': 4.2812, 'grad_norm': 1.6697802543640137, 'learning_rate': 6.626111497088062e-05, 'epoch': 1.83}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.2228684425354, 'eval_runtime': 88.6476, 'eval_samples_per_second': 153.541, 'eval_steps_per_second': 19.2, 'epoch': 1.83}


  return fn(*args, **kwargs)


{'loss': 4.274, 'grad_norm': 1.0112805366516113, 'learning_rate': 6.116372281713581e-05, 'epoch': 1.88}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.223598003387451, 'eval_runtime': 88.5769, 'eval_samples_per_second': 153.663, 'eval_steps_per_second': 19.215, 'epoch': 1.88}
{'loss': 4.2514, 'grad_norm': 1.1199443340301514, 'learning_rate': 5.618255648409302e-05, 'epoch': 1.93}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.220157146453857, 'eval_runtime': 88.4683, 'eval_samples_per_second': 153.852, 'eval_steps_per_second': 19.239, 'epoch': 1.93}
{'loss': 4.2827, 'grad_norm': 2.5401065349578857, 'learning_rate': 5.1332523171594916e-05, 'epoch': 1.99}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.226605415344238, 'eval_runtime': 88.4132, 'eval_samples_per_second': 153.948, 'eval_steps_per_second': 19.251, 'epoch': 1.99}
{'loss': 4.3004, 'grad_norm': 1.1190683841705322, 'learning_rate': 4.662813763602562e-05, 'epoch': 2.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.221111297607422, 'eval_runtime': 88.4874, 'eval_samples_per_second': 153.818, 'eval_steps_per_second': 19.234, 'epoch': 2.04}
{'loss': 4.2925, 'grad_norm': 1.1788606643676758, 'learning_rate': 4.208347875181477e-05, 'epoch': 2.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.221757888793945, 'eval_runtime': 88.7334, 'eval_samples_per_second': 153.392, 'eval_steps_per_second': 19.181, 'epoch': 2.09}


  return fn(*args, **kwargs)


{'loss': 4.2885, 'grad_norm': 0.9785155653953552, 'learning_rate': 3.7712147377410745e-05, 'epoch': 2.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.21707010269165, 'eval_runtime': 88.4867, 'eval_samples_per_second': 153.82, 'eval_steps_per_second': 19.235, 'epoch': 2.14}
{'loss': 4.2999, 'grad_norm': 1.3226473331451416, 'learning_rate': 3.352722565181878e-05, 'epoch': 2.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.214890480041504, 'eval_runtime': 88.5655, 'eval_samples_per_second': 153.683, 'eval_steps_per_second': 19.217, 'epoch': 2.19}
{'loss': 4.2562, 'grad_norm': 1.0766068696975708, 'learning_rate': 2.9541237843517034e-05, 'epoch': 2.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.214541912078857, 'eval_runtime': 88.6265, 'eval_samples_per_second': 153.577, 'eval_steps_per_second': 19.204, 'epoch': 2.25}
{'loss': 4.2674, 'grad_norm': 0.7264559864997864, 'learning_rate': 2.576611286891901e-05, 'epoch': 2.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.21327018737793, 'eval_runtime': 88.4813, 'eval_samples_per_second': 153.829, 'eval_steps_per_second': 19.236, 'epoch': 2.3}
{'loss': 4.2795, 'grad_norm': 1.0880624055862427, 'learning_rate': 2.2213148592553845e-05, 'epoch': 2.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.217799663543701, 'eval_runtime': 88.5325, 'eval_samples_per_second': 153.74, 'eval_steps_per_second': 19.225, 'epoch': 2.35}


  return fn(*args, **kwargs)


{'loss': 4.2443, 'grad_norm': 0.772325336933136, 'learning_rate': 1.8892978015803165e-05, 'epoch': 2.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.210931777954102, 'eval_runtime': 88.6402, 'eval_samples_per_second': 153.553, 'eval_steps_per_second': 19.201, 'epoch': 2.4}
{'loss': 4.2788, 'grad_norm': 1.413451075553894, 'learning_rate': 1.581553745538288e-05, 'epoch': 2.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.212082386016846, 'eval_runtime': 88.5502, 'eval_samples_per_second': 153.709, 'eval_steps_per_second': 19.221, 'epoch': 2.46}
{'loss': 4.2695, 'grad_norm': 1.8602592945098877, 'learning_rate': 1.2990036806801541e-05, 'epoch': 2.51}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.2118821144104, 'eval_runtime': 88.6287, 'eval_samples_per_second': 153.573, 'eval_steps_per_second': 19.204, 'epoch': 2.51}
{'loss': 4.2547, 'grad_norm': 1.111720085144043, 'learning_rate': 1.0424931981789022e-05, 'epoch': 2.56}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.21270227432251, 'eval_runtime': 88.5293, 'eval_samples_per_second': 153.746, 'eval_steps_per_second': 19.225, 'epoch': 2.56}
{'loss': 4.2875, 'grad_norm': 1.5399831533432007, 'learning_rate': 8.12789960218192e-06, 'epoch': 2.61}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.209624290466309, 'eval_runtime': 88.4616, 'eval_samples_per_second': 153.863, 'eval_steps_per_second': 19.24, 'epoch': 2.61}


  return fn(*args, **kwargs)


{'loss': 4.2691, 'grad_norm': 0.9465157985687256, 'learning_rate': 6.105814025999701e-06, 'epoch': 2.66}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.210157871246338, 'eval_runtime': 88.628, 'eval_samples_per_second': 153.575, 'eval_steps_per_second': 19.204, 'epoch': 2.66}
{'loss': 4.2575, 'grad_norm': 0.8795759081840515, 'learning_rate': 4.364726774466077e-06, 'epoch': 2.72}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.211297035217285, 'eval_runtime': 88.6404, 'eval_samples_per_second': 153.553, 'eval_steps_per_second': 19.201, 'epoch': 2.72}
{'loss': 4.2634, 'grad_norm': 0.7972840070724487, 'learning_rate': 2.9098484215444256e-06, 'epoch': 2.77}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.212738037109375, 'eval_runtime': 88.6015, 'eval_samples_per_second': 153.62, 'eval_steps_per_second': 19.21, 'epoch': 2.77}
{'loss': 4.2725, 'grad_norm': 0.9302504062652588, 'learning_rate': 1.7455330001868054e-06, 'epoch': 2.82}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.21030330657959, 'eval_runtime': 88.6653, 'eval_samples_per_second': 153.51, 'eval_steps_per_second': 19.196, 'epoch': 2.82}
{'loss': 4.2779, 'grad_norm': 0.964973509311676, 'learning_rate': 8.752649719641848e-07, 'epoch': 2.87}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.209972858428955, 'eval_runtime': 88.5974, 'eval_samples_per_second': 153.628, 'eval_steps_per_second': 19.211, 'epoch': 2.87}


  return fn(*args, **kwargs)


{'loss': 4.2827, 'grad_norm': 2.7668161392211914, 'learning_rate': 3.016487990739725e-07, 'epoch': 2.93}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.209779739379883, 'eval_runtime': 88.5965, 'eval_samples_per_second': 153.629, 'eval_steps_per_second': 19.211, 'epoch': 2.93}
{'loss': 4.2579, 'grad_norm': 0.8054270148277283, 'learning_rate': 2.6401149932875346e-08, 'epoch': 2.98}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.209802627563477, 'eval_runtime': 88.6813, 'eval_samples_per_second': 153.482, 'eval_steps_per_second': 19.192, 'epoch': 2.98}
{'train_runtime': 11854.7474, 'train_samples_per_second': 31.0, 'train_steps_per_second': 0.484, 'train_loss': 4.3554895388378325, 'epoch': 3.0}


TrainOutput(global_step=5742, training_loss=4.3554895388378325, metrics={'train_runtime': 11854.7474, 'train_samples_per_second': 31.0, 'train_steps_per_second': 0.484, 'total_flos': 9.635357044349338e+16, 'train_loss': 4.3554895388378325, 'epoch': 2.9999020408163264})

In [8]:
model.save_pretrained('./gpt2_qlora_finetuned_python')
tokenizer.save_pretrained('./gpt2_qlora_finetuned_python')

('./gpt2_qlora_finetuned_python\\tokenizer_config.json',
 './gpt2_qlora_finetuned_python\\special_tokens_map.json',
 './gpt2_qlora_finetuned_python\\vocab.json',
 './gpt2_qlora_finetuned_python\\merges.txt',
 './gpt2_qlora_finetuned_python\\added_tokens.json',
 './gpt2_qlora_finetuned_python\\tokenizer.json')

In [9]:
# Example input for pre-trained model
input_text = """Hi how are you?"""

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate output
model.to(device)
pre_training_output = model.generate(
    input_ids,
    max_length=300,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
)

# Decode and print the result
decoded_output = tokenizer.decode(pre_training_output[0], skip_special_tokens=True)
print("Output before training:")
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)


Output before training:
Hi how are you? can can the you can the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
 the the the the the the the the the the the the the the the the the the

 the the the

 the
 the the the the the

 the the

 the the
 the the the the


 the
 the
 the
 the






 the

 the




 the


 the
 the the

 the the



 the
 the















 the

 the the






 the



 the

 the






 the





In [10]:
# Example input for pre-trained model
input_text = """what is the mistake in this python code here? a = [1,2,3,4,5,6)"""

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate output
model.to(device)
pre_training_output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
)

# Decode and print the result
decoded_output = tokenizer.decode(pre_training_output[0], skip_special_tokens=True)
print("Output before training:")
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output before training:
what is the mistake in this python code here? a = [1,2,3,4,5,6)
 error, error, error, error error, error, error error error error error
 error error error
 error error error error error error error error error
 error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error error


# Save Model

# Running Inference

In [11]:
# Inference Example
example_input = "<h1>Heading"
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Original HTML code: <h1>Heading
Corrected HTML code: <h1>Heading the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

In [12]:
# Inference Example
example_input = """<p>This is a paragraph."""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: <p>This is a paragraph.

































In [13]:
# Inference Example
example_input = """<div><span>This is some text.</p></div>"""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: <div><span>This is some text.</p></div>><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><><span><span><span><span><span><span><span><span><span><span><span><span><span><span><><span><span><span><span><span><><span><span><span><><span><span><span><><span><><span><span><><span><><span><span><><span><span><span><><><><span><span><><><span><><span><span><><span><><span><span><><><><span><span><><span><><span><><><span><><><span><><><span><><><span><><><><span><><><><span><span><><><><><span><><><><><span><><><><><span><><><span><><><><><><span><><><><><span><><><><span><><><span><><><><><><><><><span><><><><><span><><><><><span><><><><><><><><span><><><><><><><span><><><><><><><><

In [14]:
# Inference Example
example_input = """list = [1,2,3,4,5,6"""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=10, top_p=0.97, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: list = [1,2,3,4,5,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [21]:
# Inference Example
example_input = """<div>list<div>"""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=10, top_p=0.97, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  return fn(*args, **kwargs)


Corrected HTML code: <div>list<div>div>divdiv>divdivdiv>div>div>div>div>divdiv>div>divdiv>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>div>divdiv>>div>div>div>div>div>div>divdiv>divdiv>div>divdiv>div>div>div>div>div>divdivdiv>divdiv>divdiv>divdiv>div>divdiv>div>divdiv>divdiv>div>div>divdivdivdiv>div>divdiv>div>div>div>div>divdiv>divdivdiv>divdivdivdiv>divdivdiv>divdivdiv>div>div>divdivdiv>divdivdiv>divdivdiv>divdivdivdivdiv>divdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdivdi