In [1]:
!pip install -q transformers datasets accelerate peft bitsandbytes trl


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m

In [2]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import gc

2025-10-07 05:41:47.423390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759815707.620749      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759815707.675338      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


GPU Available: True
GPU Name: Tesla T4
GPU Memory: 15.83 GB


In [29]:
model_name = "Qwen/Qwen2.5-3B-Instruct"

raw_data = load_dataset("json", data_files="/kaggle/input/aug-dataset/Augmented-dataset.json")
print(f"Dataset loaded: {raw_data}")
print(f"Sample: {raw_data['train'][0]}")

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 197
    })
})
Sample: {'prompt': 'What is your Name?', 'completion': 'My name is Bhuvan S.'}


In [30]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [31]:
def preprocess(examples):
    texts = []
    for prompt, completion in zip(examples["prompt"], examples["completion"]):
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": completion}
        ]
        # Apply chat template
        text = tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=False
        )
        texts.append(text)
    
    tokenized = tokenizer(
        texts,
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors=None
    )
    
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

In [32]:
tokenized_data = raw_data.map(
    preprocess, 
    batched=True,
    remove_columns=raw_data["train"].column_names,
    desc="Tokenizing dataset"
)

print(f"Tokenized dataset: {tokenized_data}")

Tokenizing dataset:   0%|          | 0/197 [00:00<?, ? examples/s]

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 197
    })
})


In [33]:
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  
    torch_dtype=torch.float16,
    trust_remote_code=True,
    low_cpu_mem_usage=True  
)

model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  
    lora_alpha=32,  
    lora_dropout=0.05,  
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  
    bias="none"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 7,372,800 || all params: 3,093,311,488 || trainable%: 0.2383


In [36]:
output_dir = "/kaggle/working/qwen-finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=50,  
    per_device_train_batch_size=2, 
    gradient_accumulation_steps=4, 
    learning_rate=2e-4, 
    lr_scheduler_type="cosine",  
    warmup_steps=10,  
    logging_steps=100,  
    save_strategy="epoch",  
    save_total_limit=2,  
    fp16=True,  
    gradient_checkpointing=True,
    optim="adamw_torch",  
    report_to="none", 
    push_to_hub=False,
    max_grad_norm=1.0, 
    remove_unused_columns=False,  
    eval_strategy="no", 
)

In [37]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

print("Starting training...")
trainer.train()

  trainer = Trainer(


Starting training...


Step,Training Loss
100,0.7869
200,0.2683
300,0.1289
400,0.1027
500,0.0881
600,0.0817
700,0.0772
800,0.0753
900,0.0738
1000,0.0724


TrainOutput(global_step=1250, training_loss=0.15464356327056886, metrics={'train_runtime': 2933.9598, 'train_samples_per_second': 3.357, 'train_steps_per_second': 0.426, 'total_flos': 4.2092764594176e+16, 'train_loss': 0.15464356327056886, 'epoch': 50.0})

In [38]:
model.save_pretrained(f"{output_dir}/final_model_new")
tokenizer.save_pretrained(f"{output_dir}/final_model_new")

print(f"Model saved to {output_dir}/final_model")

Model saved to /kaggle/working/qwen-finetuned/final_model


In [57]:
from peft import PeftModel


base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

finetuned_model = PeftModel.from_pretrained(
    base_model,
    f"{output_dir}/final_model_new"
)
finetuned_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 2048)
        (layers): ModuleList(
          (0-35): 36 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

In [62]:
test_prompts = [
    "What is your name?",
    "Tell me about yourself",
    "What are your technical skills?",
    "what AI techniques are you familiar with and where have you used them?",
    "Do you have any experience that isn't purely technical?",
    "What are you interested in?",
    "What's your specialty?",
    "How can I contact you?",
]


In [63]:
print("="*60)
print("TESTING FINE-TUNED MODEL")
print("="*60)

for test_prompt in test_prompts:
    messages = [{"role": "user", "content": test_prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.1,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("assistant\n")[-1] if "assistant" in response else response
    
    print(f"\nprompt: {test_prompt}")
    print(f"Response: {response.strip()}")
    print("-"*130)


TESTING FINE-TUNED MODEL

prompt: What is your name?
Response: My name is Bhuvan S.
----------------------------------------------------------------------------------------------------------------------------------

prompt: Tell me about yourself
Response: I'm Bhuvan S., an aspiring AI engineer in my final year of college. I love working on end-to-end projects, from data analysis and model training to building APIs and deploying them. My main interests are in deep learning, computer vision, and building practical, explainable AI systems.
----------------------------------------------------------------------------------------------------------------------------------

prompt: What are your technical skills?
Response: I have hands-on experience with several tools, including Git for version control, Python and its libraries for programming, GitHub Actions for automating workflows, Flask for web development, BERT and other NLP models for building intelligent systems, GPUs for accelerated c