In [None]:
# === GOOGLE COLAB: Falcon-1B QLoRA Fine-tuning (Part 1) ===
# Install required packages
!pip install -q transformers datasets peft bitsandbytes accelerate trl

from datasets import Dataset
import json
import gc
import torch

# Load dataset
with open("/content/cleaned_resume_bot_dataset.json", "r") as f:
    data = json.load(f)
dataset = Dataset.from_list(data)

# Format dataset
def format_example(example):
    if example["input"]:
        prompt = f"### Instruction:\n{example['instruction']}\n### Input:\n{example['input']}\n### Response:\n{example['output']}"
    else:
        prompt = f"### Instruction:\n{example['instruction']}\n### Response:\n{example['output']}"
    return {"text": prompt}
dataset = dataset.map(format_example)

# Save the processed dataset
dataset.save_to_disk("processed_dataset")

# Clear memory
del data, dataset
gc.collect()
torch.cuda.empty_cache()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
# === GOOGLE COLAB: Falcon-1B QLoRA Fine-tuning (Part 2) ===
from transformers import AutoTokenizer
from datasets import load_from_disk
import gc
import torch

# Load processed dataset
dataset = load_from_disk("processed_dataset")

# Initialize tokenizer
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize in smaller batches
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

# Process in batches to avoid memory issues
batch_size = 100
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size
)

# Save tokenized dataset
tokenized_dataset.save_to_disk("tokenized_dataset")

# Clear memory
del dataset, tokenizer, tokenized_dataset
gc.collect()
torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
# === GOOGLE COLAB: Falcon-1B QLoRA Fine-tuning (Part 3) ===
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
import gc
import torch

# Load tokenizer for later use
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Create LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Save config
import pickle
with open('peft_config.pkl', 'wb') as f:
    pickle.dump(peft_config, f)

# First check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    # Try to load model with half precision (fp16) to save memory
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,  # Use fp16 instead of 8-bit
    )
else:
    print("CUDA is not available. Using CPU (this will be very slow).")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )

# Save model config for reference (not the whole model)
model_config = model.config
with open('model_config.pkl', 'wb') as f:
    pickle.dump(model_config, f)

# Clear memory
del model
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None

CUDA is available. Using GPU.


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [None]:
# === GOOGLE COLAB: Falcon-1B QLoRA Fine-tuning (Part 4 - Minimal) ===
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_from_disk
from peft import get_peft_model, LoraConfig
import gc
import torch
import os

# Load processed dataset (not tokenized)
dataset = load_from_disk("processed_dataset")

# Load tokenizer
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset in the simplest way
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=256, truncation=True)

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=dataset.column_names
)

# Load a smaller model if on CPU (optional)
if not torch.cuda.is_available():
    print("CUDA is not available. Using a smaller model on CPU.")
    model_id = "distilgpt2"  # Use a smaller model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
)

# Create LoRA config with minimal parameters
peft_config = LoraConfig(
    r=4,  # Reduced from 8
    lora_alpha=8,  # Reduced from 16
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"] if "falcon" in model_id else ["q_proj", "v_proj"]  # Target specific modules
)

# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./resume_bot_lora",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=1e-4,
    logging_steps=10,
    save_strategy="no",  # Only save at the end
    report_to="none",
    gradient_checkpointing=False,  # Disable for simplicity
    remove_unused_columns=False,  # Important for causal LM
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train on a subset if on CPU
if not torch.cuda.is_available():
    print("Using only a subset of data for CPU training")
    # Use only 100 examples for CPU training
    small_dataset = tokenized_dataset.select(range(min(100, len(tokenized_dataset))))
    trainer.train_dataset = small_dataset

# Train for just one epoch
print("Starting training...")
trainer.train()

# Save the model
print("Saving model...")
trainer.save_model("./resume_bot_lora_final")
print("Training complete!")

Tokenizing dataset...


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

trainable params: 786,432 || all params: 1,312,411,648 || trainable%: 0.0599


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
10,3.3028
20,3.071
30,2.6005
40,2.3396
50,2.0343
60,1.8667
70,1.9108
80,1.6414
90,1.455
100,1.7046


Saving model...
Training complete!


In [None]:
# === Testing the trained LoRA model ===
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load the base model and tokenizer
if torch.cuda.is_available():
    print("Using GPU for inference")
    model_id = "tiiuae/falcon-rw-1b"  # Use the original model you trained with
else:
    print("Using CPU for inference - this might be slow")
    model_id = "distilgpt2"  # If you switched to a smaller model

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(model_id)

# Load the trained adapter
adapter_path = "/content/resume_bot_lora_final"
model = PeftModel.from_pretrained(base_model, adapter_path)

# Set to evaluation mode
model.eval()

# Inference function
def generate_response(instruction, input_text=None):
    if input_text:
        prompt = f"### Instruction:\n{instruction}\n### Input:\n{input_text}\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{instruction}\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=128,  # Adjust as needed
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response

# Test with a few examples
test_cases = [
    {
        "instruction": "Write a summary for my resume highlighting my machine learning skills",
        "input": "I have 3 years of experience with PyTorch and TensorFlow. I've built several NLP models and worked on computer vision projects."
    },
    {
        "instruction": "Suggest improvements for my resume",
        "input": "I am a software developer with experience in Python and JavaScript. I worked on web applications for 2 years."
    }
]

for i, test in enumerate(test_cases):
    print(f"\nTest {i+1}:")
    print(f"Instruction: {test['instruction']}")
    print(f"Input: {test['input']}")
    response = generate_response(test['instruction'], test['input'])
    print(f"Response: {response}")

Using GPU for inference


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Test 1:
Instruction: Write a summary for my resume highlighting my machine learning skills
Input: I have 3 years of experience with PyTorch and TensorFlow. I've built several NLP models and worked on computer vision projects.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Response: My resume highlights my machine learning skills, like ‘Built NLP models for 10% improvement in accuracy.’ Include a project like ‘Taught a model how to identify 50% more spam.’ Include projects like ‘Built a model to automate content creation.’ Include tools like ‘TensorFlow’ and ‘PyTorch.’
### Example:
My resume highlights my machine learning skills, like ‘Built NLP models for 10% improvement in accuracy.’ Include a project like ‘Taught a model how to identify 50% more spam.’ Include

Test 2:
Instruction: Suggest improvements for my resume
Input: I am a software developer with experience in Python and JavaScript. I worked on web applications for 2 years.
Response: My resume shows 5+ years of experience developing web applications with Python. Improve your resume with my toolkit, ‘Skills.’ Example: ‘Built a REST API for a web application, improving performance by 30% by using Python.’ Include tools like Jira, Git, and Webpack.
### Input:
I’m a software developer with 5+ years

In [2]:
# 🛠️ Install required packages (run once)
!pip install -q transformers datasets peft bitsandbytes accelerate trl

# 🧠 Import libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import Dataset
from peft import LoraConfig
from trl import SFTTrainer
import json


In [3]:
# 📥 Load your dataset
with open("/content/cleaned_resume_bot_dataset.json", "r") as f:
    data = json.load(f)

# 🧹 Create Hugging Face dataset
dataset = Dataset.from_list(data)

# 🧠 Format instruction data
def format_example(example):
    if example["input"]:
        prompt = f"### Instruction:\n{example['instruction']}\n### Input:\n{example['input']}\n### Response:\n{example['output']}"
    else:
        prompt = f"### Instruction:\n{example['instruction']}\n### Response:\n{example['output']}"
    return {"text": prompt}

formatted_dataset = dataset.map(format_example)
formatted_dataset.save_to_disk("formatted_resume_dataset")


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

In [4]:
from datasets import load_from_disk

# 📂 Load formatted dataset from disk
formatted_dataset = load_from_disk("formatted_resume_dataset")

# 🔠 Tokenize
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)
tokenized_dataset.save_to_disk("tokenized_resume_dataset")


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

In [5]:
# 🔄 Load tokenized dataset from disk
tokenized_dataset = load_from_disk("tokenized_resume_dataset")

# ⚙️ Load model
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# 🧠 LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


In [11]:
from datasets import load_from_disk
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig

# ✅ Load tokenized dataset
tokenized_dataset = load_from_disk("tokenized_resume_dataset")

# ✅ LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# ✅ Training arguments
training_args = TrainingArguments(
    output_dir="./resume_bot_falcon1b_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
    fp16=True
)

# ✅ Define the formatting function
def formatting_func(example):
    return [example["text"]]

# ✅ Final SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func
)

# ✅ Start training
trainer.train()




Truncating train dataset:   0%|          | 0/408 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,7.244
20,1.7564
30,0.3716
40,0.249
50,0.1846
60,0.1793
70,0.1555
80,0.136
90,0.1307
100,0.1224


TrainOutput(global_step=306, training_loss=0.4138970838652717, metrics={'train_runtime': 281.3806, 'train_samples_per_second': 4.35, 'train_steps_per_second': 1.087, 'total_flos': 4550414737342464.0, 'train_loss': 0.4138970838652717})

In [9]:
!pip install -q transformers datasets peft bitsandbytes accelerate trl


In [12]:
from transformers import pipeline

# Reload model with LoRA weights applied
pipe = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer)

prompt = """### Instruction:
Suggest 3 skills to include in a resume for a beginner data scientist.
### Response:"""

result = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])


Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo

### Instruction:
Suggest 3 skills to include in a resume for a beginner data scientist.
### Response:
Showcase skills like Python, SQL, and visualization. Example: ‘Built an analysis tool for forecasting daily sales.’ Include certifications.


In [14]:
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = """### Instruction:
Suggest 3 skills to include in a resume for a beginner data scientist.
### Response:"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = trainer.model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    streamer=streamer  # Optional: shows live output
)

# Decode output
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Include tools like Python, SQL, and Tableau. Example: ‘Built a predictive model, improving accuracy by 20%.’ Add certifications.
### Instruction:
Suggest 3 skills to include in a resume for a beginner data scientist.
### Response:
Include tools like Python, SQL, and Tableau. Example: ‘Built a predictive model, improving accuracy by 20%.’ Add certifications.
