In [1]:
!pip install -q datasets transformers peft accelerate trl

In [2]:
!pip install --upgrade fsspec==2024.12.0 gcsfs==2024.12.0




In [3]:
from google.colab import drive

# Mount Google Drive so that paths under /content/drive are readable from this runtime.
drive.mount('/content/drive')

# Replace with the actual path to your JSON file in Google Drive
gdrive_json_path = '/content/drive/MyDrive/Asha_llm_data/women_jobs_chatbot_data.json'

In [4]:
import json
from datasets import Dataset

# Read the JSON records and wrap them in a `datasets.Dataset`.
# On failure the friendly message is printed and the original exception is
# re-raised, so the cell fails visibly and a "Run all" stops here instead of
# ending the kernel session via exit().
try:
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # (the data contains non-ASCII characters such as "–").
    with open(gdrive_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"❌ Error: JSON file not found at '{gdrive_json_path}'. Please check the path.")
    raise
except json.JSONDecodeError:
    print(f"❌ Error: Could not decode JSON from '{gdrive_json_path}'. Please ensure it's a valid JSON file.")
    raise

dataset = Dataset.from_list(data)
print(f"✅ Dataset loaded with {len(dataset)} samples from Google Drive.")
print(f"Sample dataset entry:\n{dataset[0]}")

✅ Dataset loaded with 4694 samples from Google Drive.
Sample dataset entry:
{'instruction': 'Answer women job related general queries. Tell me about the job: Research Intern – Generative AI Agents', 'output': 'Company: synergylabs, Location: gurugram, haryana, india, Description: job title research intern generative ai agent location gurugram duration month stipend inr u technologydriven startup exploring future aipowered agent automation mission build scalable solution using cuttingedge generative ai llm autonomous agent solve realworld problem join u youre curious innovative ready get hand dirty ai research role overview research intern dive deep world generative ai agent large language model llm autonomous multiagent system youll help u map landscape explore technical capability evaluate tool contribute prototype development key responsibility conduct comprehensive research generative ai agent llmbased agent autonomous agent framework analyze recent paper blog github project industr

In [5]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the model loaded later.
model_name = "EleutherAI/pythia-70m"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding; preprocessing pads every
# example with padding="max_length", which needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Show the dataset summary followed by its first record.
print("--- Inspecting Dataset ---")
for _item in (dataset, dataset[0]):
    print(_item)

--- Inspecting Dataset ---
Dataset({
    features: ['instruction', 'output'],
    num_rows: 4694
})
{'instruction': 'Answer women job related general queries. Tell me about the job: Research Intern – Generative AI Agents', 'output': 'Company: synergylabs, Location: gurugram, haryana, india, Description: job title research intern generative ai agent location gurugram duration month stipend inr u technologydriven startup exploring future aipowered agent automation mission build scalable solution using cuttingedge generative ai llm autonomous agent solve realworld problem join u youre curious innovative ready get hand dirty ai research role overview research intern dive deep world generative ai agent large language model llm autonomous multiagent system youll help u map landscape explore technical capability evaluate tool contribute prototype development key responsibility conduct comprehensive research generative ai agent llmbased agent autonomous agent framework analyze recent paper blo

In [17]:
def preprocess_function_single(example):
    """Tokenize one instruction/output pair into fixed-length causal-LM features.

    The instruction and output are joined with a single space and terminated
    with the tokenizer's EOS token, so the training text carries an explicit
    end-of-answer marker. The text is then truncated/padded to 512 tokens.
    Labels are a copy of ``input_ids`` in which every position whose attention
    mask is not 1 is replaced by -100 (padding is masked out of the labels).

    Args:
        example: Mapping that should provide non-empty ``'instruction'`` and
            ``'output'`` entries.

    Returns:
        The tokenizer output with an added ``"labels"`` list, or ``None`` when
        a field is missing/empty or tokenization raises.

    Note:
        Examples longer than 512 tokens are truncated, in which case the
        trailing EOS token is cut off together with the rest of the text.
    """
    instruction = example.get('instruction')
    output = example.get('output')

    # Guard clause: nothing to tokenize without both fields.
    if not (instruction and output):
        print(f"Warning: Skipping example due to missing 'instruction' or 'output': {example}")
        # NOTE(review): Dataset.map may not accept a mix of dict and None
        # results; consider filtering such rows before mapping. TODO confirm.
        return None

    try:
        # Fall back to an empty suffix if the tokenizer defines no EOS token.
        eos_suffix = tokenizer.eos_token or ""
        input_text = f"{instruction} {output}{eos_suffix}"
        model_inputs = tokenizer(
            input_text,
            truncation=True,
            max_length=512,
            padding="max_length"
        )
        # Mask out padding tokens in labels. The appended EOS keeps its label
        # because its attention mask is 1, even though pad and EOS share an id.
        model_inputs["labels"] = [
            token_id if mask == 1 else -100
            for token_id, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        ]
        return model_inputs
    except Exception as e:
        print(f"Error processing example: {example}\nError: {e}")
        return None


In [8]:
# Take the first five rows as a small subset for a quick preprocessing check.
_sample_indices = range(5)
few_samples_dataset = dataset.select(_sample_indices)


In [9]:
# Run the preprocessing function over the small subset in a single process
# and show the first tokenized record.
tokenized_few_samples = few_samples_dataset.map(
    preprocess_function_single,
    num_proc=1,
)
print(tokenized_few_samples[0])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

{'instruction': 'Answer women job related general queries. Tell me about the job: Research Intern – Generative AI Agents', 'output': 'Company: synergylabs, Location: gurugram, haryana, india, Description: job title research intern generative ai agent location gurugram duration month stipend inr u technologydriven startup exploring future aipowered agent automation mission build scalable solution using cuttingedge generative ai llm autonomous agent solve realworld problem join u youre curious innovative ready get hand dirty ai research role overview research intern dive deep world generative ai agent large language model llm autonomous multiagent system youll help u map landscape explore technical capability evaluate tool contribute prototype development key responsibility conduct comprehensive research generative ai agent llmbased agent autonomous agent framework analyze recent paper blog github project industry update related ai agent benchmark compare agent framework toolkits perform

In [18]:
# Tokenize the full dataset with the same preprocessing function (single
# process) and show the first tokenized record.
tokenized_dataset = dataset.map(
    preprocess_function_single,
    num_proc=1,
)
print(tokenized_dataset[0])



Map:   0%|          | 0/4694 [00:00<?, ? examples/s]

{'instruction': 'Answer women job related general queries. Tell me about the job: Research Intern – Generative AI Agents', 'output': 'Company: synergylabs, Location: gurugram, haryana, india, Description: job title research intern generative ai agent location gurugram duration month stipend inr u technologydriven startup exploring future aipowered agent automation mission build scalable solution using cuttingedge generative ai llm autonomous agent solve realworld problem join u youre curious innovative ready get hand dirty ai research role overview research intern dive deep world generative ai agent large language model llm autonomous multiagent system youll help u map landscape explore technical capability evaluate tool contribute prototype development key responsibility conduct comprehensive research generative ai agent llmbased agent autonomous agent framework analyze recent paper blog github project industry update related ai agent benchmark compare agent framework toolkits perform

In [22]:
from peft import prepare_model_for_kbit_training

# NOTE: `model` is first assigned in the cell that loads the base checkpoint
# further down, so calling prepare_model_for_kbit_training(model) at this
# point raises NameError on a fresh kernel. The preparation call is made in
# the later cell that runs after the model has been loaded.


In [19]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch

# Instantiate the base causal-LM checkpoint; device placement is delegated
# to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')

# LoRA hyperparameters, gathered in one mapping before building the config.
_lora_settings = dict(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**_lora_settings)

# Wrap the base model with the adapter and report the trainable share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 98,304 || all params: 70,524,928 || trainable%: 0.1394


In [23]:
from peft import prepare_model_for_kbit_training, get_peft_model

# Make model ready for LoRA + gradient updates
# NOTE(review): the base model is loaded above without any quantization
# arguments, so it is unclear whether the k-bit preparation step is needed
# here. TODO confirm.
model = prepare_model_for_kbit_training(model)
# NOTE(review): `model` was already wrapped with
# get_peft_model(model, lora_config) in the cell that loads it; this line
# wraps it again with the same config. Confirm the double wrap is intended
# before changing the order of these two statements.
model = get_peft_model(model, lora_config)

# Double-check it has trainable parameters
model.print_trainable_parameters()


trainable params: 98,304 || all params: 70,524,928 || trainable%: 0.1394




In [20]:
from transformers import TrainingArguments

# --- Output location and checkpoint / logging cadence ---
output_dir = "./pythia-70m-women-jobs-lora"
save_steps = 100
logging_steps = 10

# --- Batch geometry: 4 samples per device x 4 accumulation steps ---
per_device_train_batch_size = 4
gradient_accumulation_steps = 4

# --- Optimizer and learning-rate schedule ---
optim = "adamw_torch"
learning_rate = 5e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
max_grad_norm = 0.3
max_steps = 1000

# Bundle everything into the TrainingArguments object handed to the trainer.
training_arguments = TrainingArguments(
    # where and how often to write
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    push_to_hub=False,
    # batch geometry
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # optimisation
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    # precision / memory
    fp16=True,
    gradient_checkpointing=True,
)


In [16]:
# Tally parameter elements in a single pass: every tensor counts towards the
# total, and those with requires_grad set also count as trainable.
trainable_params = 0
total_params = 0
for _param in model.parameters():
    _size = _param.numel()
    total_params += _size
    if _param.requires_grad:
        trainable_params += _size
print(f"Trainable parameters: {trainable_params} / {total_params}")


Trainable parameters: 98304 / 70524928


In [24]:
from trl import SFTTrainer

# Put the model into training mode before handing it to the trainer.
model.train()

# `model` has already been wrapped with get_peft_model(model, lora_config) in
# an earlier cell, so the LoRA config is not passed to the trainer a second
# time; the trainer receives the adapter-wrapped model as-is.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_arguments,
)

# Start training
print("\n🚀 Starting the training process...")
trainer.train()
print("✅ Training finished!")

Truncating train dataset:   0%|          | 0/4694 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🚀 Starting the training process...


Step,Training Loss
10,7.8413
20,7.5776
30,5.9062
40,2.7794
50,1.2638
60,1.1225
70,0.992
80,0.8532
90,0.7825
100,0.7334


✅ Training finished!


In [28]:
# Persist the trained adapter held by the trainer into the run directory.
_trained_model = trainer.model
_trained_model.save_pretrained(output_dir)

# Store the tokenizer files next to it (needed for inference); the returned
# file paths are the cell's displayed output.
tokenizer.save_pretrained(output_dir)


('./pythia-70m-women-jobs-lora/tokenizer_config.json',
 './pythia-70m-women-jobs-lora/special_tokens_map.json',
 './pythia-70m-women-jobs-lora/tokenizer.json')

In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Rebuild the tokenizer and the base network from the hub checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m", device_map="auto")

# Attach the saved LoRA adapter and switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, "./pythia-70m-women-jobs-lora")
model.eval()

# Encode the prompt and move the tensors to the model's device.
prompt = "Suggest jobs for Software Technology\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling settings, kept in one mapping so they are easy to tweak.
_generation_kwargs = dict(
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

# No gradients are needed for generation.
with torch.no_grad():
    outputs = model.generate(**inputs, **_generation_kwargs)

_decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("🧠 Model Output:\n", _decoded)


🧠 Model Output:
 Suggest jobs for Software Technology

A:

I think it would be more interesting to consider that the current system is a more powerful system than the one that can be built.
The reason is that it's a lot easier to build software, which is why it's a lot easier to build software.


