In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import time

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from huggingface_hub import login
login(token = 'my token')

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = model.to(device)

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
print_number_of_trainable_model_parameters(model)

'trainable model parameters: 1235814400\nall model parameters: 1235814400\npercentage of trainable model parameters: 100.00%'

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    lora_dropout=0.05,
    bias="none"
    #task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)


In [7]:
peft_model = get_peft_model(model,
                            lora_config)
peft_model = peft_model.to(device) 
# Print number of trainable model parameters
# Your Code Here
# Function to print the number of trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for param in model.parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable parameters: {trainable_params} ({100 * trainable_params / all_params:.2f}%) of total parameters")

# Print the number of trainable parameters
print_trainable_parameters(peft_model)

Trainable parameters: 22544384 (1.79%) of total parameters


In [8]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

### Demo data change later, neeed to load the real data into a similar format later

In [9]:
data = [
    {
        "job_description": "We are looking for a data scientist with experience in Python, SQL, and ML modeling.",
        "original_resume": "John Doe: Experienced software engineer with background in web development, JavaScript, and frontend design.",
        "prompt": "Please provide suggestions and instructions to improve the above resume so it matches the given job description.",
        "improved_instructions": "Focus on highlighting Python and SQL experience. Mention machine learning projects and emphasize data analysis skills."
    },
    {
        "job_description": "Our company seeks a backend developer proficient in Node.js, databases, and AWS deployments.",
        "original_resume": "Jane Smith: Full-stack developer with React, CSS, and graphic design experience.",
        "prompt": "Given the job description and original resume, improve the resume with backend development keywords and relevant AWS skills.",
        "improved_instructions": "Add Node.js backend project experience, emphasize database optimization, and showcase AWS deployment experience."
    }
]

In [10]:
# Convert this to a Hugging Face Dataset
dataset = Dataset.from_list(data)

def format_example(example):
    return (
        "Job Description:\n" + example["job_description"] + "\n\n" +
        "Original Resume:\n" + example["original_resume"] + "\n\n" +
        "Instruction:\n" + example["prompt"] + "\n\n" +
        "Improved Instructions (TARGET):\n" + example["improved_instructions"]
    )

In [11]:
def preprocess_function(examples):
    model_inputs = {
        "input_ids": [],
        "attention_mask": [],
        "labels": []
    }

    for jd, resume, p, target in zip(
        examples["job_description"],
        examples["original_resume"],
        examples["prompt"],
        examples["improved_instructions"]
    ):
        # Prepare a prompt that includes job description, original resume, and user prompt
        prompt_text = (
            "Below is a job description and an original resume. "
            "Your task is to provide suggestions and instructions on how to improve the resume.\n\n"
            f"Job Description:\n{jd}\n\n"
            f"Original Resume:\n{resume}\n\n"
            f"USER PROMPT:\n{p}\n\n"
            "Please provide improved instructions below:\n"
        )

        # Full text includes the prompt and the target instructions
        full_text = prompt_text + target

        # Tokenize the full text
        tokenized = tokenizer(full_text, max_length=512, truncation=True)

        # For causal LM training, labels = input_ids
        tokenized["labels"] = tokenized["input_ids"].copy()

        model_inputs["input_ids"].append(tokenized["input_ids"])
        model_inputs["attention_mask"].append(tokenized["attention_mask"])
        model_inputs["labels"].append(tokenized["labels"])

    return model_inputs

In [12]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
train_dataset = tokenized_dataset


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./finetuned-llama-lora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-4,
    num_train_epochs=2,
    logging_steps=1,
    #save_steps=10,
    #save_total_limit=1,
    gradient_accumulation_steps=1,
    fp16=True if torch.cuda.is_available() else False,
    eval_strategy="no",
    logging_dir="./logs"
)

In [14]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [15]:
trainer.train()

# Save the LoRA adapter
trainer.save_model("./finetuned-llama-lora")

Step,Training Loss
1,2.8139
2,3.0665
3,2.2024
4,2.6638


In [16]:
test_job_description = "We need a data engineer with strong Python, ETL, and data warehousing skills."
test_original_resume = "Sam Johnson: Developer with some Python experience and interest in data."
test_prompt = "Provide instructions to improve the resume to match the data engineer role."

test_input = (
    "Below is a job description and an original resume. "
    "Your task is to provide suggestions and instructions on how to improve the resume.\n\n"
    f"Job Description:\n{test_job_description}\n\n"
    f"Original Resume:\n{test_original_resume}\n\n"
    f"USER PROMPT:\n{test_prompt}\n\n"
    "Please provide improved instructions below:\n"
)

inputs = tokenizer(test_input, return_tensors="pt").to(device)

generation_config = GenerationConfig(
    max_new_tokens=128,
    temperature=0.7,
    do_sample=True,
    top_k=50
)

outputs = peft_model.generate(**inputs, generation_config=generation_config)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("=== GENERATED OUTPUT ===")
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


=== GENERATED OUTPUT ===
Below is a job description and an original resume. Your task is to provide suggestions and instructions on how to improve the resume.

Job Description:
We need a data engineer with strong Python, ETL, and data warehousing skills.

Original Resume:
Sam Johnson: Developer with some Python experience and interest in data.

USER PROMPT:
Provide instructions to improve the resume to match the data engineer role.

Please provide improved instructions below:
1. Start with a strong title and objective statement.
2. Highlight relevant technical skills and certifications.
3. Emphasize achievements and accomplishments in the field.
4. Use action verbs to describe responsibilities and tasks.
5. Quantify achievements by including numbers and statistics.
6. Include relevant projects, tools, and technologies.
7. Tailor the resume to the job description.

Here is the original resume:

Sam Johnson: Developer with some Python experience and interest in data.
- Created a web app 