In [1]:
import pandas as pd
import re
from transformers import pipeline

In [2]:
# ─── Configuration ─────────────────────────────────────────────────────────────
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # if you still need it elsewhere
LLM_MODEL       = "gpt2"                                    # local substitute for gpt-3.5
CHECKPOINT_CSV  = "/Users/basusmac/Desktop/Github Repositories/SmartCandidate-Analyzer-RAG-Based-Resume-Screening/data/main-data/data-checkpoints/jd5.csv"
OUTPUT_TESTSET  = "/Users/basusmac/Desktop/Github Repositories/SmartCandidate-Analyzer-RAG-Based-Resume-Screening/data/main-data/test-sets/testset-5.csv"
OUTPUT_GENERATED= "/Users/basusmac/Desktop/Github Repositories/SmartCandidate-Analyzer-RAG-Based-Resume-Screening/data/main-data/generated-resumes/synthetic-resume-5.csv"

In [3]:
# ─── 1) Load your job descriptions ───────────────────────────────────────────────
df = pd.read_csv(CHECKPOINT_CSV)
titles = df["Job Title"].tolist()
descs  = df["Job Description"].tolist()
# prepend the title to each description
jobs   = [f"{t}\n{d}" for t, d in zip(titles, descs)]

In [4]:
# ─── 2) Instantiate a local text‑generation pipeline ─────────────────────────────
generator = pipeline(
    "text-generation",
    model=LLM_MODEL,
    device=0,                # set -1 if you’re on CPU only
    max_new_tokens=512,      # adjust to allow up to 512 tokens of output
    do_sample=True,
    top_k=50,
    return_full_text=False   # so we only get the generated part, not the prompt
)

Device set to use mps:0


In [5]:
# ─── 3) Generate resumes ─────────────────────────────────────────────────────────
generated_resumes_list = []          # fix typo here
for jd in jobs:                      # every single JD
    prompt = (
        "You are an AI assistant that helps create resumes for a given job description.\n"
        "Generate 2 resumes for each job description so that one resume is an almost perfect match, "
        "while the other resume is only slightly relevant. Use a combination of skills, different "
        "industry/project work experience, education, and certifications to produce resume data. "
        "You may add some KPIs to make work experience realistic. Do not include any note or "
        "explanation of how you generate the resumes.\n\n"
        f"Job Description (200–500 words max):\n{jd}"
    )
    out = generator(prompt)                # run the HF pipeline
    generated_text = out[0]["generated_text"]  # extract the generated text
    generated_resumes_list.append(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gen

In [7]:
# ─── 4) Split out the two resumes and record ground truth ─────────────────────────
generated_resumes = []
ground_truths     = []

for text in generated_resumes_list:
    # 1) Remove the header markers, just like before
    clean = re.sub(
        r"(?i)(Resume 1|Perfect Match|Almost Perfect Match|Slightly Relevant)",
        "",
        text
    ).strip()

    # 2) Try splitting on the literal marker first
    if "Resume 2" in clean:
        part1, part2 = clean.split("Resume 2", 1)
    else:
        # Fallback: split on the first double‑newline
        parts = clean.split("\n\n", 1)
        if len(parts) == 2:
            part1, part2 = parts
        else:
            # If still only one chunk, treat whole thing as part1, leave part2 empty
            part1, part2 = parts[0], ""

    # 3) Strip off any leading/trailing whitespace
    part1 = part1.strip()
    part2 = part2.strip()

    # 4) Collect
    generated_resumes.extend([part1, part2])
    ground_truths.append(part1)

In [8]:
# ─── 5) Build & save the test‑set CSV ────────────────────────────────────────────
testset_df = pd.DataFrame({
    "Job Description": jobs,
    "Ground Truth":    ground_truths
})
testset_df.to_csv(OUTPUT_TESTSET, index=False)

In [9]:
# ─── 6) Shuffle & save the synthetic resumes CSV ─────────────────────────────────
gen_df = pd.DataFrame({"Resume": generated_resumes})
gen_df = gen_df.sample(frac=1, random_state=13)  # shuffle
gen_df.to_csv(OUTPUT_GENERATED, index=False)