In [1]:
import json, random, faker

**Set up the Faker generator and the pools of roles, skills, and certificates used to build fake data**

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset.
SEED = 42
random.seed(SEED)  # drives the random.choice / random.randint calls in the generation cell

fake = faker.Faker()
faker.Faker.seed(SEED)  # seeds the random generator used by Faker providers

# Job titles; one is picked per generated profile.
roles = ["Backend Developer", "Frontend Developer", "Full-Stack Developer", "DevOps Engineer"]

# Each inner list is one coherent tech stack; a profile receives a whole stack.
skills_pool = [
    ["Java", "Spring Boot", "PostgreSQL", "Docker"],
    ["Python", "Django", "MySQL", "AWS"],
    ["JavaScript", "React", "Node.js", "MongoDB"],
    ["C#", ".NET", "Azure", "CI/CD"],
    ["Go", "Gin", "Kubernetes", "GCP"]
]

# (certificate title, issuing organization) pairs.
certs = [
    ("AWS Certified Developer", "Amazon Web Services"),
    ("Oracle Certified Java Developer", "Oracle"),
    ("Microsoft Azure Fundamentals", "Microsoft"),
    ("Docker Certified Associate", "Docker Inc."),
]

**Generate Data**

In [3]:
# Build 500 synthetic resume records. Each record pairs a plain-text resume
# ("input") with the structured fields it was rendered from ("output").
data = []
for i in range(500):
    # --- identity and contact details ---
    name = fake.name()
    title = random.choice(roles)
    email = fake.email()
    phone = f"0{random.randint(200,999)}-{random.randint(555,999)}-{random.randint(1000,9999)}"
    # Profile handles are the lower-cased first token of the name plus a random number.
    github = f"https://github.com/{name.split()[0].lower()}{random.randint(1,999)}"
    linkedin = f"https://linkedin.com/in/{name.split()[0].lower()}{random.randint(1,999)}"
    location = f"{fake.city()}, {fake.country()}"
    dob = fake.date_of_birth(minimum_age=22, maximum_age=35).strftime("%d/%m/%Y")

    # --- resume sections ---
    summary = f"{title} with {random.randint(2,8)} years of experience specializing in scalable systems and web applications."
    skillset = [{"name": s} for s in random.choice(skills_pool)]
    company = fake.company()
    work_experience = [{
        "position": title,
        "company": company,
        # The resume text below prints this field; without it the f-string
        # lookup work_experience[0]['period'] raises KeyError.
        "period": f"{random.randint(2018,2023)} – Present",
    }]
    project = [{
        "name": f"{random.choice(['Inventory', 'Booking', 'Payment', 'E-Commerce', 'Analytics'])} System",
        "description": f"Developed a {random.choice(['web', 'cloud', 'mobile'])}-based solution using modern frameworks.",
        "link": github
    }]
    education = [{
        "school": fake.company() + " University",
        "major": "Computer Science",
        "period": f"{random.randint(2013,2017)} – {random.randint(2017,2021)}"
    }]
    cert_title, cert_org = random.choice(certs)
    certificate = [{"title": cert_title, "organization": cert_org, "date": str(random.randint(2020,2024))}]
    language = [{"name": "English", "level": random.choice(["Fluent", "Advanced", "Native"])}]

    # --- plain-text rendering of the resume (one fragment per output line) ---
    skill_names = ', '.join([s['name'] for s in skillset])
    resume_text = (
        f"{name}\n"
        f"Email: {email} | Phone: {phone}\n"
        f"GitHub: {github} | LinkedIn: {linkedin}\n"
        f"Location: {location} | Date of Birth: {dob}\n"
        f"\n"
        f"Summary:\n"
        f"{summary}\n"
        f"\n"
        f"Skill:\n"
        f"{skill_names}\n"
        f"\n"
        f"Work Experience:\n"
        f"{work_experience[0]['period']}\n"
        f"{title} at {company}\n"
        f"\n"
        f"Highlight Project:\n"
        f"{project[0]['name']} – {project[0]['description']}\n"
        f"\n"
        f"Education:\n"
        f"{education[0]['school']} – {education[0]['major']} ({education[0]['period']})\n"
        f"\n"
        f"Certificate:\n"
        f"{certificate[0]['title']} – {certificate[0]['organization']} ({certificate[0]['date']})\n"
        f"\n"
        f"Foreign Language:\n"
        f"English ({language[0]['level']})"
    )

    obj = {
        "input": resume_text,
        "output": {
            "name": name,
            "title": title,
            "contact": {
                "email": email,
                "phone": phone,
                "github": github,
                "linkedin": linkedin,
                "location": location,
                "date_of_birth": dob
            },
            "summary": summary,
            "skills": skillset,
            "work_experience": work_experience,
            "projects": project,
            "education": education,
            "certificates": certificate,
            "languages": language
        }
    }

    data.append(obj)

Each generated profile is appended to the `data` list.

In [4]:
# Save the dataset with one JSON object per line.
# ensure_ascii=False writes non-ASCII characters (such as the "–" separators) as-is.
with open("../agent_core/data/resume_dataset.txt", "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Report the actual number of records written instead of a hard-coded count.
print(f"✅ Generated resume_dataset.txt with {len(data)} entries.")

✅ Generated resume_dataset.txt with 500 entries.
