Implementamos un generador de datos sintéticos basado en un prompt que también se genera de forma sintética, para conseguir más diversidad en los textos.

In [None]:
!pip install transformers accelerate bitsandbytes sentencepiece huggingface-hub torch


Generación local de cover letters usando modelos descargados desde Hugging Face<br>
(soporta cuantización en 8 bits vía bitsandbytes).

Uso:<br>
  - Crea un secrets.json con tu token HF si el modelo requiere autenticación:<br>
    {
      "HUGGINGFACE_TOKEN": "hf_xxx",

      "LLAMA_MODEL": "meta-llama/Llama-2-7b-chat-hf",
      
      "HF_MODEL": "tiiuae/falcon-7b-instruct"
    }
  - Añade secrets.json a .gitignore antes de subir a GitHub.

Notas de hardware:
  - En GPU tipo P100 (12/16GB VRAM) modelos de ~7B con 8-bit suelen ser factibles.



In [None]:

import os
import json
import textwrap
import logging
from typing import Tuple, Optional

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Module-level logger shared by every function in this notebook.
LOG = logging.getLogger(__name__)
# Configure the root logger at INFO level so the progress messages emitted
# while loading models and generating letters are shown.
logging.basicConfig(level=logging.INFO)

#Utils 
def load_secrets(secrets_path: str = "secrets.json") -> dict:
    """Return the parsed contents of the secrets file, or ``{}`` if it is absent.

    Args:
        secrets_path: Path to a JSON file holding tokens / model names.

    Returns:
        Whatever ``json.load`` yields for the file, or an empty dict when
        nothing exists at ``secrets_path``.
    """
    # Missing file is not an error: callers fall back to environment variables.
    if not os.path.exists(secrets_path):
        return {}

    with open(secrets_path, "r", encoding="utf-8") as handle:
        secrets = json.load(handle)
    return secrets


# Prompt builder
def random_applicant() -> dict:
    """Build one randomly generated applicant profile (all values in English).

    Every field is drawn from a fixed pool with the ``random`` module, so the
    result is reproducible after ``random.seed(...)``.

    Returns:
        A dict with the keys ``name``, ``role``, ``company``, ``industry``,
        ``education_level``, ``experience_summary``, ``skills_list`` (4 to 7
        distinct skills), ``tone`` and ``extra``.
    """
    import random

    # --- Writing guidance appended to the prompt ---
    guidance_notes = [
        "Emphasize measurable impact and leadership.",
        "Highlight adaptability and eagerness to learn new tools.",
        "Show strong cross-functional collaboration and communication.",
        "Focus on creative problem-solving and curiosity for innovation.",
        "Mention the applicant’s commitment to diversity and inclusion.",
        "Emphasize attention to detail and efficiency.",
        "Showcase ability to work under pressure and meet deadlines.",
        "Highlight mentorship and team motivation skills.",
        "Include a brief anecdote that reflects passion for the field.",
        "Keep the tone optimistic and forward-thinking.",
    ]

    # --- Skills ---
    skill_bank = [
        "Python", "R", "Java", "C++", "SQL", "Machine Learning", "Deep Learning", "Project Management",
        "Marketing Strategy", "Data Visualization", "Cloud Computing", "Communication", "Leadership",
        "Team Collaboration", "Research", "Excel", "Power BI", "TensorFlow", "React", "Docker", "Kubernetes",
        "SEO Optimization", "Public Speaking", "Customer Relations", "Financial Modeling",
    ]

    # --- Writing tones ---
    voices = [
        "professional", "enthusiastic", "confident", "humble but ambitious", "creative", "friendly", "formal",
        "inspirational", "passionate about innovation", "results-driven", "empathetic", "analytical",
    ]

    # --- Academic background ---
    degrees = [
        "Bachelor's in Computer Science", "Master's in Data Analytics", "MBA",
        "Bachelor's in Marketing", "PhD in Artificial Intelligence", "Bachelor's in Business Administration",
        "Master's in Mechanical Engineering", "Bachelor's in Graphic Design", "BSc in Economics",
        "Bachelor's in Psychology", "BA in Communications",
    ]

    # --- Sectors / industries ---
    sectors = [
        "technology", "finance", "education", "healthcare", "energy", "marketing", "consulting",
        "manufacturing", "media", "logistics", "environmental science", "AI research", "non-profit",
    ]

    # --- Fictional employers ---
    employers = [
        "Acme Corp", "TechNova", "BlueSky Labs", "FutureWorks", "Quantum Analytics", "NextGen Solutions",
        "EverBright", "Pioneer Systems", "OrbitSoft", "NeuralEdge", "BrightLeaf Consulting", "SkyBridge AI",
        "Helix Dynamics", "CleverPath", "OptiData", "Sunrise Media", "GreenFlow Technologies",
    ]

    # --- Job titles ---
    job_titles = [
        "Data Scientist", "Software Engineer", "Frontend Developer", "UX Designer",
        "Marketing Manager", "Business Analyst", "Product Manager", "Research Scientist",
        "Sales Executive", "Customer Success Manager", "Financial Analyst", "HR Coordinator",
        "Operations Lead", "IT Support Specialist", "DevOps Engineer", "Content Strategist",
        "Mechanical Engineer", "Project Manager", "Copywriter", "Legal Assistant",
    ]

    # --- People names ---
    given_names = [
        "Alex", "Jamie", "Taylor", "Jordan", "Morgan", "Riley", "Casey", "Chris", "Avery", "Drew",
        "Samantha", "Daniel", "Sophia", "Michael", "Olivia", "Liam", "Isabella", "Noah", "Ethan", "Emma",
    ]
    family_names = [
        "Johnson", "Smith", "Lee", "Patel", "Williams", "Garcia", "Brown", "Davis", "Miller", "Wilson",
        "Martinez", "Anderson", "Clark", "Lopez", "Lewis", "Walker", "Young", "Allen", "King",
    ]

    # --- Random draws ---
    # The order of the draws below is significant: it fixes which value each
    # field receives for a given random seed.
    given = random.choice(given_names)
    family = random.choice(family_names)
    target_role = random.choice(job_titles)
    target_company = random.choice(employers)
    voice = random.choice(voices)
    sector = random.choice(sectors)
    degree = random.choice(degrees)
    guidance = random.choice(guidance_notes)

    skill_count = random.randint(4, 7)
    chosen_skills = random.sample(skill_bank, skill_count)

    years = random.randint(1, 15)
    focus_options = [
        "developing " + target_role.lower() + " solutions for the " + sector + " sector",
        "driving innovation and data-driven decisions",
        "leading multidisciplinary teams in fast-paced environments",
        "managing end-to-end projects and achieving KPIs",
        "improving processes and optimizing efficiency",
    ]
    focus = random.choice(focus_options)

    profile = {}
    profile["name"] = given + " " + family
    profile["role"] = target_role
    profile["company"] = target_company
    profile["industry"] = sector
    profile["education_level"] = degree
    profile["experience_summary"] = f"{years} years of experience {focus}."
    profile["skills_list"] = chosen_skills
    profile["tone"] = voice
    profile["extra"] = guidance
    return profile


def build_cover_letter_prompt(applicant: dict) -> str:
    """Render the (English) instruction prompt for one applicant.

    Args:
        applicant: Profile dict. The keys read are ``name``, ``role``,
            ``company``, ``experience_summary``, ``skills_list``, ``tone`` and
            ``extra``; each one is optional and has a default.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    joined_skills = ", ".join(applicant.get("skills_list", []))
    if not joined_skills:
        joined_skills = "N/A"
    guidance = applicant.get("extra", "")

    sections = [
        "You are an expert career advisor and professional cover letter writer.",
        "",
        "Write a concise, persuasive English cover letter (job application / cover letter) using the information below.",
        "Return only the letter body (no metadata or commentary).",
        "",
        "Applicant:",
        f"- Name: {applicant.get('name', 'Applicant')}",
        f"- Target role: {applicant.get('role', '')}",
        f"- Target company: {applicant.get('company', '')}",
        f"- Short experience summary: {applicant.get('experience_summary', '')}",
        f"- Key skills: {joined_skills}",
        f"- Tone: {applicant.get('tone', 'professional')}",
        f"- Extra: {guidance}",
        "",
        "Requirements:",
        "- Length: 150-300 words.",
        "- Include a tailored opening line mentioning the company and role.",
        "- Include 2-3 sentences highlighting accomplishments/results (use approximate numbers if necessary).",
        "- Use a clear closing paragraph inviting next steps and a polite sign-off.",
        "- Do not include real contact details; use placeholders if needed.",
        "",
        'Begin the letter with an appropriate greeting (e.g., "Dear Hiring Manager," or "Dear <Company> Recruiting Team,").',
    ]
    # Leading/trailing newlines mirror a triple-quoted template; they are
    # removed again by the final strip().
    raw_prompt = "\n" + "\n".join(sections) + "\n"
    return textwrap.dedent(raw_prompt).strip()


# Model loading (local) 
def load_model_local(
    model_name: str,
    hf_token: Optional[str] = None,
    use_8bit: bool = True,
    use_4bit: bool = False,
) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a causal LM and its tokenizer from the Hugging Face Hub.

    If quantization is requested and that load fails, one non-quantized load is
    attempted as a fallback. If no quantization was requested, the model is
    loaded once and a failure is reported immediately (no identical retry).

    Args:
        model_name: Hub repo id, e.g. "meta-llama/Llama-2-7b-chat-hf" or
            "tiiuae/falcon-7b-instruct".
        hf_token: Hugging Face access token. When falsy, the
            ``HUGGINGFACE_TOKEN`` environment variable is used instead.
        use_8bit: Request 8-bit loading (``load_in_8bit=True``).
        use_4bit: Request 4-bit loading (``load_in_4bit=True``); takes
            precedence over ``use_8bit`` when both are set.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        RuntimeError: If the final (non-quantized) load fails. The underlying
            exception is attached as ``__cause__``.
    """
    # Token fallback: explicit argument first, then the environment.
    hf_token = hf_token or os.getenv("HUGGINGFACE_TOKEN")

    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code at load time; only use with repositories you trust.
    # NOTE(review): `use_auth_token` and the `load_in_8bit` / `load_in_4bit`
    # keyword arguments are believed to be deprecated in recent transformers
    # releases in favour of `token` and `quantization_config` -- confirm
    # against the installed version before migrating.
    LOG.info("Cargando tokenizer para %s ...", model_name)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=False,
        trust_remote_code=True,
        use_auth_token=hf_token,
    )

    # Keyword arguments shared by every model-loading attempt.
    common_kwargs = {
        "device_map": "auto",
        "trust_remote_code": True,
        "use_auth_token": hf_token,
    }
    # Half precision only makes sense when a CUDA device is present.
    plain_dtype = torch.float16 if torch.cuda.is_available() else None

    quantized = bool(use_4bit or use_8bit)
    if quantized:
        try:
            if use_4bit:
                LOG.info("Intentando cargar en 4-bit (bitsandbytes)...")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    load_in_4bit=True,
                    torch_dtype=torch.float16,
                    **common_kwargs,
                )
            else:
                LOG.info("Intentando cargar en 8-bit (bitsandbytes)...")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    load_in_8bit=True,
                    **common_kwargs,
                )
        except Exception as quant_err:
            # Best effort: quantized loading can fail for many reasons
            # (missing bitsandbytes, no GPU, ...), so fall through to the
            # plain load below instead of aborting.
            LOG.warning("Carga con quantización falló: %s", quant_err)
            LOG.info("Intentando carga sin quantización (fallback)...")
        else:
            LOG.info("Modelo cargado correctamente: %s", model_name)
            return model, tokenizer
    else:
        LOG.info("Cargando en precisión normal (puede OOM)...")

    # Plain load: either the caller asked for it, or quantization failed.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=plain_dtype,
            **common_kwargs,
        )
    except Exception as load_err:
        if quantized:
            LOG.error("Carga fallback falló: %s", load_err)
        else:
            LOG.error("Carga sin quantización falló: %s", load_err)
        raise RuntimeError(
            f"Fallo al cargar modelo local `{model_name}`: {load_err}"
        ) from load_err

    if quantized:
        LOG.info("Modelo cargado en fallback (sin quantización).")
    else:
        LOG.info("Modelo cargado correctamente: %s", model_name)
    return model, tokenizer


#  Generation helper 
def generate_with_model(
    applicant: dict,
    model,
    tokenizer,
    temperature: float = 0.7,
    max_new_tokens: int = 400,
) -> str:
    """Generate a cover letter with an already-loaded model/tokenizer pair.

    Args:
        applicant: Profile dict passed to ``build_cover_letter_prompt`` (keys
            such as name, role, company, experience_summary, skills_list,
            tone, extra).
        model: Loaded causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        temperature: Sampling temperature.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated text with the prompt removed and whitespace stripped.
    """
    prompt = build_cover_letter_prompt(applicant)
    encoded = tokenizer(prompt, return_tensors="pt")

    # Put the inputs on the device the model reports; if the model has no
    # usable `device` attribute, use CUDA when available, otherwise the CPU.
    default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    try:
        device = getattr(model, "device", default_device)
    except Exception:
        device = default_device

    inputs = {key: tensor.to(device) for key, tensor in encoded.items()}

    gen_kwargs = dict(
        temperature=float(temperature),
        top_p=0.95,
        do_sample=True,
        max_new_tokens=int(max_new_tokens),
    )
    try:
        # Preferred path: pass the settings as a GenerationConfig object.
        outputs = model.generate(**inputs, generation_config=GenerationConfig(**gen_kwargs))
    except TypeError:
        # Fallback for models whose generate() rejects `generation_config`.
        outputs = model.generate(**inputs, **gen_kwargs)

    sequence = outputs[0]

    # Drop the prompt at the token level when the output starts with exactly
    # the prompt's token ids. This is more reliable than comparing decoded
    # strings, because decode(encode(prompt)) is not guaranteed to reproduce
    # the prompt byte-for-byte.
    input_ids = inputs.get("input_ids")
    if input_ids is not None:
        prompt_ids = input_ids[0]
        prompt_len = prompt_ids.shape[-1]
        if (
            sequence.shape[-1] >= prompt_len
            and sequence[:prompt_len].tolist() == prompt_ids.tolist()
        ):
            completion = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
            return completion.strip()

    # Fallback: decode everything and strip an echoed prompt if present.
    text = tokenizer.decode(sequence, skip_special_tokens=True)
    if text.startswith(prompt):
        text = text[len(prompt):].strip()
    return text.strip()


#  Requested wrappers
# Cache of loaded (model, tokenizer) pairs so repeated calls do not reload the
# weights. Keys combine the model name with the quantization flags.
_MODEL_CACHE = {}


def _generate_cover_local(
    applicant: dict,
    *,
    secrets_path: str,
    model_name: Optional[str],
    secrets_key: str,
    default_model: str,
    use_8bit: bool,
    use_4bit: bool,
    temperature: float,
    max_new_tokens: int,
) -> str:
    """Shared implementation behind the public local-generation wrappers.

    Resolves the token and model name, loads the model (or reuses the cached
    instance) and delegates the actual generation to ``generate_with_model``.
    """
    secrets = load_secrets(secrets_path)
    hf_token = secrets.get("HUGGINGFACE_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")

    # Precedence: explicit argument, then secrets file, then built-in default.
    resolved_name = model_name or secrets.get(secrets_key) or default_model

    # The key deliberately omits which wrapper was called, so the same
    # checkpoint requested through either wrapper is loaded only once.
    cache_key = f"{resolved_name}::8b{use_8bit}::4b{use_4bit}"
    cached = _MODEL_CACHE.get(cache_key)
    if cached is None:
        cached = load_model_local(
            resolved_name,
            hf_token=hf_token,
            use_8bit=use_8bit,
            use_4bit=use_4bit,
        )
        _MODEL_CACHE[cache_key] = cached
    model, tokenizer = cached

    return generate_with_model(
        applicant,
        model,
        tokenizer,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )


def generate_cover_llama_local(
    applicant: dict,
    secrets_path: str = "secrets.json",
    model_name: Optional[str] = None,
    use_8bit: bool = True,
    use_4bit: bool = False,
    temperature: float = 0.7,
    max_new_tokens: int = 400,
) -> str:
    """Generate a cover letter with a locally loaded LLaMA model.

    Args:
        applicant: Applicant profile dict used to build the prompt.
        secrets_path: JSON file that may provide ``HUGGINGFACE_TOKEN`` and
            ``LLAMA_MODEL``.
        model_name: Model repo id; when omitted, ``LLAMA_MODEL`` from the
            secrets file is used, else "meta-llama/Llama-2-7b-chat-hf".
        use_8bit: Request 8-bit loading.
        use_4bit: Request 4-bit loading.
        temperature: Sampling temperature.
        max_new_tokens: Maximum number of generated tokens.

    Returns:
        The generated cover letter text.
    """
    return _generate_cover_local(
        applicant,
        secrets_path=secrets_path,
        model_name=model_name,
        secrets_key="LLAMA_MODEL",
        default_model="meta-llama/Llama-2-7b-chat-hf",
        use_8bit=use_8bit,
        use_4bit=use_4bit,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )


def generate_cover_hf_local(
    applicant: dict,
    secrets_path: str = "secrets.json",
    model_name: Optional[str] = None,
    use_8bit: bool = True,
    use_4bit: bool = False,
    temperature: float = 0.7,
    max_new_tokens: int = 400,
) -> str:
    """Generate a cover letter with a locally loaded Hugging Face model.

    Identical to ``generate_cover_llama_local`` except for the defaults: the
    model name comes from ``HF_MODEL`` in the secrets file, else
    "tiiuae/falcon-7b-instruct".

    Args:
        applicant: Applicant profile dict used to build the prompt.
        secrets_path: JSON file that may provide ``HUGGINGFACE_TOKEN`` and
            ``HF_MODEL``.
        model_name: Model repo id overriding the secrets file and the default.
        use_8bit: Request 8-bit loading.
        use_4bit: Request 4-bit loading.
        temperature: Sampling temperature.
        max_new_tokens: Maximum number of generated tokens.

    Returns:
        The generated cover letter text.
    """
    return _generate_cover_local(
        applicant,
        secrets_path=secrets_path,
        model_name=model_name,
        secrets_key="HF_MODEL",
        default_model="tiiuae/falcon-7b-instruct",
        use_8bit=use_8bit,
        use_4bit=use_4bit,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )


In [None]:
#  Usage example
# Hand-written applicant profile used to smoke-test both wrappers below.

sample_applicant = {
    "name": "Alex Johnson",
    "role": "Data Scientist",
    "company": "Acme Tech",
    "experience_summary": "3 years building production ML pipelines and improving prediction accuracy for customer churn.",
    "skills_list": ["Python", "pandas", "scikit-learn", "SQL", "model deployment"],
    "tone": "professional and confident",
    "extra": "Emphasize cross-team collaboration and measurable impact."
}

# Generate with the local LLaMA wrapper (requests 8-bit loading).
# Failures are logged instead of raised so the second example still runs.
try:
    llm_cover = generate_cover_llama_local(sample_applicant, use_8bit=True, temperature=0.7)
    # Only the first 2000 characters are printed.
    print("\n--- LLaMA COVER ---\n", llm_cover[:2000])
except Exception as e:
    LOG.error("LLaMA generation failed: %s", e)

# Generate with the generic HF wrapper (e.g. Falcon), at a higher temperature.
# NOTE(review): the model loaded above stays in _MODEL_CACHE, so both models
# are held at once after this cell -- confirm this fits the available memory.
try:
    hf_cover = generate_cover_hf_local(sample_applicant, use_8bit=True, temperature=0.9)
    # Only the first 2000 characters are printed.
    print("\n--- HF COVER ---\n", hf_cover[:2000])
except Exception as e:
    LOG.error("HF generation failed: %s", e)


In [None]:
import pandas as pd
from tqdm import tqdm

def generate_cover_dataset(
    n: int = 10,
    model_type: str = "llama",  # or "hf"
    temperature: float = 0.7,
    use_8bit: bool = True,
    use_4bit: bool = False,
    secrets_path: str = "secrets.json",
    save_path: Optional[str] = None,
    max_new_tokens: int = 400,
) -> pd.DataFrame:
    """Generate a dataset of random applicants, their prompts and cover letters.

    Args:
        n: Number of rows to generate.
        model_type: "llama" (case-insensitive) selects
            ``generate_cover_llama_local``; any other value selects
            ``generate_cover_hf_local`` (a warning is logged unless it is
            "hf").
        temperature: Sampling temperature forwarded to the generator.
        use_8bit: Request 8-bit model loading.
        use_4bit: Request 4-bit model loading.
        secrets_path: Path of the secrets JSON forwarded to the generator.
        save_path: When truthy, the DataFrame is also written there as CSV
            (without the index).
        max_new_tokens: Maximum number of generated tokens per letter.

    Returns:
        A DataFrame with one row per applicant: the applicant fields plus the
        ``prompt`` and ``cover_letter`` columns. When generation fails for a
        row, ``cover_letter`` holds ``"[ERROR] <message>"`` instead.
    """
    LOG.info("Generating %s cover letters using model_type=%s...", n, model_type)

    # The backend choice does not change between rows, so resolve it once.
    backend = model_type.lower()
    if backend == "llama":
        generate = generate_cover_llama_local
    else:
        if backend != "hf":
            LOG.warning("Unknown model_type=%r; using the generic HF wrapper", model_type)
        generate = generate_cover_hf_local

    rows = []
    for _ in tqdm(range(n), desc="Generating covers"):
        applicant = random_applicant()
        # Stored alongside the letter so each row records the prompt built
        # from its applicant profile.
        prompt = build_cover_letter_prompt(applicant)

        try:
            letter = generate(
                applicant,
                secrets_path=secrets_path,
                use_8bit=use_8bit,
                use_4bit=use_4bit,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
            )
        except Exception as e:
            # Best effort: one failed sample must not abort the whole run.
            LOG.error("Error generating letter: %s", e)
            letter = f"[ERROR] {e}"

        rows.append({**applicant, "prompt": prompt, "cover_letter": letter})

    df = pd.DataFrame(rows)
    LOG.info(" Dataset generated with %d samples", len(df))

    if save_path:
        df.to_csv(save_path, index=False)
        LOG.info("Dataset saved to %s", save_path)

    return df
