# Library 📚

In [1]:
import os, time,csv,re, unicodedata, pandas as pd
from typing import List, Optional
from dotenv import load_dotenv

from urllib.parse import urlparse
import trafilatura, requests
from bs4 import BeautifulSoup

from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from textwrap import wrap

In [2]:
try:
    from langchain_mistralai import ChatMistralAI   # new dedicated package
except ImportError:
    # fall back to the legacy import location if needed
    from langchain_community.chat_models import ChatMistralAI

# Mini LLM Mistral

In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Legacy variable name; the assignment is kept in case code outside this view reads it.
Mistral_key = os.getenv("Mistral_key")
LLM_MODEL = 'open-mixtral-8x7b'

# Deterministic (temperature 0) chat client shared by both chains.
# MISTRAL_API_KEY is preferred; the legacy "Mistral_key" entry, which used to
# be read and then ignored, is now used as a fallback.
llm = ChatMistralAI(
    model=LLM_MODEL,
    temperature=0,
    api_key=os.getenv("MISTRAL_API_KEY") or Mistral_key,
    timeout=60,
    max_retries=3,
)

In [4]:
# Closed list of job families a posting title can be assigned to.
CATEGORIES = [
    "Data Analyst",
    "Data Scientist",
    "Data Engineer",
    "Analytics Engineer",
    "Consultant Data",
    "Autre",
]


# Structured output of the title-classification chain.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies class docstrings into the JSON schema, which would alter the
# format instructions sent to the model.
class ClassifOut(BaseModel):
    # Category label; the field description enumerates CATEGORIES.
    category: str = Field(
        description="Une des cat√©gories: " + ", ".join(CATEGORIES),
    )
    # Optional short justification for the chosen category.
    reasoning: Optional[str] = Field(
        default=None,
        description="Raison courte",
    )


# Structured output of the skill-extraction chain.
class SkillsOut(BaseModel):
    # Technical skill names found in the posting text.
    skills: List[str] = Field(
        description="Comp√©tences techniques normalis√©es (Python, SQL, Airflow, dbt, Spark, Power BI‚Ä¶)",
    )


# Parsers that turn the model's JSON answer into the models above.
classif_parser = PydanticOutputParser(pydantic_object=ClassifOut)
skills_parser = PydanticOutputParser(pydantic_object=SkillsOut)

## Prompts 🤖

In [5]:
# System message: states the task and injects the allowed category names.
_classif_system_msg = "".join([
    "Tu classes des intitul√©s de poste FR/EN en cat√©gories fixes. ",
    "Cat√©gories valides: " + ", ".join(CATEGORIES) + ". ",
    "Choisis UNE cat√©gorie la plus appropri√©e. R√©ponds en JSON strict.",
])

# Human message: carries the title, a reminder of what each category covers,
# and the parser's format instructions ({title} / {format_instructions} are
# template variables filled at invoke time).
_classif_human_msg = "".join([
    "Intitul√©: {title}\n\nRappels: Data Analyst (analyste/BI/reporting), ",
    "Data Scientist (ML/IA/recherche), Data Engineer (pipelines/ETL/Spark/warehouse), ",
    "Analytics Engineer (dbt/semantic/BI engineering), Consultant Data (consultant sp√©cialis√© data), ",
    "Autre sinon.\n\n{format_instructions}",
])

# Prompt used to classify a job title into one of CATEGORIES.
classif_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _classif_system_msg),
        ("human", _classif_human_msg),
    ]
)

# System message: restricts the answer to technical skills explicitly present
# in the text and asks for normalised names.
_skills_system_msg = "".join([
    "Tu re√ßois un texte d'offre (FR/EN). ",
    "Liste UNIQUEMENT les comp√©tences TECHNIQUES explicitement pr√©sentes dans le texte. ",
    "N'invente rien. Normalise les noms (ex: 'scikit learn' ‚Üí 'scikit-learn'). ",
    "R√©ponds en JSON strict.",
])

# Human message: the posting text followed by the parser's format instructions
# ({job_text} / {format_instructions} are template variables).
_skills_human_msg = "Texte:\n\n{job_text}\n\n{format_instructions}"

# Prompt used to extract technical skills from a job posting.
skills_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _skills_system_msg),
        ("human", _skills_human_msg),
    ]
)

## Chain 🔗

In [6]:
# prompt -> chat model -> pydantic parser, composed with Runnable.pipe
# (the method form of the `|` operator).
classif_chain = classif_prompt.pipe(llm, classif_parser)
skills_chain = skills_prompt.pipe(llm, skills_parser)

## LLM Calls ☎️

In [7]:
# Canonical (lower-case) skill names recognised by the keyword rules.
SKILL_LIST = [
    "python","sql","excel","tableau","power bi","looker","metabase",
    "airflow","dbt","spark","hadoop","databricks","kafka",
    "scikit-learn","pandas","numpy","pytorch","tensorflow","keras",
    "docker","kubernetes","git","mlflow","evidently","jenkins",
    "snowflake","redshift","bigquery","postgres","mongodb",
    "nlp","computer vision","xgboost","lightgbm",
    "aws","gcp","azure",
]
# One case-insensitive, word-bounded pattern per SKILL_LIST entry (same order).
_skill_patterns = [re.compile(rf"\b{re.escape(s)}\b", re.I) for s in SKILL_LIST]
# Alternative spellings mapped to their canonical name.
NORMALIZE = {"scikit learn": "scikit-learn", "powerbi": "power bi"}
# Patterns for the alternative spellings, so the keyword rules can detect
# them too; without these the NORMALIZE lookup in the rules never fires.
_alias_patterns = [
    (re.compile(rf"\b{re.escape(alias)}\b", re.I), canonical)
    for alias, canonical in NORMALIZE.items()
]


def extract_skills(
    job_text: str,
    *,
    chunk_size: int = 6000,
    max_chunks: int = 2,
    max_skills: int = 25,
) -> list:
    """Extract technical skills from a job posting.

    Keyword rules run first and guarantee a baseline. For texts of at least
    200 characters the LLM chain is then queried on the first ``max_chunks``
    chunks of ``chunk_size`` characters to complete the list.

    Args:
        job_text: Posting text (may be empty or None).
        chunk_size: Maximum chunk width passed to ``textwrap.wrap``.
        max_chunks: Number of leading chunks sent to the LLM (bounds cost/time).
        max_skills: Maximum number of skills returned.

    Returns:
        Skills deduplicated case-insensitively: rule-based hits first (sorted),
        then the additional LLM hits (sorted), truncated to ``max_skills``.
    """
    # 1) Rules first (guarantees a minimum).
    base = extract_skills_rules(job_text)

    # 2) Short text: the rules are enough, skip the LLM.
    if not job_text or len(job_text) < 200:
        return base

    # 3) LLM on a bounded number of chunks to complete the list.
    llm_skills = set()
    for chunk in wrap(job_text, chunk_size)[:max_chunks]:
        try:
            out = skills_chain.invoke({
                "job_text": chunk,
                "format_instructions": skills_parser.get_format_instructions()
            })
        except Exception as exc:
            # Best effort: one failing chunk must not abort the extraction,
            # but the failure is reported instead of being hidden.
            print(f"  [skills] LLM err: {type(exc).__name__}: {exc}")
            continue
        for raw in out.skills:
            name = raw.strip()
            if name:
                llm_skills.add(NORMALIZE.get(name.lower(), name))

    # 4) Union + case-insensitive dedup (rule hits win) + limit.
    final = []
    seen = set()
    for name in base + sorted(llm_skills):
        key = name.lower()
        if key and key not in seen:
            seen.add(key)
            final.append(name)
    return final[:max_skills]


def extract_skills_rules(text: str) -> list:
    """Extract skills by keyword search.

    Both the canonical names in SKILL_LIST and the alternative spellings in
    NORMALIZE are searched (case-insensitive, whole words).

    Args:
        text: Text to scan; None or a non-string value is treated as empty.

    Returns:
        Sorted list of unique canonical skill names found in ``text``.
    """
    haystack = text if isinstance(text, str) else ""
    found = {
        skill
        for skill, pattern in zip(SKILL_LIST, _skill_patterns)
        if pattern.search(haystack)
    }
    # Alternative spellings ("PowerBI", "scikit learn") count as their canonical name.
    found.update(
        canonical
        for pattern, canonical in _alias_patterns
        if pattern.search(haystack)
    )
    return sorted({NORMALIZE.get(name.lower(), name) for name in found})


In [8]:
def extract_skills_chunked(job_text: str) -> list:
    """Extract skills with keyword rules plus the LLM chain on leading chunks.

    Unlike the plain set union used before, names are deduplicated
    case-insensitively (so "Python" and "python" are not both returned) and
    LLM names go through NORMALIZE.

    Args:
        job_text: Posting text; empty or None yields an empty list.

    Returns:
        Up to 25 skill names, sorted.
    """
    if not job_text:
        return []
    CHUNK = 6000
    # Rule-based hits on the whole text, keyed by lower-case name so they take
    # precedence over case variants coming from the LLM.
    merged = {name.lower(): name for name in extract_skills_rules(job_text)}
    for chunk in wrap(job_text, CHUNK)[:2]:  # at most 2 chunks for speed/cost
        try:
            out = skills_chain.invoke({
                "job_text": chunk,
                "format_instructions": skills_parser.get_format_instructions()
            })
        except Exception as exc:
            # Best effort: report the failure and move on to the next chunk.
            print(f"  [skills] LLM err: {type(exc).__name__}: {exc}")
            continue
        for raw in out.skills:
            name = raw.strip()
            if not name:
                continue
            name = NORMALIZE.get(name.lower(), name)
            merged.setdefault(name.lower(), name)
    return sorted(merged.values())[:25]

In [9]:
def clean_text(t: str) -> str:
    """Collapse every run of whitespace in ``t`` to one space and trim the ends.

    A falsy input (None, "") yields the empty string.
    """
    if not t:
        return ""
    tokens = t.split()
    return " ".join(tokens)

def scrape_url(url: str, verbose: bool = True) -> str:
    """Download a job posting and return its main text, or "" on failure.

    Two extraction strategies are tried on the same HTTP response:
    A) trafilatura on the downloaded HTML;
    B) BeautifulSoup, narrowed with site-specific selectors, as a fallback.
    The page is fetched once; a second request is made only when the first
    one raised (it then acts as a single retry).

    Args:
        url: Page URL; a non-string or blank value returns "" immediately.
        verbose: When True, print one progress/diagnostic line per step.

    Returns:
        Whitespace-normalised text longer than 120 characters, else "".
    """
    if not isinstance(url, str) or not url.strip():
        return ""
    try:
        domain = urlparse(url).netloc.lower()
    except ValueError as e:
        # urlparse rejects some malformed URLs (e.g. broken IPv6 brackets).
        if verbose:
            print(f"  [scrape] url err: {e}")
        return ""

    request_kwargs = {"timeout": 15, "headers": {"User-Agent": "Mozilla/5.0"}}
    response = None

    # --- Attempt A: requests -> trafilatura.extract (timeout enforced by requests)
    try:
        response = requests.get(url, **request_kwargs)
        if response.ok and response.text:
            extracted = trafilatura.extract(
                response.text,
                include_comments=False,
                include_tables=False,
                favor_recall=True,
            )
            if extracted:
                text = clean_text(extracted)
                if verbose:
                    print(f"  [scrape] {domain} via requests+trafilatura len={len(text)}")
                if len(text) > 120:
                    return text
    except Exception as e:
        if verbose:
            print(f"  [scrape] requests/trafilatura err: {e}")

    # --- Attempt B: BeautifulSoup + site-specific selectors
    # (domain substrings, CSS selector) pairs, applied in order.
    site_selectors = (
        (("welcometothejungle",),
         "article, main, [data-testid='job-body']"),
        (("workable.com",),
         "main, [data-ui='job-attributes'], [data-ui='job-description']"),
        (("smartrecruiters.com",),
         "div#jobad, div#st-jo"),
        (("oraclecloud.com", "workday"),
         "section[data-automation-id='jobPostingDescription'], section[data-automation-id='jobPostingHeader'], main, article"),
    )
    try:
        if response is None:
            # The first request raised before returning anything: retry once.
            response = requests.get(url, **request_kwargs)
        if response.ok:
            soup = BeautifulSoup(response.text, "html.parser")

            # Narrow the tree to the posting container when the site is known.
            for needles, selector in site_selectors:
                if any(needle in domain for needle in needles):
                    main = soup.select_one(selector)
                    if main is not None:
                        soup = main

            # Generic cleanup of non-content elements.
            for tag in soup(["script","style","noscript","header","footer","nav"]):
                tag.decompose()

            text = clean_text(soup.get_text(" "))
            if verbose:
                print(f"  [scrape] {domain} via bs4 len={len(text)}")
            if len(text) > 120:
                return text
    except Exception as e:
        if verbose:
            print(f"  [scrape] bs4 err: {e}")

    if verbose:
        print(f"  [scrape] {domain} ‚Üí texte insuffisant/indisponible")
    return ""




In [10]:
def strip_accents(s: str) -> str:
    """Return ``s`` lower-cased with combining accent marks removed.

    Non-string input yields the empty string.
    """
    if not isinstance(s, str):
        return ""
    decomposed = unicodedata.normalize("NFD", s)
    # After NFD decomposition, accents are separate "Mn" (non-spacing mark) code points.
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept).lower()

def classify_title(title: str):
    """Classify a job title into one of CATEGORIES.

    Fast keyword heuristics are tried first; the LLM chain is only called
    when none of them matches.

    Args:
        title: Raw job title (FR/EN); may be empty or None.

    Returns:
        ClassifOut whose ``reasoning`` is "heuristique" for a keyword match,
        the model's reasoning for an LLM answer, or "err:<ExceptionName>"
        when the LLM call failed (category "Autre" in that case).
    """
    # Lower-case, accent-free, with punctuation collapsed to single spaces so
    # that "Chargé d'études" or "Data-Engineer" match the keywords below.
    t = re.sub(r"[^a-z0-9]+", " ", strip_accents(title)).strip()

    # Quick heuristics, most specific first.
    if "analytics engineer" in t or ("analytics" in t and "engineer" in t) or "ingenieur analytics" in t:
        pre = "Analytics Engineer"
    elif "data engineer" in t or "ingenieur donnees" in t or "ingenieur data" in t:
        pre = "Data Engineer"
    elif "data scientist" in t or "scientifique des donnees" in t or "ml engineer" in t:
        pre = "Data Scientist"
    elif "data analyst" in t or "analyste" in t or "charge d etudes" in t or "bi analyst" in t or "business analyst" in t:
        pre = "Data Analyst"
    elif "consultant data" in t or "consultant ia" in t or "consultant bi" in t or "consultant analytics" in t:
        pre = "Consultant Data"
    else:
        pre = None

    if pre:
        return ClassifOut(category=pre, reasoning="heuristique")

    # Otherwise ask the LLM.
    try:
        out = classif_chain.invoke({
            "title": title or "",
            "format_instructions": classif_parser.get_format_instructions()
        })
        # The schema types category as a free string: map it back onto the
        # known labels (case-insensitively) and fall back to "Autre" otherwise.
        known = {label.lower(): label for label in CATEGORIES}
        label = known.get((out.category or "").strip().lower(), "Autre")
        return ClassifOut(category=label, reasoning=out.reasoning)
    except Exception as e:
        # Best effort: never raise, but keep a trace of what went wrong.
        return ClassifOut(category="Autre", reasoning=f"err:{type(e).__name__}")


# Pipeline 👨‍🔧

In [11]:
def _cell_to_str(value) -> str:
    """Return a cell value as text, mapping missing values (None/NaN) to ""."""
    if value is None:
        return ""
    if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _save_enriched_csv(df, output_csv: str) -> None:
    """Write ``df`` to ``output_csv``, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    # Work on a copy so that saving never alters the frame being processed.
    export = df.copy()
    for col in ["Comp√©tences (LLM)", "Offre (texte)"]:
        if col in export.columns:
            # Force text, but write missing values as empty cells rather than
            # the literal strings "None" / "nan".
            export[col] = export[col].map(_cell_to_str)
    export.to_csv(
        output_csv,
        index=False,
        quoting=csv.QUOTE_MINIMAL,
        escapechar="\\",
        lineterminator="\n",
        encoding="utf-8",
    )


def run_pipeline(
    input_csv: str,
    output_csv: str = "data/suivi_candidatures_enrichi_mistral.csv",
    classify_all: bool = True,
    scrape_and_skills: bool = True,
    sleep_s: float = 0.6
):
    """Enrich an application-tracking CSV with a job category and skills.

    For each row the title ("Poste") is classified, and, when a link ("Lien")
    is present, the posting is scraped and its skills are extracted. The
    result is saved every 10 rows and once more at the end.

    Args:
        input_csv: Path of the CSV to read; must contain a "Poste" column.
        output_csv: Path of the enriched CSV to write.
        classify_all: When True, classify every row; otherwise only rows
            whose category cell is empty.
        scrape_and_skills: When True, scrape links and extract skills.
        sleep_s: Pause in seconds after each classification / scraping step.

    Raises:
        ValueError: If the input has no "Poste" column.
    """
    df = pd.read_csv(input_csv)

    if "Poste" not in df.columns:
        raise ValueError("Colonne 'Poste' requise.")

    for col in ["Cat√©gorie Poste (LLM)", "Cat√©gorie Raison (LLM)", "Offre (texte)", "Comp√©tences (LLM)"]:
        if col not in df.columns:
            df[col] = None
        # A pre-existing empty column is read as float; make it accept text.
        df[col] = df[col].astype(object)

    total_rows = len(df)
    print(f"[INFO] D√©marrage pipeline sur {total_rows} lignes.")

    # `pos` is the 1-based row position (index labels are not assumed to be 0..n-1).
    for pos, (i, row) in enumerate(df.iterrows(), start=1):
        # Missing cells are NaN (truthy): convert explicitly so they become ""
        # instead of the string "nan".
        title = _cell_to_str(row.get("Poste", ""))
        link = _cell_to_str(row.get("Lien", "")).strip()

        print(f"[INFO] Traitement ligne {pos}/{total_rows} ‚Äî Poste: {title[:50]}...")

        # 1) Classification
        if classify_all or pd.isna(row.get("Cat√©gorie Poste (LLM)")):
            try:
                out = classify_title(title)
                df.at[i, "Cat√©gorie Poste (LLM)"] = out.category
                df.at[i, "Cat√©gorie Raison (LLM)"] = out.reasoning or ""
                print(f"  ‚Ü≥ Cat√©gorie: {out.category}")
            except Exception as e:
                df.at[i, "Cat√©gorie Poste (LLM)"] = "Autre"
                df.at[i, "Cat√©gorie Raison (LLM)"] = f"err:{type(e).__name__}"
                print(f"  [ERREUR] Classification: {e}")
            time.sleep(sleep_s)

        # 2) Scraping + skill extraction
        if scrape_and_skills and link:
            print(f"  ‚Ü≥ Scraping URL: {link}")
            text = scrape_url(link)
            df.at[i, "Offre (texte)"] = text

            if text:
                try:
                    skills = extract_skills(text)
                    df.at[i, "Comp√©tences (LLM)"] = ", ".join(skills)
                    print(f"    ‚Ü≥ Comp√©tences trouv√©es: {len(skills)}")
                except Exception as e:
                    df.at[i, "Comp√©tences (LLM)"] = ""
                    print(f"    [ERREUR] Extraction comp√©tences: {e}")
            else:
                print(f"    [INFO] Aucun texte r√©cup√©r√©")
            time.sleep(sleep_s)

        # Intermediate save every 10 rows, so a crash loses little work.
        if pos % 10 == 0:
            print(f"[INFO] Sauvegarde interm√©diaire apr√®s {pos} lignes...")
            _save_enriched_csv(df, output_csv)

    # Final save
    print("[INFO] Sauvegarde finale du CSV...")
    _save_enriched_csv(df, output_csv)
    print(f"[OK] Fichier cr√©√©: {output_csv}")

# Start Button 🏌️‍♂️

In [12]:
# Entry point: run the whole pipeline on the tracking CSV when executed as a script.
if __name__ == "__main__":
    INPUT, OUTPUT = (
        "/teamspace/studios/this_studio/data/suivi_candidatures_V2.csv",
        "/teamspace/studios/this_studio/data/suivi_candidatures_enrichi_mistral.csv",
    )
    run_pipeline(input_csv=INPUT, output_csv=OUTPUT)

[INFO] D√©marrage pipeline sur 281 lignes.
[INFO] Traitement ligne 1/281 ‚Äî Poste: Data Analyst Environnement...
  ‚Ü≥ Cat√©gorie: Data Analyst
  ‚Ü≥ Scraping URL: https://www.welcometothejungle.com/fr/companies/inex-circular/jobs/data-analyst-environnement-cdi_paris_IC_y4Y4m9Z
  [scrape] www.welcometothejungle.com via requests+trafilatura len=4564
    ‚Ü≥ Comp√©tences trouv√©es: 5
[INFO] Traitement ligne 2/281 ‚Äî Poste: Data Engineer Junior...
  ‚Ü≥ Cat√©gorie: Data Engineer
  ‚Ü≥ Scraping URL: https://apply.workable.com/singulier/j/C258FD86BC/?utm_source=linkedin.com
  [scrape] apply.workable.com via bs4 len=32
  [scrape] apply.workable.com ‚Üí texte insuffisant/indisponible
    [INFO] Aucun texte r√©cup√©r√©
[INFO] Traitement ligne 3/281 ‚Äî Poste: Data Analyst...
  ‚Ü≥ Cat√©gorie: Data Analyst
  ‚Ü≥ Scraping URL: https://www.welcometothejungle.com/fr/companies/arcep/jobs/data-analyst-f-h_paris-cedex-12
  [scrape] www.welcometothejungle.com via requests+trafilatura len=2957
    ‚Ü

# Final Result 🏆

In [13]:
# Reload the enriched CSV written by the pipeline and preview the first 50 rows.
mistral_df = pd.read_csv("data/suivi_candidatures_enrichi_mistral.csv")
mistral_df.head(50)

Unnamed: 0,Poste,Date de Soumission,√âtat,Lien,Date de r√©ponse,CV utilis√©,Contact Email,Entreprise,Mois,Cat√©gorie Poste,Cat√©gorie Poste (LLM),Cat√©gorie Raison (LLM),Offre (texte),Comp√©tences (LLM)
0,Data Analyst Environnement,2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV2,,Inex Circular,7.0,Data Analyst,Data Analyst,heuristique,Cette offre n‚Äôest plus disponible. R√©sum√© du p...,"airflow, excel, mongodb, pandas, python"
1,Data Engineer Junior,2025-07-31,Candidature envoy√©e,https://apply.workable.com/singulier/j/C258FD8...,,CV2,,Singulier,7.0,Data Engineer,Data Engineer,heuristique,,
2,Data Analyst,2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV2,,ARCEP,7.0,Data Analyst,Data Analyst,heuristique,R√©sum√© du poste CDD / Temporaire(36 mois) Pari...,"excel, python"
3,Data scientist,2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV2,,Viginum,7.0,Data Scientist,Data Scientist,heuristique,R√©sum√© du poste CDD / Temporaire(36 mois) Pari...,"computer vision, docker, git, nlp, pandas, python"
4,Data Scientist - Artifical Intelligence Projec...,2025-07-31,Refus,https://ekez.fa.em2.oraclecloud.com/hcmUI/Cand...,2025-08-01,CV2,,Natixis,7.0,Data Scientist,Data Scientist,heuristique,,
5,"Data Quality Specialist, AI Tutor - Paris (Fix...",2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV2,,Mistral AI,7.0,Autre,Autre,,CDD / Temporaire Paris T√©l√©travail total Salai...,
6,CDI - Data Analyst - Petit Bateau !,2025-07-31,Candidature envoy√©e,https://www.linkedin.com/jobs/view/4274098673/...,,CV2,,Petit Bateau,7.0,Data Analyst,Data Analyst,heuristique,"We're signing you in Discover people, jobs, an...",
7,Assistant Gestion R√©f√©rentiels et Outils H/F,2025-07-31,Candidature envoy√©e,https://jobs.totalenergies.com/en_US/careers/A...,,CV2,,Total Energies,7.0,Autre,Autre,,- Data Privacy1/4 - Personal Information2/4 - ...,
8,Gestionnaire de donn√©es GED,2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV3,,RESAH,7.0,Autre,Autre,,CDD / Temporaire(1 √† 3 mois) Paris T√©l√©travail...,excel
9,CDD - Analyste Surveillance des Engagements F/H,2025-07-31,Candidature envoy√©e,https://www.welcometothejungle.com/fr/companie...,,CV3,,La Banque Postale,7.0,Data Analyst,Data Analyst,heuristique,CDD / Temporaire(6 mois) Paris T√©l√©travail fr√©...,"excel, power bi, sql"
