# TP4 — Phase 3 : Génération du Dataset via API Infomaniak

**Tâche** : Analyse littéraire de paroles de musique  
**Dataset source** : `brunokreiner/genius-lyrics`  
**Teacher** : `openai/gpt-oss-120b` via API Infomaniak  
**Deux stages** :
- Stage 1 : température basse (τ=0.3) → réponses stables
- Stage 2 : température haute (τ=0.9) → réponses diversifiées

In [None]:
# Installation des dépendances (décommenter si besoin sur Colab)
# !pip install openai datasets

In [None]:
import json
import sys
import time
import random
from pathlib import Path
from datasets import load_dataset
import openai

print('Imports OK')

## Configuration

In [None]:
import os

# SECURITY: the API token must never be committed in the notebook. It is read
# from the INFOMANIAK_API_KEY environment variable instead; any token that was
# previously hard-coded here should be treated as leaked and revoked.
API_KEY      = os.environ.get("INFOMANIAK_API_KEY", "")
if not API_KEY:
    # Warn early instead of discovering the problem after the dataset download.
    print("ATTENTION : variable d'environnement INFOMANIAK_API_KEY absente — les appels API échoueront.")
BASE_URL     = "https://api.infomaniak.com/2/ai/48/openai/v1"
TEACHER_MODEL = "openai/gpt-oss-120b"

# Number of examples requested per stage and the sampling temperature of each.
N_STAGE1   = 150   # Low temperature
N_STAGE2   = 150   # High temperature
TEMP_STAGE1 = 0.3
TEMP_STAGE2 = 0.9

# Lyrics whose character count falls outside this range are discarded.
MIN_LYRICS_LEN = 300
MAX_LYRICS_LEN = 2000

# All generated files are written under this directory.
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Output : {OUTPUT_DIR.resolve()}")

## System Prompt — Analyse littéraire

In [None]:
# System message sent with every teacher request. It asks the model to reason
# inside <reasoning> tags first and then give a concise final analysis.
# Built line by line; empty entries are the blank separator lines.
SYSTEM_PROMPT = "\n".join((
    "You are an expert literary critic specializing in popular music.",
    "When given song lyrics, you must reason through the analysis carefully step by step.",
    "",
    "Always structure your response as follows:",
    "1. First, reason inside <reasoning>...</reasoning> tags where you:",
    "   - Identify the main themes and motifs",
    "   - Analyze the poetic devices (metaphors, repetition, rhyme scheme, imagery)",
    "   - Examine the emotional arc and narrative structure",
    "   - Consider the cultural and artistic context",
    "2. Then provide a concise, well-structured literary analysis as your final answer.",
    "",
    "Be thorough in your reasoning but clear and insightful in your final analysis.",
))

print(SYSTEM_PROMPT)

## Chargement et filtrage du dataset

In [None]:
def load_lyrics_samples(n_total: int, seed: int = 42) -> list[dict]:
    """Load the genius-lyrics dataset and draw a reproducible random subset.

    An example is kept when its ``is_english`` field is truthy, its lyrics
    length (in characters) lies within ``[MIN_LYRICS_LEN, MAX_LYRICS_LEN]``
    and the lyrics contain at least 50 whitespace-separated words.

    Args:
        n_total: Maximum number of samples to return. Fewer are returned when
            not enough examples pass the filter.
        seed: Seed of the private random generator used for the draw.

    Returns:
        A list of dicts with the keys ``lyrics``, ``artist_name`` (defaults to
        ``"Unknown Artist"``) and ``genres`` (defaults to ``[]``).

    Raises:
        ValueError: If ``n_total`` is negative.
    """
    # Fail fast: without this check the error only surfaces in the sampling
    # call, after the whole dataset has been loaded and scanned.
    if n_total < 0:
        raise ValueError(f"n_total must be non-negative, got {n_total}")

    print("Chargement du dataset genius-lyrics...")
    ds = load_dataset("brunokreiner/genius-lyrics", split="train")

    filtered = []
    for ex in ds:
        lyrics = ex.get("lyrics") or ""
        if not ex.get("is_english", False):
            continue
        if not MIN_LYRICS_LEN <= len(lyrics) <= MAX_LYRICS_LEN:
            continue
        if len(lyrics.split()) < 50:
            continue
        filtered.append({
            "lyrics":      lyrics,
            "artist_name": ex.get("artist_name") or "Unknown Artist",
            "genres":      ex.get("genres_list") or [],
        })

    print(f"Exemples éligibles : {len(filtered)}")
    # A dedicated generator yields the same draw as seeding the module-level
    # RNG, without silently resetting the global random state for other code.
    rng = random.Random(seed)
    return rng.sample(filtered, min(n_total, len(filtered)))


# Draw a single pool and cut it in two so the stages never share a song.
# If fewer eligible songs exist than requested, stage 2 is the one that shrinks.
all_samples = load_lyrics_samples(N_STAGE1 + N_STAGE2)
samples_stage1, samples_stage2 = all_samples[:N_STAGE1], all_samples[N_STAGE1:]
print(f"Stage 1 : {len(samples_stage1)} exemples | Stage 2 : {len(samples_stage2)} exemples")

## Fonctions d'appel API Teacher

In [None]:
# OpenAI-compatible client pointed at BASE_URL; used by call_teacher_api.
client = openai.OpenAI(api_key=API_KEY, base_url=BASE_URL)

def build_user_prompt(artist: str, lyrics: str) -> str:
    """Build the user message asking for a literary analysis of ``lyrics``.

    The artist is mentioned in the request unless it is the placeholder
    ``"Unknown Artist"``. The lyrics are fenced between two ``---`` lines.
    """
    if artist == "Unknown Artist":
        byline = ""
    else:
        byline = f" by {artist}"

    request = f"Please provide a literary analysis of the following song lyrics{byline}:"
    fenced_lyrics = f"---\n{lyrics}\n---"
    focus = "Analyze the themes, poetic devices, emotional arc, and narrative structure."
    # The three parts are separated by one blank line each.
    return "\n\n".join((request, fenced_lyrics, focus))


def call_teacher_api(user_prompt: str, temperature: float, max_retries: int = 3) -> dict | None:
    """Request one completion from the teacher model, retrying on failure.

    Args:
        user_prompt: Content of the user message; SYSTEM_PROMPT is sent as the
            system message.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        A dict with the keys ``content`` (always a string, ``""`` when the API
        returned no text), ``logprobs`` (a list of ``{"token", "logprob"}``
        dicts, possibly empty) and ``finish_reason``; or ``None`` when every
        attempt failed.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            response = client.chat.completions.create(
                model=TEACHER_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user",   "content": user_prompt},
                ],
                temperature=temperature,
                max_tokens=2000,
                logprobs=True,
                top_logprobs=1,
            )
            choice = response.choices[0]
            token_entries = choice.logprobs.content if choice.logprobs else None
            logprobs_data = [
                {"token": t.token, "logprob": t.logprob}
                for t in (token_entries or [])
            ]
            return {
                # The message content can be None; normalise it to a string so
                # callers can safely take its length.
                "content":       choice.message.content or "",
                "logprobs":      logprobs_data,
                "finish_reason": choice.finish_reason,
            }
        except openai.RateLimitError as e:
            if is_last_attempt:
                # No attempt is left: waiting again would only delay the failure.
                print(f"  Erreur (tentative {attempt+1}/{max_retries}): {e}")
                break
            wait = 30 * (attempt + 1)  # linear back-off: 30s, 60s, ...
            print(f"  Rate limit. Attente {wait}s...")
            time.sleep(wait)
        except Exception as e:
            # Deliberate best-effort: any other failure is logged and retried.
            print(f"  Erreur (tentative {attempt+1}/{max_retries}): {e}")
            if not is_last_attempt:
                time.sleep(5)
    return None


print("Fonctions API définies.")

## Génération Stage 1 — Température basse (τ = 0.3)

Réponses stables et précises. Utilisées pour ancrer le raisonnement du student.

In [None]:
def generate_stage(samples: list, stage: int, temperature: float) -> list:
    """Generate teacher analyses for ``samples`` and save them to disk.

    For each sample, the teacher is queried through ``call_teacher_api``.
    Samples whose call failed or whose answer is shorter than 200 characters
    are skipped. Two JSON files are written under OUTPUT_DIR:
    ``stage{stage}_raw.json`` (full records including logprobs) and
    ``stage{stage}_llamafactory.json`` (system/human/gpt conversations).

    Args:
        samples: Dicts with the keys ``artist_name``, ``genres`` and ``lyrics``.
        stage: Stage number, used in the log output and in the file names.
        temperature: Sampling temperature passed to the teacher.

    Returns:
        The list of raw records that were kept.
    """
    print(f"\n{'='*50}")
    print(f"STAGE {stage} — température={temperature} — {len(samples)} exemples")
    print(f"{'='*50}")

    results = []
    llamafactory_data = []

    for i, sample in enumerate(samples):
        print(f"[{i+1}/{len(samples)}] {sample['artist_name']}")
        user_prompt = build_user_prompt(sample["artist_name"], sample["lyrics"])
        api_result  = call_teacher_api(user_prompt, temperature)

        # The API may return a message without text (content is None); treat
        # it like a failed call instead of crashing on len(None).
        content = (api_result or {}).get("content") or ""
        if len(content) < 200:
            print("  → Ignoré")
            continue

        results.append({
            "stage":       stage,
            "temperature": temperature,
            "artist_name": sample["artist_name"],
            "genres":      sample["genres"],
            "lyrics":      sample["lyrics"],
            "instruction": user_prompt,
            "response":    content,
            "logprobs":    api_result["logprobs"],
            "finish_reason": api_result["finish_reason"],
        })
        llamafactory_data.append({
            "conversations": [
                {"from": "system", "value": SYSTEM_PROMPT},
                {"from": "human",  "value": user_prompt},
                {"from": "gpt",    "value": content},
            ]
        })
        print(f"  → OK ({len(content)} chars, {len(api_result['logprobs'])} tokens)")
        time.sleep(1)  # pause between successful requests

    # Save both views of the data. The directory is re-created if it vanished
    # during the (long) generation loop so the results are not lost.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    raw_file = OUTPUT_DIR / f"stage{stage}_raw.json"
    lmf_file = OUTPUT_DIR / f"stage{stage}_llamafactory.json"
    for path, payload in ((raw_file, results), (lmf_file, llamafactory_data)):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    print(f"\n✓ {len(results)} exemples sauvegardés → {raw_file}")
    return results


# Run stage 1 on the first slice of samples at the low temperature.
stage1_results = generate_stage(
    samples=samples_stage1,
    stage=1,
    temperature=TEMP_STAGE1,
)

## Génération Stage 2 — Température haute (τ = 0.9)

Réponses plus créatives et diversifiées. Enrichissent la distribution d'entraînement.

In [None]:
# Run stage 2 on the remaining samples at the high temperature.
stage2_results = generate_stage(
    samples=samples_stage2,
    stage=2,
    temperature=TEMP_STAGE2,
)

## Récapitulatif

In [None]:
# Final summary: examples kept per stage, their total, and the output folder.
separator = "=" * 50
count_stage1 = len(stage1_results)
count_stage2 = len(stage2_results)
summary_lines = [
    separator,
    "RÉCAPITULATIF GÉNÉRATION",
    separator,
    f"Stage 1 (τ={TEMP_STAGE1}) : {count_stage1} exemples",
    f"Stage 2 (τ={TEMP_STAGE2}) : {count_stage2} exemples",
    f"Total             : {count_stage1 + count_stage2} exemples",
    f"Fichiers dans     : {OUTPUT_DIR.resolve()}",
    "",
    "Prochaine étape : Phase 4 — DAS filtering (sur Colab avec GPU)",
]
for line in summary_lines:
    print(line)