## generate Q/A in general for all files 

In [None]:
import os
import json
import time
from pathlib import Path
from dotenv import load_dotenv
from groq import Groq


# ============================================
#  FUNCTION TO GENERATE Q&A PAIRS VIA GROQ
# ============================================
def GenerationGroq(search_results, groq_key, add_info="", temperature=0.7, max_tokens=8192):
    """
    Generate question–answer pairs about ENSA Tétouan from a document using Groq GPT-OSS-20B.
    """
    client = Groq(api_key=groq_key)

    prompt = f"""
    You are an assistant generating realistic question–answer pairs for a chatbot about ENSA Tétouan.

    Context document:
    {search_results}

    Your goal:
    - Generate between 5 and 10 diverse and relevant question–answer pairs.
    - Each question should be written naturally as if from a student.
    - Answers must be factual, in French, concise and helpful.
    - {add_info}

    Output format:
    Each line must be a valid JSON object (JSONL) like:
    {{"instruction": "student question", "input": "", "output": "assistant answer"}}

    Important:
    - Do NOT include explanations, markdown, or extra text.
    - Return only valid JSONL lines.
    """

    # === Stream completion ===
    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_completion_tokens=max_tokens,
        top_p=1,
        reasoning_effort="medium",
        stream=True
    )

    # === Collect streamed content ===
    generations = ""
    for chunk in completion:
        content_piece = chunk.choices[0].delta.content or ""
        print(content_piece, end="")  # live feedback
        generations += content_piece

    return generations.strip()


In [8]:
from pathlib import Path
import os

current_path = Path.cwd()

target_file = current_path.parents[0]
print(target_file)

base_path = target_file / "data" / "data_final"
print(base_path)

c:\Users\PC Paradise\Desktop\Work\Projects\ENSATe-chatbot
c:\Users\PC Paradise\Desktop\Work\Projects\ENSATe-chatbot\data\data_final


In [None]:
from dotenv import load_dotenv 

load_dotenv()
GROQ_KEY = os.getenv("groq_api")
add_info = ""
# === Main Loop ===
all_pairs = []

for folder in os.listdir(base_path):
    folder_path = base_path / folder
    if not folder_path.is_dir():
        continue

    for file in os.listdir(folder_path):
        path = folder_path / file

        if not path.is_file():
            continue

        try:
            # Handle both JSON and TXT files
            if path.suffix == ".json":
                with open(path, "r", encoding="utf-8") as f:
                    content = json.load(f)
            else:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()

            # Call your model/generation function
            print(f"generating for : {path.name}")
            result = GenerationGroq("", content, GROQ_KEY, add_info)

            # Parse generated JSON lines safely
            for line in result.splitlines():
                try:
                    pair = json.loads(line)
                    all_pairs.append(pair)
                except json.JSONDecodeError:
                    continue

        except Exception as e:
            print(f"Error reading {path.name}: {e}")
            continue

print(f"Total pairs collected: {len(all_pairs)}")


generating for : calendrier-printemps-2025
generating for : les clubs de lecole.txt
generating for : commission Culturelle et Activités Sportive issue du Conseil d’établissement.txt
generating for : Commission de coordination et de suivi du budget issue du conseil d’établissement.txt
generating for : Commission de Recherche Scientifique et Coopération issue du Conseil d’Établissement.txt
generating for : commission pédagogique issue du Conseil d’Établissement.txt
generating for : Commission.txt
generating for : Membres Conseil Etablissement.txt
generating for : departement-genie-informatique.txt
generating for : departement-humanites.txt
generating for : departement-ingenierie-et-technologie-des-systemes-industriels-itsi.txt
generating for : departement-intelligence-artificielle-et-digitalisation-iad.txt
generating for : departement-sciences-et-technologies-industrielles-et-civiles-stic.txt
generating for : departement-sciences-mathematiques-et-aide-a-la-decision.txt
generating for : d

In [11]:
# Save all pairs
with open("school_dataset.jsonl", "w", encoding="utf-8") as f:
    for ex in all_pairs:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")


## generate Q/A for emaloi du temps (schedules)

In [None]:
# ============================================
#  MAIN SCRIPT
# ============================================
if __name__ == "__main__":
    load_dotenv()
    GROQ_KEY = os.getenv("groq_api")

    if not GROQ_KEY:
        raise ValueError("Missing 'groq_api' in .env file")

    add_info = """La première question doit toujours être:
    "Quel est l'emploi du temps pour la filière X",
    où X est une des filières à l'ENSA Tétouan.
    La réponse doit toujours mentionner "pour l'année scolaire 2023-2024"."""

    current_path = Path.cwd()
    base_path = current_path.parents[0] / "data" / "data_final"
    folder_path = base_path / "emploi-temps"
    output_path = "generated_pairs.jsonl"

    WAIT_TIME = 2
    MAX_RETRIES = 3
    all_pairs = []

    print(f"Generating Q&A pairs from folder: {folder_path}\n")

    for file in os.listdir(folder_path):
        path = folder_path / file
        if not path.is_file():
            continue
        try:
            if path.suffix == ".json":
                with open(path, "r", encoding="utf-8") as f:
                    content = json.dumps(json.load(f), ensure_ascii=False)
            else:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()

            print(f"\nGenerating for: {path.name}\n")

            for attempt in range(1, MAX_RETRIES + 1):
                try:
                    result = GenerationGroq(content, GROQ_KEY, add_info)
                    break
                except Exception as api_err:
                    print(f"Attempt {attempt}/{MAX_RETRIES} failed: {api_err}")
                    time.sleep(WAIT_TIME * attempt)
            else:
                print(f"Failed to generate for {path.name} after {MAX_RETRIES} retries.")
                continue

            # === Parse JSON lines ===
            valid_lines = 0
            for line in result.splitlines():
                line = line.strip()
                if not line:
                    continue
                try:
                    pair = json.loads(line)
                    all_pairs.append(pair)
                    valid_lines += 1
                except json.JSONDecodeError:
                    print(f"Skipped invalid JSON line: {line[:100]}...")
                    continue

            print(f"Parsed {valid_lines} valid pairs from {path.name}")
            time.sleep(WAIT_TIME)

        except Exception as e:
            print(f"Error reading {path.name}: {e}")
            continue

    # === Save dataset ===
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nTotal pairs collected: {len(all_pairs)}")
    print(f"Saved dataset to: {output_path}")


Generating Q&A pairs from folder: c:\Users\PC Paradise\Desktop\Work\Projects\ENSATe-chatbot\data\data_final\emploi-temps


Generating for: emploi du temps-1ere-annee-preparatoire-section-1.json

{"instruction":"Quel est l'emploi du temps pour la filière Informatique","input":"","output":"Pour l'année scolaire 2023-2024, l'emploi du temps général à l'ENSA Tétouan se présente comme suit : Lundi : Algèbre 2 (08:30-10:00, Amphi 1), Chimie générale (10:30-12:00, Amphi 1), Analyse (14:30-16:00, Amphi 2). Mardi : Anglais 1 (08:30-10:00, Salle 1), Anglais 1 (08:30-10:00, Salle 2), Algorithmique (10:30-12:00, Amphi 1, prof. A. Tahiri), Chimie (14:30-16:00, Salle 2). Mercredi : Analyse 2 (08:30-10:00, Amphi 1, prof. Filali), Thermodynamique et Statique des Fluides (10:30-12:00, Amphi 1, prof. El Khannoussi), Algèbre (14:30-16:00, Amphi 2, GR2), Activités des étudiants (14:00-18:00). Jeudi : Culture digitale (08:30-10:00, Amphi 1, prof. Idaomar), Anglais 1 (10:30-12:00, Salle 2, GR3), Thermo. (10

## generate Q/A in general for each split of files 

In [65]:
import os
import json
import time
import textwrap
from pathlib import Path
from dotenv import load_dotenv
from groq import Groq

# =====================================
#          SETTINGS
# =====================================
load_dotenv()

# List of multiple API keys (from your .env or hard-coded)
API_KEYS = [
    os.getenv("groq_api_1"),
    os.getenv("groq_api_2"),
    os.getenv("groq_api_3"),
    os.getenv("groq_api_4"),
    os.getenv("groq_api_5"),
    os.getenv("groq_api_6"),
    os.getenv("groq_api_7"),
    os.getenv("groq_api_8"),
    os.getenv("groq_api_9"),

]
KEY_INDEX = 0  # start with the first key

current_path = Path.cwd()
target_file = current_path.parents[0]

BASE_PATH = target_file / "data" / "data_final"
OUTPUT_FILE = "all_generated_pairs.jsonl"
ADD_INFO = ""
MAX_CONTEXT_CHARS = 6000
MAX_BATCHES_PER_FILE = 3
SLEEP_BETWEEN_CALLS = 5
RETRY_DELAY = 20  # seconds if all keys fail



In [66]:
# =====================================
#  Helper: Get current API key
# =====================================
def get_current_key():
    global KEY_INDEX
    return API_KEYS[KEY_INDEX]



In [67]:

# =====================================
#  Helper: Rotate to next API key
# =====================================
def rotate_key():
    global KEY_INDEX
    KEY_INDEX = (KEY_INDEX + 1) % len(API_KEYS)
    print(f"Switching to API key #{KEY_INDEX + 1}")



In [68]:
# =====================================
#  Function to split text safely
# =====================================
def split_text(text, max_chars=MAX_CONTEXT_CHARS):
    paragraphs = text.split("\n")
    chunks, current_chunk = [], ""
    for p in paragraphs:
        if len(current_chunk) + len(p) + 1 < max_chars:
            current_chunk += p + "\n"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = p + "\n"
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks




In [69]:
# =====================================
#  Function to generate Q&A pairs
# =====================================
def generate_groq(content, add_info="", temperature=0.3, max_tokens=4096, retries=3):
    global KEY_INDEX
    attempt = 0

    while attempt < retries * len(API_KEYS):
        api_key = get_current_key()
        client = Groq(api_key=api_key)

        prompt = f"""
        You are an assistant generating realistic question–answer pairs to use them to train a chatbot about ENSA Tétouan.

        Context:
        {content}

        Your goal:
        - Generate 5 to 10 diverse and relevant Q&A pairs.
        - Each question should sound natural from a student.
        - Answers must be factual, concise, and in French.
        - Each question should contains context, because Q/R est independante.
        - {add_info}

        Output format:
        Each line must be valid JSON:
        {{"instruction": "student question", "input": "", "output": "assistant answer"}}

        Do not include anything else.
        """

        try:
            completion = client.chat.completions.create(
                model="openai/gpt-oss-120b", 
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_completion_tokens=max_tokens,
                top_p=0.9,
                reasoning_effort="medium",
            )
            return completion.choices[0].message.content.strip()

        except Exception as e:
            print(f"Error with key #{KEY_INDEX + 1}: {e}")
            rotate_key()
            attempt += 1
            time.sleep(3)

    print(f"All keys failed. Waiting {RETRY_DELAY}s before retry...")
    time.sleep(RETRY_DELAY)
    return generate_groq(content, add_info, temperature, max_tokens, retries)

In [70]:
# =====================================
#  MAIN PIPELINE
# =====================================
all_pairs = []
seen_questions = set()

for folder in os.listdir(BASE_PATH):
    folder_path = BASE_PATH / folder
    if not folder_path.is_dir():
        continue

    for file in os.listdir(folder_path):
        path = folder_path / file
        if not path.is_file():
            continue

        print(f"\nProcessing: {path.name}")
        try:
            # Load content
            if path.suffix == ".json":
                with open(path, "r", encoding="utf-8") as f:
                    content = json.dumps(json.load(f), ensure_ascii=False)
            else:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()

            chunks = split_text(content)
            file_pairs = []

            for i, chunk in enumerate(chunks[:MAX_BATCHES_PER_FILE]):
                print(f" → Generating batch {i+1}/{len(chunks)} ...")
                ADD_INFO = f"More context : {path.name}"
                result = generate_groq(chunk, ADD_INFO)
                time.sleep(SLEEP_BETWEEN_CALLS)

                for line in result.splitlines():
                    try:
                        pair = json.loads(line)
                        q = pair.get("instruction", "").strip()
                        if q and q not in seen_questions:
                            all_pairs.append(pair)
                            file_pairs.append(pair)
                            seen_questions.add(q)
                    except json.JSONDecodeError:
                        continue

            print(f" {len(file_pairs)} pairs added from {path.name}")
            # Save progress after each file
            with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                for pair in file_pairs:
                    json.dump(pair, f, ensure_ascii=False)
                    f.write("\n")
            print(f"Progress saved after {path.name}")


        except Exception as e:
            print(f"Error reading {path.name}: {e}")
            continue


Processing: calendrier-printemps-2025
 → Generating batch 1/1 ...
 7 pairs added from calendrier-printemps-2025
Progress saved after calendrier-printemps-2025

Processing: les clubs de lecole.txt
 → Generating batch 1/3 ...
 → Generating batch 2/3 ...
 → Generating batch 3/3 ...
 21 pairs added from les clubs de lecole.txt
Progress saved after les clubs de lecole.txt

Processing: commission Culturelle et Activités Sportive issue du Conseil d’établissement.txt
 → Generating batch 1/1 ...
 7 pairs added from commission Culturelle et Activités Sportive issue du Conseil d’établissement.txt
Progress saved after commission Culturelle et Activités Sportive issue du Conseil d’établissement.txt

Processing: Commission de coordination et de suivi du budget issue du conseil d’établissement.txt
 → Generating batch 1/1 ...
 7 pairs added from Commission de coordination et de suivi du budget issue du conseil d’établissement.txt
Progress saved after Commission de coordination et de suivi du budget i