The purpose of this notebook is to augment the dataset in order to enhance the model's context.

In [18]:
import os
import json
import time
from pathlib import Path
from dotenv import load_dotenv
from groq import Groq

def GenerationGroq(prompt, groq_key, temperature=0.7, max_tokens=8192):
    """Stream a chat completion from Groq and return the generated text.

    The prompt is sent as a single user message to the
    "openai/gpt-oss-safeguard-20b" model with streaming enabled. Every
    streamed piece is echoed to stdout as it arrives, and the pieces are
    joined into the returned string.

    Args:
        prompt: Text used as the content of the single user message.
        groq_key: API key handed to the ``Groq`` client.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Forwarded to the API as ``max_completion_tokens``.

    Returns:
        The concatenation of all streamed content pieces, with leading and
        trailing whitespace stripped.

    Raises:
        Whatever the Groq client raises on request or streaming failure;
        nothing is caught here so callers can implement their own retries.
    """
    client = Groq(api_key=groq_key)

    stream = client.chat.completions.create(
        model="openai/gpt-oss-safeguard-20b",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_completion_tokens=max_tokens,
        top_p=1,
        reasoning_effort="medium",
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # Guard against chunks that carry no choices (OpenAI-compatible
        # streams can emit metadata-only chunks); indexing [0] on an empty
        # list would otherwise abort the whole generation.
        if not chunk.choices:
            continue
        # delta.content may be None on some chunks; treat that as no text.
        piece = chunk.choices[0].delta.content or ""
        print(piece, end="")
        pieces.append(piece)

    # Join once instead of growing a string with += on every chunk.
    return "".join(pieces).strip()

In [19]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Groq API key, stored under the "groq_api" variable; abort early without it.
GROQ_KEY = os.environ.get("groq_api")
if not GROQ_KEY:
    raise ValueError("Missing 'groq_api' in your .env file")

# Root folder whose sub-folders hold the files to augment.
base_folder = Path("data_final")

# Retry policy for generation calls: base delay (seconds, multiplied by the
# attempt number) and the maximum number of attempts per file.
WAIT_TIME, MAX_RETRIES = 2, 3

print(f"Augmenting files in folder: {base_folder}\n")

# Suffix that marks files produced by this cell (rephrased outputs).
_REPH_SUFFIX = "_reph.txt"


def _read_source_text(file_path):
    """Return the text to rephrase; JSON files are re-serialized to one string."""
    with open(file_path, "r", encoding="utf-8") as f:
        if file_path.suffix == ".json":
            # Parsing first validates the JSON; ensure_ascii=False keeps
            # non-ASCII characters readable in the prompt.
            return json.dumps(json.load(f), ensure_ascii=False)
        return f.read()


def _build_prompt(content):
    """Return the rephrasing instructions followed by the text to rephrase."""
    return (
        "Rephrase the following text without changing its meaning. "
        "Use different wording, phrasing, and sentence structures. "
        "Do NOT add any markdown symbols, no bold, no italics, no headings, no lists, "
        "no code blocks, no backticks, and no decorative characters like **** or ----. "
        # Terminating period added: without it this instruction ran straight
        # into the next one ("...like a description Do NOT add...").
        "For schedules (json files), transform it as a paragraph, like a description. "
        "Do NOT add explanations, comments, or additional content. "
        "Return ONLY the rephrased text, nothing else.\n\n"
        f"{content}"
    )


def _generate_with_retries(prompt):
    """Call GenerationGroq up to MAX_RETRIES times; return None if all fail."""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return GenerationGroq(prompt, GROQ_KEY)
        except Exception as err:  # client error types are not visible here
            print(f"Attempt {attempt} failed: {err}")
            # Linear back-off, but do not wait after the final attempt.
            if attempt < MAX_RETRIES:
                time.sleep(WAIT_TIME * attempt)
    return None


for folder_path in sorted(base_folder.iterdir()):
    if not folder_path.is_dir():
        continue

    # List the folder once, in a deterministic order.
    entries = sorted(folder_path.iterdir())

    # Remove outputs left over from previous runs.
    for stale in entries:
        if not stale.name.endswith(_REPH_SUFFIX):
            continue
        try:
            stale.unlink()
            print(f"Deleted old file: {stale.name}")
        except OSError as e:
            print(f"Could not delete {stale.name}: {e}")

    for file_path in entries:
        # Never feed a previous output back in: if a stale file could not be
        # deleted above, rephrasing it would create "*_reph_reph.txt".
        if file_path.name.endswith(_REPH_SUFFIX) or not file_path.is_file():
            continue

        print(f"\nProcessing: {file_path.name}\n")

        try:
            content = _read_source_text(file_path)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Could not read {file_path.name}, error: {e}")
            continue

        # Nothing to rephrase; avoid spending an API call on an empty prompt.
        if not content.strip():
            print(f"Skipping empty file: {file_path.name}")
            continue

        augmented = _generate_with_retries(_build_prompt(content))
        if augmented is None:
            print(f"Failed to augment {file_path.name}")
            continue

        # Save result as .txt next to the source file.
        # NOTE(review): inputs sharing a stem (e.g. "a.json" and "a.txt") map
        # to the same output name, so the later one overwrites the earlier.
        new_path = folder_path / (file_path.stem + _REPH_SUFFIX)

        try:
            new_path.write_text(augmented, encoding="utf-8")
            print(f"Saved augmented file: {new_path.name}")
        except OSError as e:
            print(f"Error saving augmented file for {file_path.name}, error: {e}")
            continue


Augmenting files in folder: data_final

Deleted old file: calendrier-printemps-2024-2025_reph.txt

Processing: calendrier-printemps-2024-2025.txt

The spring 2024/2025 semester schedule features several key dates: February 15 hosts the first continuous assessment, CC1; March 17 marks the end of teaching; April 24 signals the start of final exams; May 1 is Labor Day, a public holiday; June 2 sees the deliberations for CI3; June 3 follows with CI2 deliberations; June 4 concludes the CI1 deliberations; June 7 hosts the 2AP1 deliberations; June 8 holds the 2AP2 deliberations; June 27 observes the first day of Moharram, the Islamic New Year, a public holiday; June 28 begins the PFE thesis defenses; July 30 celebrates Throne Day, a national holiday; the date for Aid El Fitr remains to be confirmed; the start of the spring break is yet to be determined; make‑up exam dates are pending; the date for Aid Al Adha is also to be confirmed; and the end of the first semester break will be scheduled l