In [1]:
import sys
import os
from dotenv import load_dotenv

# Step 1: make the project's own modules importable from this notebook.
# The relative entries assume the notebook runs from Finetuning/notebooks/.
for extra_path in ('../scripts', '..'):  # '..' is needed for utils_ft
    sys.path.append(extra_path)

# Step 2: load environment variables (API key) from the project's .env file.
# This happens in the first cell so the key is already in the environment
# before any later cell imports the augmentation script.
from utils_ft import get_root_path, to_relative
root = get_root_path()
load_dotenv(dotenv_path=os.path.join(root, '.env'))

# Step 3: fail fast if the key is missing or empty after loading.
if not os.environ.get("GOOGLE_API_KEY"):
    raise ValueError("GOOGLE_API_KEY not found in .env file.")

print(f"Project Root: {root}", "Environment loaded.", sep="\n")

Project Root: /Users/arjandeweerd/AI_Projects/Ass3_LLM_ASP/SchASPLM-Shamantics
Environment loaded.


In [2]:
# Deferred import of the augmentation entry point and its data directory.
# It is kept out of the first cell on purpose: importing the module is expected
# to set up the Gemini client from GOOGLE_API_KEY, so the .env file must already
# be loaded (assumption carried over from the original note -- confirm in
# nl_text_data_augmentation).
from nl_text_data_augmentation import paraphrase_file, DATA_ROOT

print("Data Directory: {}".format(to_relative(DATA_ROOT)))

Data Directory: Finetuning/data/Benchmark Data


In [3]:
# Paraphrase styles generated for every original problem folder.
# Edit this tuple to add or drop styles; each entry results in one
# paraphrase_file call per problem, in the order listed here.
STYLES = ("academic", "natural", "instructional")

print("--- Starting Data Augmentation using Gemini ---\n")

# Counters
total_processed = 0  # original problem folders passed to paraphrase_file
skipped = 0          # folders recognised as variants and left untouched

# Recursively walk through DATA_ROOT
for root_dir, dirs, files in os.walk(DATA_ROOT):
    # A "problem folder" is identified by the presence of both required files;
    # anything else (intermediate directories, incomplete folders) is ignored.
    if "NL.txt" not in files or "ASP.txt" not in files:
        continue

    folder_name = os.path.basename(root_dir)

    # Folders whose name contains "_variant" are generated variants, not
    # originals; skip them to avoid re-augmenting already augmented data.
    if "_variant" in folder_name:
        skipped += 1
        continue

    # One call per configured style for this original problem.
    for style in STYLES:
        paraphrase_file(root_dir, style=style)

    total_processed += 1

print("\n--- Done! ---")
print(f"Original problems processed: {total_processed}")
print(f"Existing variants skipped: {skipped}")

--- Starting Data Augmentation using Gemini ---

Generating [academic]: hanoi_tower -> hanoi_tower_variant_gemini_academic_1...
 -> Success.
Generating [natural]: hanoi_tower -> hanoi_tower_variant_gemini_natural_1...
 -> Success.
Generating [instructional]: hanoi_tower -> hanoi_tower_variant_gemini_instructional_1...
 -> Success.
Generating [academic]: visit_all -> visit_all_variant_gemini_academic_1...
 -> Success.
Generating [natural]: visit_all -> visit_all_variant_gemini_natural_1...
 -> Success.
Generating [instructional]: visit_all -> visit_all_variant_gemini_instructional_1...
 -> Success.
Generating [academic]: RichocetRobots -> RichocetRobots_variant_gemini_academic_1...
 -> Success.
Generating [natural]: RichocetRobots -> RichocetRobots_variant_gemini_natural_1...
 -> Success.
Generating [instructional]: RichocetRobots -> RichocetRobots_variant_gemini_instructional_1...
 -> Success.
Generating [academic]: Nomystery -> Nomystery_variant_gemini_academic_1...
 -> Success.
Gener