In [1]:
import sys
import os
from dotenv import load_dotenv

# 1. Make the project's modules importable.
# Assumes notebook is in Finetuning/notebooks/ and is executed from that folder.
# '../scripts' holds the augmentation module; '..' is needed for utils_ft.
# The paths are resolved to absolute form once, so the entries keep pointing at
# the same folders even if the working directory changes later, and the
# membership check keeps sys.path from growing each time this cell is re-run.
for rel_dir in ('../scripts', '..'):
    abs_dir = os.path.abspath(rel_dir)
    if abs_dir not in sys.path:
        sys.path.append(abs_dir)

# 2. Load Environment Variables (API Key)
# We load this BEFORE importing the script to ensure the client initializes correctly.
# This import must stay below the sys.path setup above, which is what makes
# utils_ft importable.
from utils_ft import get_root_path, to_relative
root = get_root_path()
load_dotenv(os.path.join(root, '.env'))

# 3. Check Key
# Fail fast here rather than later inside the augmentation script.
if not os.getenv("GOOGLE_API_KEY"):
    raise ValueError("GOOGLE_API_KEY not found in .env file.")

print(f"Project Root: {root}")
print("Environment loaded.")

Project Root: c:\Users\arjan\SchASPLM-Shamantics
Environment loaded.


In [2]:
# Pull in the augmentation entry point and the data root from the project script.
# The import is deliberately placed after the environment cell: importing the
# module initializes the Gemini client, which needs the API key loaded above.
from nl_text_data_augmentation import DATA_ROOT, paraphrase_file

# Show the data directory relative to the project instead of as an absolute path.
data_dir_display = to_relative(DATA_ROOT)
print(f"Data Directory: {data_dir_display}")

Data Directory: Finetuning\data\Benchmark Data


In [3]:
# Walk DATA_ROOT as <category>/<difficulty>/<problem_folder> and call
# paraphrase_file on every original problem folder. Folders whose name contains
# "_variant" are left alone.
# NOTE(review): presumably those folders are the output of earlier runs of
# paraphrase_file - confirm in nl_text_data_augmentation.
print("--- Starting Data Augmentation using Gemini ---\n")

# Counters
total_processed = 0   # original problem folders passed to paraphrase_file
skipped = 0           # folders skipped because their name contains "_variant"
ignored_entries = 0   # non-directory entries found at the problem level

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order the same on every run and every machine.
for category in sorted(os.listdir(DATA_ROOT)):
    cat_path = os.path.join(DATA_ROOT, category)
    if not os.path.isdir(cat_path):
        continue

    for difficulty in sorted(os.listdir(cat_path)):
        diff_path = os.path.join(cat_path, difficulty)
        if not os.path.isdir(diff_path):
            continue

        for problem_folder in sorted(os.listdir(diff_path)):
            full_prob_path = os.path.join(diff_path, problem_folder)

            if not os.path.isdir(full_prob_path):
                # A stray file is neither a problem nor a variant, so it must
                # not inflate the "variants skipped" figure.
                ignored_entries += 1
                continue

            if "_variant" in problem_folder:
                skipped += 1
                continue

            # Pass the style here
            paraphrase_file(full_prob_path, style="academic")
            total_processed += 1

print("\n--- Done! ---")
print(f"Original problems processed: {total_processed}")
print(f"Existing variants skipped: {skipped}")
# Only mention stray files when there are any, so the usual summary is unchanged.
if ignored_entries:
    print(f"Non-directory entries ignored: {ignored_entries}")

--- Starting Data Augmentation using Gemini ---


--- Done! ---
Original problems processed: 27
Existing variants skipped: 81
