In [1]:
import pandas as pd
import os

In [2]:
# Read the power-plant table, then flatten it into one plain dict per row
df = pd.read_csv('../data/data.csv')
documents = df.to_dict('records')

In [3]:
from google import genai
from dotenv import load_dotenv

# load_dotenv() comes from python-dotenv; it is called before the key lookup
# so that a key kept in a local .env file is visible in the environment.
load_dotenv()

# The key is read from the environment rather than written into the notebook
api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

In [4]:
# Inspect the first record to see which keys are available for the prompt
documents[0]

{'id': 0,
 'country_long': 'United Kingdom',
 'name': '14 Tullywiggan Road',
 'primary_fuel': 'Biomass',
 'capacity_mw': 1.0,
 'commissioning_year': nan,
 'passage': 'The 14 Tullywiggan Road power plant in United Kingdom. It is a Biomass facility with a capacity of 1 MW. Located at latitude 55 and longitude -7.'}

In [5]:
# Prompt used to generate ground-truth retrieval questions for one record.
# The {placeholders} are named after the record's keys and are filled with
# str.format; the doubled braces render as literal braces in the JSON example.
# The JSON example lists exactly three slots so that it agrees with the
# "Formulate 3" instruction (it previously showed five, which contradicted it).
prompt_template = """
You emulate a user using our Global Power Plants RAG.
Formulate 3 **natural, complete questions** this user might ask based on the given GPPD record.
Each question should be directly inferable from the data, but **do not include any answers**.
Focus on making the questions clear, concise, and specific to the record.

The record:

country_long: {country_long},
name: {name},
primary_fuel: {primary_fuel},
capacity_mw: {capacity_mw},
commissioning_year: {commissioning_year},
passage: {passage}

Provide the output as valid JSON without using code blocks:
{{ "questions": ["question1", "question2", "question3"] }}
""".strip()


In [6]:
# Render the template for the first record as a smoke test; keys the template
# does not reference (such as `id`) are simply ignored by str.format.
prompt = prompt_template.format(**documents[0])

In [7]:
def llm(prompt, model="gemini-2.5-flash"):
    """Send a prompt to the Gemini client and return the reply text.

    Args:
        prompt: Text passed as ``contents`` to the model.
        model: Name of the model to query. Defaults to the model this
            notebook has always used, so ``llm(prompt)`` behaves as before.

    Returns:
        The ``text`` attribute of the response returned by
        ``client.models.generate_content``.
    """
    response = client.models.generate_content(
        model=model,
        contents=prompt,
    )
    return response.text

In [8]:
# Single trial call: the raw response text for the first record's prompt
questions = llm(prompt)

In [9]:
import json

In [10]:
# Check that the trial response parses as JSON before running the batch
json.loads(questions)

{'questions': ['What is the primary fuel source used by the 14 Tullywiggan Road power plant in the United Kingdom?',
  'What is the electrical generation capacity in megawatts of the 14 Tullywiggan Road facility?',
  'Do you have information regarding the commissioning year for the 14 Tullywiggan Road Biomass plant?']}

In [11]:
# Fixed-seed sample of 100 rows, so the same records are drawn on every run
sampled_rows = df.sample(n=100, random_state=42)
sampled_docs = sampled_rows.to_dict(orient="records")

In [12]:
def generate_questions(doc):
    """Ask the LLM for evaluation questions about one power-plant record.

    Args:
        doc: Record dict whose keys fill the placeholders of
            ``prompt_template``; keys the template does not use are ignored.

    Returns:
        The raw response text from the model. The prompt requests JSON, but
        the text is returned unparsed.
    """
    prompt = prompt_template.format(**doc)
    # Reuse the notebook's `llm` helper instead of repeating the client call,
    # so the model name and request shape are defined in one place.
    return llm(prompt)

In [13]:
from tqdm.auto import tqdm

In [14]:
# Maps document id -> raw LLM response text.
# NOTE: re-running this cell resets the dict and discards every response
# collected so far.
results = {}

In [15]:
# Only records whose id is not yet in `results` trigger a request, so the cell
# can be re-run after an interruption without repeating finished calls.
for doc in tqdm(sampled_docs):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions

  0%|          | 0/100 [00:00<?, ?it/s]

In [16]:
# Show the raw response strings, keyed by document id, before parsing them
results

{11495: '{ "questions": [\n    "What is the primary fuel source for the Townsend Hydro power plant?",\n    "What is the generation capacity of the Townsend Hydro power plant in megawatts?",\n    "In what year was the Townsend Hydro power plant commissioned?"\n  ]\n}',
 6850: '{"questions": [\n  "What is the primary fuel source for the IKEA Joliet Rooftop PV System power plant?",\n  "What is the electrical generation capacity of the IKEA Joliet Rooftop PV System in megawatts?",\n  "In what year was the IKEA Joliet Rooftop PV System commissioned?"\n]}',
 11255: '{\n  "questions": [\n    "Where is the TPE Whitney Solar LLC power plant located and what is its primary fuel type?",\n    "What is the capacity in megawatts of the TPE Whitney Solar LLC power plant?",\n    "When was the TPE Whitney Solar LLC plant commissioned, and what are its geographical coordinates?"\n  ]\n}',
 7187: '{\n  "questions": [\n    "In which country is the Judith Gap Wind Energy Center power plant located?",\n    

In [24]:
# Flatten the raw responses into (doc_id, question) pairs.
# The list is rebuilt from scratch here, so re-running the cell never
# duplicates rows.
final_results = []

for doc_id, questions_json in results.items():
    try:
        if not isinstance(questions_json, str):
            # A missing or non-text response cannot be parsed; report it below
            raise TypeError(
                f"expected a JSON string, got {type(questions_json).__name__}"
            )

        payload = questions_json.strip()
        if payload.startswith("```"):
            # The prompt asks for bare JSON, but tolerate a Markdown code
            # fence: drop the opening fence line and the closing fence.
            payload = payload.split("\n", 1)[1] if "\n" in payload else ""
            payload = payload.rsplit("```", 1)[0]

        # Convert JSON string to Python dict
        questions_data = json.loads(payload)

        # Extract the list of questions
        for q in questions_data["questions"]:
            final_results.append((doc_id, q))
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # Only the expected parse failures are reported and skipped; any
        # other exception points to a real bug and is allowed to surface.
        print(f"Error parsing doc_id {doc_id}: {e}")


In [25]:
# Spot-check: first (doc_id, question) pair
final_results[0]

(11495, 'What is the primary fuel source for the Townsend Hydro power plant?')

In [26]:
# Spot-check: second (doc_id, question) pair
final_results[1]

(11495,
 'What is the generation capacity of the Townsend Hydro power plant in megawatts?')

In [27]:
# Spot-check: an entry further down the list
final_results[5]

(6850, 'In what year was the IKEA Joliet Rooftop PV System commissioned?')

In [33]:
# Tabulate the (doc_id, question) pairs: one row per generated question
df_results = pd.DataFrame(data=final_results, columns=["id", "question"])

In [34]:
# Display the question table
df_results

Unnamed: 0,id,question
0,11495,What is the primary fuel source for the Townse...
1,11495,What is the generation capacity of the Townsen...
2,11495,In what year was the Townsend Hydro power plan...
3,6850,What is the primary fuel source for the IKEA J...
4,6850,What is the electrical generation capacity of ...
...,...,...
307,8398,When was the MontSun Community Solar facility ...
308,8398,Can you provide the specific latitude and long...
309,2157,What is the primary fuel source for the Safran...
310,2157,What is the generating capacity of the Safran ...


In [35]:
# Write the ground-truth set to disk; index=False keeps the row index out of the file
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [37]:
# Shell check of the first lines of the file that was just written
!head ../data/ground-truth-retrieval.csv

id,question
11495,What is the primary fuel source for the Townsend Hydro power plant?
11495,What is the generation capacity of the Townsend Hydro power plant in megawatts?
11495,In what year was the Townsend Hydro power plant commissioned?
6850,What is the primary fuel source for the IKEA Joliet Rooftop PV System power plant?
6850,What is the electrical generation capacity of the IKEA Joliet Rooftop PV System in megawatts?
6850,In what year was the IKEA Joliet Rooftop PV System commissioned?
11255,Where is the TPE Whitney Solar LLC power plant located and what is its primary fuel type?
11255,What is the capacity in megawatts of the TPE Whitney Solar LLC power plant?
11255,"When was the TPE Whitney Solar LLC plant commissioned, and what are its geographical coordinates?"
