In [1]:
import json
import pandas as pd
import os

In [7]:
# === Paths ===
json_input_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\generation_evaluation_dataset.json"
ref_xlsx_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\ref_data.xlsx"
gen_xlsx_path = r"C:\Users\1176153\Downloads\github\Thesis\model\src\rag_no_prompting_medium_Mistral_results.xlsx"
final_output_json_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\evaluation_generation_dataset_mistral_hard.json"

### === STEP 1: Extract from JSON to XLSX ===

with open(json_input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Grab first 29 items only
records = data["evaluation_generation_dataset"][:29]

ref_data = {
    "reference_answer": [entry["reference_answer"][0] if entry["reference_answer"] else "" for entry in records],
    "relevant_docs": [", ".join(entry["relevant_docs"]) for entry in records]
}

ref_df = pd.DataFrame(ref_data)
ref_df.to_excel(ref_xlsx_path, index=False)
print(f"✅ Extracted reference data saved to {ref_xlsx_path}")

✅ Extracted reference data saved to C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\ref_data.xlsx


In [8]:
### === STEP 2: Recombine all into final JSON ===

# Load 50 question-generated_answer pairs
gen_df = pd.read_excel(gen_xlsx_path, engine="openpyxl", skiprows=1, usecols=[0, 1], nrows=50)
gen_df.columns = ["question", "generated_answer"]
gen_df.fillna("", inplace=True)

# Load reference answers and docs (29 rows)
ref_df = pd.read_excel(ref_xlsx_path)
ref_df.fillna("", inplace=True)

final_entries = []

for idx in range(len(gen_df)):
    question = gen_df.loc[idx, "question"]
    generated_answer = gen_df.loc[idx, "generated_answer"]

    if idx < len(ref_df):
        reference_answer = [ref_df.loc[idx, "reference_answer"]]
        relevant_docs = [doc.strip() for doc in str(ref_df.loc[idx, "relevant_docs"]).split(",")]
    else:
        reference_answer = [""]
        relevant_docs = [""]

    entry = {
        "id": idx + 1,
        "question": question,
        "relevant_docs": relevant_docs,
        "reference_answer": reference_answer,
        "generated_answer": [generated_answer],
        "human_score": 0
    }
    final_entries.append(entry)

# Save to final JSON
with open(final_output_json_path, "w", encoding="utf-8") as f:
    json.dump({"evaluation_generation_dataset_mistral_hard": final_entries}, f, indent=2, ensure_ascii=False)

print(f"✅ Final JSON file with 50 entries saved to: {final_output_json_path}")

✅ Final JSON file with 50 entries saved to: C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\evaluation_generation_dataset_mistral_hard.json


# Generating the evaluation datasets.json 

In [2]:
import pandas as pd
import json
import re

# Load Excel file
excel_file = r"C:\Users\1176153\Downloads\github\Thesis\model\src\rag_no_prompting_medium_Mistral_results.xlsx"
df = pd.read_excel(excel_file, engine="openpyxl")

# Initialize result structure
json_data = {
    "evaluation_generation_dataset_mistral_hard": []
}

# Helper function to split "Document 1: ... Document 2: ..." into two parts
def split_documents(text):
    doc1_match = re.search(r"Document 1:\s*(.*?)(?:Document 2:|$)", text, re.DOTALL)
    doc2_match = re.search(r"Document 2:\s*(.*)", text, re.DOTALL)
    doc1 = doc1_match.group(1).strip() if doc1_match else ""
    doc2 = doc2_match.group(1).strip() if doc2_match else ""
    return [doc1, doc2]

# Iterate through each row (starting from index 0 which corresponds to Excel row 2)
for idx, row in df.iterrows():
    question = str(row['question']).strip()
    generated_answer = [str(row['generated_answer']).strip()]
    reference_docs = split_documents(str(row['retrieved_documents']))

    entry = {
        "id": idx + 1,
        "question": question,
        "relevant_docs": reference_docs,
        "reference_answer": [""],  # Assuming blank for now
        "generated_answer": generated_answer,
        "human_score": 0
    }
    json_data["evaluation_generation_dataset_mistral_hard"].append(entry)

# Write to JSON file
output_json_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\evaluation_generation_dataset_mistral_no_prompting_hard.json"
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("✅ JSON file generated successfully.")

✅ JSON file generated successfully.


### Combining also the reference answer


In [None]:
import pandas as pd
import json
import re

# Paths to input and output files
main_excel_file = r"C:\Users\1176153\Downloads\github\Thesis\model\src\mistral_no_rag_results_easy.xlsx"
ref_excel_file = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\ref_data.xlsx"
output_json_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\evaluation_generation_dataset_mistral_no_prompting_hard.json"

# Load Excel files
df_main = pd.read_excel(main_excel_file, engine="openpyxl")
df_ref = pd.read_excel(ref_excel_file, engine="openpyxl")

# Initialize result structure
json_data = {
    "evaluation_generation_dataset_mistral_hard": []
}

# Helper function to split "Document 1: ... Document 2: ..." into two parts
def split_documents(text):
    doc1_match = re.search(r"Document 1:\s*(.*?)(?:Document 2:|$)", text, re.DOTALL)
    doc2_match = re.search(r"Document 2:\s*(.*)", text, re.DOTALL)
    doc1 = doc1_match.group(1).strip() if doc1_match else ""
    doc2 = doc2_match.group(1).strip() if doc2_match else ""
    return [doc1, doc2]

# Iterate through each row (assuming both files have same number/order of rows)
for idx, row in df_main.iterrows():
    question = str(row['question']).strip()
    generated_answer = [str(row['generated_answer']).strip()]
    reference_docs = split_documents(str(row['retrieved_documents']))

    # Reference answer from ref_data.xlsx (assumed to be in column 'reference_answer')
    reference_answer = [str(df_ref.iloc[idx]['reference_answer']).strip()] if 'reference_answer' in df_ref.columns else [""]

    entry = {
        "id": idx + 1,
        "question": question,
        "relevant_docs": reference_docs,
        "reference_answer": reference_answer,
        "generated_answer": generated_answer,
        "human_score": 0
    }
    json_data["evaluation_generation_dataset_mistral_hard"].append(entry)

# Write to JSON
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("✅ JSON file with reference answers generated successfully.")


KeyError: 'retrieved_documents'

### No RAG

In [13]:
import pandas as pd
import json

# Paths to input and output files
main_excel_file = r"C:\Users\1176153\Downloads\github\Thesis\model\src\mistral_no_rag_results_easy.xlsx"
ref_excel_file = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\ref_data.xlsx"
output_json_path = r"C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\evaluation_generation_dataset_mistral_no_rag_results_easy.xlsx.json"

# Load Excel files
df_main = pd.read_excel(main_excel_file, engine="openpyxl")
df_ref = pd.read_excel(ref_excel_file, engine="openpyxl")

# Initialize result structure
json_data = {
    "evaluation_generation_dataset_mistral_no_rag_results_easy": []
}

# Iterate through each row (assuming both files have same number/order of rows)
for idx, row in df_main.iterrows():
    question = str(row['question']).strip()
    generated_answer = [str(row['generated_answer']).strip()]
    reference_answer = [str(df_ref.iloc[idx]['reference_answer']).strip()] if 'reference_answer' in df_ref.columns else [""]

    entry = {
        "id": idx + 1,
        "question": question,
        "reference_answer": reference_answer,
        "generated_answer": generated_answer,
        "human_score": 0
    }
    json_data["evaluation_generation_dataset_mistral_no_rag_results_easy"].append(entry)

# Write to JSON
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("✅ JSON file without retrieval data generated successfully.")


IndexError: single positional indexer is out-of-bounds

# Generate word doc with truth answers

In [9]:
import pandas as pd
from docx import Document

# Paths
excel_path = r"C:\Users\1176153\Downloads\github\Thesis\model\src\rag_zero_shot_prompting_medium_Mistral_results.xlsx"
word_path = r"C:\Users\1176153\Downloads\github\Thesis\model\src\combined_output.docx"

# Load Excel
df = pd.read_excel(excel_path)

# Create Word document
doc = Document()

# Iterate rows
for idx, row in df.iterrows():
    question = str(row['A']) if 'A' in df.columns else str(row.iloc[0])
    retrieved_docs = str(row['C']) if 'C' in df.columns else str(row.iloc[2])

    # Add question as heading
    doc.add_heading(f"Question {idx + 1}:", level=2)
    doc.add_paragraph(question)

    # Add retrieved documents as normal text
    doc.add_heading("Retrieved Documents:", level=3)
    doc.add_paragraph(retrieved_docs)

    # Add a line break for readability
    doc.add_paragraph("\n---\n")

# Save Word file
doc.save(word_path)

print(f"Combined Word document saved to: {word_path}")


Combined Word document saved to: C:\Users\1176153\Downloads\github\Thesis\model\src\combined_output.docx
