In [None]:
import pandas as pd
import numpy as np



In [None]:
# Read the raw answer sheet of every model into its own DataFrame.
gpt4omini = pd.read_excel("Results/GPT4omini_RAW.xlsx")
gpt4 = pd.read_excel("Results/GPT4_RAW.xlsx")
gpt4o = pd.read_excel("Results/GPT4o_RAW.xlsx")
llama = pd.read_excel("Results/LLaMA3_RAW.xlsx")
mixtral = pd.read_excel("Results/Mixtral_RAW.xlsx")
gemini = pd.read_excel("Results/Gemini_Pro_RAW.xlsx")
llama_nemotron = pd.read_excel("Results/Llama_Nemotron_RAW.xlsx")
llama_perplexity = pd.read_excel("Results/Llama_huge_Perplexity.xlsx")

# Every frame loaded above, in load order, so both passes below walk the same set.
model_frames = (
    gpt4omini, gpt4, gpt4o, llama,
    mixtral, gemini, llama_nemotron, llama_perplexity,
)

# Sheets that do not yet carry a 'halluzination' column get one filled with None.
for model_df in model_frames:
    if 'halluzination' in model_df.columns:
        continue
    model_df['halluzination'] = None

# Report, per model, the row labels whose 'De Facto Answer' cell is missing.
for model_df in model_frames:
    missing_mask = model_df['De Facto Answer'].isna()
    missing_rows = model_df.index[missing_mask].tolist()
    if not missing_rows:
        continue
    # The model name is read from the first row (label 0) of the 'Model' column.
    model_name = model_df.loc[0, 'Model']
    print(f"{model_name} has NaN values in the 'De Facto Answer' column at rows: {missing_rows}")


In [None]:
# Stack the per-model frames on top of each other; ignore_index=True discards
# the per-sheet row labels and renumbers the rows of the result from 0.
frames_without_rag = [
    gpt4omini, gpt4, gpt4o, llama,
    mixtral, gemini, llama_nemotron, llama_perplexity,
]
df_all_without_rag = pd.concat(frames_without_rag, ignore_index=True)


In [None]:
import json


def _parse_concatenated_json(text):
    """Decode every top-level JSON value contained in ``text``.

    The values may follow each other directly (``{...}{...}``) or be separated
    by whitespace, newlines or commas. Each value is decoded with
    ``json.JSONDecoder.raw_decode`` instead of patching the text with string
    replacements, so a ``}{`` sequence inside a string value is left intact.

    Parameters
    ----------
    text : str
        Raw text holding one or more JSON values.

    Returns
    -------
    list
        The decoded values in file order. A top-level JSON array contributes
        its elements rather than being nested as a single item.

    Raises
    ------
    json.JSONDecodeError
        If a value in ``text`` is not valid JSON.
    """
    # strict=False lets raw control characters (e.g. newlines) appear inside strings.
    decoder = json.JSONDecoder(strict=False)
    records = []
    pos, end = 0, len(text)
    while True:
        # Skip the whitespace and optional commas that separate two values.
        while pos < end and text[pos] in ' \t\r\n,':
            pos += 1
        if pos >= end:
            break
        value, pos = decoder.raw_decode(text, pos)
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)
    return records


# Path of the file holding the GPT-4o + RAG outputs
json_path = 'Results_gpt-4o-2024-11-20_outputs.json'

# Read the whole file with an explicit encoding so the result does not depend
# on the platform default; 'utf-8-sig' also drops a leading BOM if present.
with open(json_path, 'r', encoding='utf-8-sig') as file:
    content = file.read()

# Decode all JSON records stored in the file
data_list = _parse_concatenated_json(content)
if not data_list:
    raise ValueError(f"No JSON records found in {json_path!r}")

# Flatten the records into a DataFrame (nested keys become dotted column names)
df_json = pd.json_normalize(data_list)

# Column layout of the output sheet
columns = ['Question Number', 'Type', 'Question', 'Sample Answer According to Guideline',
           'De Facto Answer', 'batch', 'Rater 1', 'Rater 2', 'Final Rating', 'Source', 'Model']

# One row per element of 'Human Message' / 'AI Response'; list-valued cells are
# expanded by explode() and the rows are renumbered from 0.
# NOTE(review): questions and answers are paired purely by position after the
# renumbering - this assumes both columns expand to the same number of rows; confirm.
questions = df_json['Human Message'].explode().reset_index(drop=True)
answers = df_json['AI Response'].explode().reset_index(drop=True)

# Create the frame with its final row index up front so every assignment below
# aligns on that index instead of relying on an index-less frame adopting one.
df_rag = pd.DataFrame(index=questions.index, columns=columns)

# Fill in the JSON data and the fixed placeholder values
df_rag['Question Number'] = np.nan
df_rag['Type'] = np.nan
df_rag['Question'] = questions
df_rag['Sample Answer According to Guideline'] = np.nan
df_rag['De Facto Answer'] = answers
df_rag['batch'] = df_json['batch']
df_rag['Rater 1'] = ""
df_rag['Rater 2'] = ""
df_rag['Final Rating'] = ""
df_rag['Source'] = ""
df_rag['Model'] = 'GPT-4o +RAG'

print(df_rag.head())
df_rag.to_excel("Results/GPT4oandRAG.xlsx", index=False)


In [None]:
# Take the question text and its metadata from the Llama/Perplexity sheet.
# .values strips the index, so the copy is by row position, not by label.
# NOTE(review): assumes both frames list the questions in the same order - confirm.
metadata_columns = ['Question', 'Question Number', 'Type', 'Sample Answer According to Guideline']
metadata_values = llama_perplexity[metadata_columns].values
df_rag[metadata_columns] = metadata_values

# For each copied metadata column, print the rows that are still missing a value.
for checked_column in ('Question Number', 'Type', 'Sample Answer According to Guideline'):
    print(df_rag[df_rag[checked_column].isna()])

print("Metadata successfully updated!")
df_rag.to_excel("Results/GPT4oandRAG.xlsx", index=False)



Final Dataset

In [None]:
# Append the RAG answers below the non-RAG answers, renumber the rows from 0,
# and export the combined table.
all_answer_frames = [df_all_without_rag, df_rag]
df = pd.concat(all_answer_frames, ignore_index=True)
df.to_excel("Results/All_Answers_Full_DF.xlsx", index=False)
