In [3]:
import json
import os

def read_jsonl_file(filepath):
    """
    Reads the JSONL file and returns a list of parsed records.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of being handed to the JSON parser.

    Args:
        filepath (str): Path to the JSONL file.

    Returns:
        data (list): The parsed JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # json.loads("") raises, so an empty line must not reach the parser.
            if not stripped:
                continue
            data.append(json.loads(stripped))
    return data

In [4]:
def extract_numerical_data(data):
    """
    Collects the numerical metric fields of every entry into per-field lists.

    For each entry the value of each metric field is looked up with
    ``entry.get(field, 0.0)`` and converted with ``float``; a field that is
    absent from an entry therefore contributes 0.0.

    Args:
        data (list): A list of dictionaries containing data from the JSONL file.

    Returns:
        numerical_data (dict): Maps each metric field name to the list of its
            float values, one value per entry, in entry order.
    """
    metric_fields = (
        "precision",
        "recall",
        "f1",
        "onto_conf",
        "rel_halluc",
        "sub_halluc",
        "obj_halluc",
    )
    numerical_data = {field: [] for field in metric_fields}

    # Walk entry by entry (not field by field) so values are converted in the
    # same order as they appear in the input.
    for entry in data:
        for field in metric_fields:
            numerical_data[field].append(float(entry.get(field, 0.0)))

    return numerical_data

In [5]:
def calculate_averages(numerical_data):
    """
    Computes the arithmetic mean of every field's value list.

    The result always contains the seven standard "avg_"-prefixed metric
    keys, each defaulting to 0.0. A field whose list is empty leaves its
    default untouched; a non-empty list overwrites (or adds) "avg_<field>".

    Args:
        numerical_data (dict): A dictionary containing lists of numerical values for each field.

    Returns:
        averages (dict): A dictionary containing the average values for each field, with "avg_" prefix.
    """
    averages = dict.fromkeys(
        (
            "avg_precision",
            "avg_recall",
            "avg_f1",
            "avg_onto_conf",
            "avg_rel_halluc",
            "avg_sub_halluc",
            "avg_obj_halluc",
        ),
        0.0,
    )

    for field, samples in numerical_data.items():
        # An empty list has no mean; skip it to avoid dividing by zero.
        if not samples:
            continue
        averages[f"avg_{field}"] = sum(samples) / len(samples)

    return averages


In [6]:
def save_to_jsonl(data, output_filepath):
    """
    Saves a list of dictionaries to a JSONL file, one JSON document per line.

    The parent directory of ``output_filepath`` is created if it does not
    exist yet, so a missing output folder does not discard the results.
    An existing file at ``output_filepath`` is overwritten.

    Args:
        data (list): A list of dictionaries containing average values for each file.
        output_filepath (str): The path to the output JSONL file.

    Raises:
        TypeError: If a record contains a value that is not JSON serializable.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(output_filepath)
    # A bare filename has an empty dirname; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_filepath, 'w', encoding='utf-8') as file:
        for record in data:
            json.dump(record, file)
            file.write("\n")

In [7]:
def process_multiple_files(files, output_filepath):
    """
    Computes per-file average statistics and writes them to one JSONL file.

    Every input file is read with ``read_jsonl_file``, reduced to per-field
    value lists with ``extract_numerical_data`` and averaged with
    ``calculate_averages``. The resulting record is tagged with the file's
    ontology name under "onto" and the label "all_test_cases" under "type".
    Only this "all_test_cases" record is produced per file; a filtered
    "selected_test_cases" variant is not implemented here.

    Args:
        files (list): A list of tuples where each tuple contains:
                      - the filepath to the file
                      - the ontology name (e.g., "1_movie", "2_music")
        output_filepath (str): The path to the output JSONL file.
    """
    summaries = []

    for source_path, onto_name in files:
        entries = read_jsonl_file(source_path)
        summary = calculate_averages(extract_numerical_data(entries))

        # Tag the averages so records from different files can be told apart.
        summary["onto"] = onto_name
        summary["type"] = "all_test_cases"
        summaries.append(summary)

    # Written once, after every file has been processed successfully.
    save_to_jsonl(summaries, output_filepath)

In [8]:
# Example usage:
# All per-ontology statistics files live in one directory and share one
# naming pattern, so the (filepath, ontology) pairs are derived from the
# ontology names instead of repeating the full path for each entry.
STATS_DIR = 'Wikidata/Evaluation_Statistics'
# NOTE(review): there is no "4_..." ontology in this list — confirm that the
# gap between 3_sport and 5_military is intentional.
ONTOLOGIES = [
    '1_movie',
    '2_music',
    '3_sport',
    '5_military',
    '6_computer',
    '7_space',
    '8_politics',
    '9_nature',
]
files = [
    (f'{STATS_DIR}/ont_{ontology}_llm_stats.jsonl', ontology)
    for ontology in ONTOLOGIES
]
OUTPUT_FILEPATH = 'Wikidata/Avg_Eval_Statistics/combined_averages.jsonl'

process_multiple_files(files, OUTPUT_FILEPATH)