In [76]:
import json

def read_jsonl_file(filepath):
    """
    Reads a JSONL file and returns its records as a list.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to the JSON parser,
    which would reject them with ``json.JSONDecodeError``.

    Args:
        filepath (str): Path to the JSONL file.

    Returns:
        data (list): One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # An empty string is not valid JSON, so blank lines must be
            # filtered out before parsing.
            if not stripped:
                continue
            data.append(json.loads(stripped))
    return data

In [77]:
def extract_numerical_data(data):
    """
    Collects the metric columns of the evaluation records into per-field lists.

    Every record contributes exactly one float to every field; a field that is
    absent from a record is recorded as 0.0. Values are converted with
    ``float()``, so numeric strings are accepted.

    Args:
        data (list): Dictionaries loaded from the JSONL file, one per test case.

    Returns:
        numerical_data (dict): Maps each metric name to the list of its values,
            in the same order as the entries of ``data``.
    """
    metric_names = (
        "precision",
        "recall",
        "f1",
        "onto_conf",
        "rel_halluc",
        "sub_halluc",
        "obj_halluc",
    )
    numerical_data = {name: [] for name in metric_names}

    # Record-major iteration: all fields of one entry are converted before
    # moving on to the next entry.
    for entry in data:
        for name in metric_names:
            numerical_data[name].append(float(entry.get(name, 0.0)))

    return numerical_data

In [78]:
def calculate_averages(numerical_data):
    """
    Computes the arithmetic mean of every metric list.

    The result always contains the seven standard ``avg_*`` keys, each
    defaulting to 0.0. A field whose list is empty keeps that default; any
    additional field present in ``numerical_data`` is added under
    ``avg_<field>`` only when its list is non-empty.

    Args:
        numerical_data (dict): Maps metric names to lists of numeric values.

    Returns:
        averages (dict): Maps ``avg_<metric>`` to the mean of that metric.
    """
    default_metrics = (
        "precision",
        "recall",
        "f1",
        "onto_conf",
        "rel_halluc",
        "sub_halluc",
        "obj_halluc",
    )
    averages = {f"avg_{metric}": 0.0 for metric in default_metrics}

    # Only non-empty lists overwrite the defaults; this also avoids a
    # division by zero for fields without samples.
    averages.update({
        f"avg_{field}": sum(samples) / len(samples)
        for field, samples in numerical_data.items()
        if samples
    })

    return averages


In [79]:
def save_to_jsonl(data, output_filepath):
    """
    Writes each record as one JSON document per line (JSONL).

    The target file is opened in write mode, so any existing content is
    replaced. Every record is followed by a newline.

    Args:
        data (list): Dictionaries to serialise, written in iteration order.
        output_filepath (str): Destination path of the JSONL file.
    """
    with open(output_filepath, mode='w', encoding='utf-8') as sink:
        for row in data:
            json.dump(row, sink)
            sink.write("\n")

In [80]:
def process_multiple_files(files, output_filepath):
    """
    Builds one averaged-statistics record per input file and saves them all.

    For every ``(filepath, ontology)`` pair the JSONL file is read, its metric
    columns are extracted and averaged, and the resulting record is tagged with
    ``"onto"`` (the ontology name) and ``"type": "all_test_cases"``. Only this
    all-test-cases summary is produced; no subset summary is computed.

    Args:
        files (list): A list of tuples where each tuple contains:
                      - the filepath to the file
                      - the ontology name (e.g., "1_movie", "2_music")
        output_filepath (str): The path to the output JSONL file.
    """
    all_results = []

    for filepath, ontology in files:
        # read -> extract metric lists -> average
        per_file_metrics = extract_numerical_data(read_jsonl_file(filepath))
        summary = calculate_averages(per_file_metrics)

        # Tag the record so rows from different ontologies can be told apart
        # in the combined output file.
        summary.update({"onto": ontology, "type": "all_test_cases"})
        all_results.append(summary)

    # All records go into a single JSONL file, one line per input file.
    save_to_jsonl(all_results, output_filepath)

In [50]:
# Example usage: summarise a single statistics file.
#
# The second tuple element is copied verbatim into the "onto" field of the
# output record by process_multiple_files, so it must name the same ontology
# as the input file (12_monument here).
# NOTE(review): the input directory is "without_GT_stats" while the output
# file name says "with_GT" -- confirm which of the two is intended.
files = [
    (
        '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/evaluation_statistics/without_GT_stats/ont_12_monument_llm_stats_improved_without_batch_without_quant5.jsonl',
        '12_monument',
    ),
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/overall_avg_Statistics/Llama/Llama_overall_averages_with_GT_12_monument_without_batch_without_quant5.jsonl'

In [42]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for the
# monument statistics file -- confirm the input path exists before re-running.
process_multiple_files(files, OUTPUT_FILEPATH)

FileNotFoundError: [Errno 2] No such file or directory: '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/evaluation_statistics/without_GT_stats/ont_12_monument_llm_stats_improved_without_batch_without_quant5.jsonl'

In [43]:
# Example usage: summarise the wikidata "7_space" statistics file.
#
# The ontology label is a plain string; it must not carry its own quote
# characters, because it is written verbatim into the "onto" field of the
# output record.
files = [
    (
        "/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/evaluation_statistics/without_gt_Quant_batch_stats/ont_7_space_llm_stats_improved.jsonl",
        '7_space',
    ),
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/overall_avg_Statistics/Llama/Llama_overall_averages_without_GT_7_space_latest.jsonl'

In [44]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
process_multiple_files(files, OUTPUT_FILEPATH)

In [74]:
# DBpedia-WebNLG statistics, "with_missing_GT" variant: one input file per
# ontology, all living in the same directory and following the same
# "ont_<ontology>_llm_stats_improved.jsonl" naming scheme.
files = [
    (
        '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/evaluation_statistics/with_missing_GT/'
        f'ont_{ontology}_llm_stats_improved.jsonl',
        ontology,
    )
    for ontology in (
        '1_university',
        '2_musicalwork',
        '3_airport',
        '4_building',
        '5_athlete',
        '6_politician',
        '7_company',
        '8_celestialbody',
        '9_astronaut',
        '10_comicscharacter',
        '11_meanoftransportation',
        '12_monument',
        '13_food',
        '14_writtenwork',
        '15_sportsteam',
        '16_city',
        '17_artist',
        '18_scientist',
        '19_film',
    )
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/overall_avg_Statistics/Llama/with_missing_GT/Llama_overall_averages_with_missing_GT.jsonl'

In [75]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
process_multiple_files(files, OUTPUT_FILEPATH)

In [34]:
# Single-file run: the "_test" statistics file of the 12_monument ontology.
files = [
    (
        "/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/evaluation_statistics/without_gt_Quant_batch_stats/ont_12_monument_llm_stats_improved_test.jsonl",
        '12_monument',
    ),
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/overall_avg_Statistics/Llama/Llama_overall_averages_without_GT_12_monument_llm_stats_improved_test.jsonl'

In [35]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
process_multiple_files(files, OUTPUT_FILEPATH)

In [None]:
#### This is for the runs that include those GT entries which have no triples

In [86]:
# Wikidata statistics, "without_missing_GT" variant. All inputs share one
# directory and the "ont_<ontology>_llm_stats_improved.jsonl" naming scheme.
#
# Excluded from this run (add them to the tuple below to re-enable):
#   2_music, 4_book, 5_military, 9_nature, 10_culture
files = [
    (
        '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/evaluation_statistics/without_missing_GT/'
        f'ont_{ontology}_llm_stats_improved.jsonl',
        ontology,
    )
    for ontology in (
        '1_movie',
        '3_sport',
        '6_computer',
        '7_space',
        '8_politics',
    )
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/overall_avg_Statistics/Llama/without_missing_GT/Llama_overall_averages_without_missing_GT.jsonl'

In [87]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
process_multiple_files(files, OUTPUT_FILEPATH)

In [32]:
# DBpedia-WebNLG statistics, "without_gt_Quant_batch_stats" variant. All inputs
# share one directory and the "ont_<ontology>_llm_stats_improved.jsonl" naming
# scheme; the order below is the order of the records in the output file.
files = [
    (
        '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/evaluation_statistics/without_gt_Quant_batch_stats/'
        f'ont_{ontology}_llm_stats_improved.jsonl',
        ontology,
    )
    for ontology in (
        '11_meanoftransportation',
        '12_monument',
        '2_musicalwork',
        '3_airport',
        '6_politician',
        '7_company',
        '8_celestialbody',
    )
]

OUTPUT_FILEPATH = '/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/dbpedia_webnig/overall_avg_Statistics/Llama/Llama_overall_averages_without_gt_Quant_batch_stats.jsonl'

In [33]:
# Aggregate every file listed in `files` and write the per-ontology averages to
# OUTPUT_FILEPATH. Uses whichever `files` / `OUTPUT_FILEPATH` were assigned most
# recently in the kernel.
process_multiple_files(files, OUTPUT_FILEPATH)