Parse the output files to extract all ratings and the average ratings

In [1]:
import json
import pandas as pd
import re


def _image_stem(image_path):
    """Return the file name of ``image_path`` up to (not including) its first dot."""
    # Accept both Windows ("\\") and POSIX ("/") separators so that the same
    # image gets the same name regardless of where the JSONL file was produced.
    return re.split(r"[\\/]", image_path)[-1].split(".")[0]


def _parse_ratings_file(jsonl_file, expected_ratings):
    """
    Parse one JSONL file into ``{(image, k): rating}``, preserving file order.

    ``k`` is the 0-based position of the rating among all ratings found for
    that image in this file; it is what lets ratings from different files be
    lined up by image instead of by raw row position.
    """
    ratings_by_key = {}
    seen_per_image = {}
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            entry = json.loads(line)
            image_name = _image_stem(entry["image_path"])

            matches = re.findall(r"Rating:\s*(\d+)", entry["output"])
            # No parsable rating: keep the image present with empty placeholders.
            ratings = [int(m) for m in matches] if matches else [None] * expected_ratings

            for rating in ratings:
                k = seen_per_image.get(image_name, 0)
                seen_per_image[image_name] = k + 1
                ratings_by_key[(image_name, k)] = rating
    return ratings_by_key


def extract_ratings(jsonl_files, ratings_excel, avg_ratings_excel, rating_columns, avg_columns,
                    expected_ratings=10):
    """
    Read JSONL files, extract every "Rating: <n>" value and save raw and averaged ratings.

    Each JSONL line must be an object with an "image_path" and an "output" key.
    Ratings from the different files are aligned on (image name, k-th rating of
    that image), so a file that yields fewer or more ratings for some image
    cannot shift the ratings of the following images onto the wrong image.
    Missing combinations are left empty and are ignored by the averages.

    :param jsonl_files: paths of the model-generated output files, one per rating aspect.
    :param ratings_excel: Excel file to save all model-generated ratings.
    :param avg_ratings_excel: Excel file to save the average rating of each image
        for every rating aspect.
    :param rating_columns: column name for each file in ``jsonl_files`` (same order).
    :param avg_columns: column name of the averaged rating for each file (same order).
    :param expected_ratings: number of empty placeholder rows recorded for an entry
        whose output contains no "Rating: <n>" match (default 10, as before).

    :return: None. Two xlsx files are written: all ratings and per-image averages.
    :raises ValueError: if no files are given or ``avg_columns`` does not have one
        name per file.
    """
    jsonl_files = list(jsonl_files)
    if not jsonl_files:
        raise ValueError("No JSONL files were provided.")
    if len(avg_columns) != len(jsonl_files):
        raise ValueError(
            f"Expected {len(jsonl_files)} average column names, got {len(avg_columns)}."
        )

    per_file = [_parse_ratings_file(path, expected_ratings) for path in jsonl_files]

    # Union of all (image, k) keys in first-seen order; dict.fromkeys keeps the
    # original file order, so aligned inputs produce the same rows as before.
    row_keys = list(dict.fromkeys(key for parsed in per_file for key in parsed))

    table = {"image": [image for image, _ in row_keys]}
    for idx, parsed in enumerate(per_file):
        table[rating_columns[idx]] = [parsed.get(key) for key in row_keys]
    ratings_df = pd.DataFrame(table)

    # A column holding only placeholders is inferred as object dtype; make every
    # rating column numeric so that the mean below is always well defined.
    for column in ratings_df.columns[1:]:
        ratings_df[column] = pd.to_numeric(ratings_df[column], errors="coerce")

    # Compute average rating per image for each rating column (NaN is skipped)
    avg_df = ratings_df.groupby("image", as_index=False).mean()
    avg_df.columns = ["image"] + list(avg_columns)

    # Save
    ratings_df.to_excel(ratings_excel, index=False)
    avg_df.to_excel(avg_ratings_excel, index=False)
    print(f"Results saved to {ratings_excel} and {avg_ratings_excel}")

In [3]:
# Model outputs of the temperature-0.3 run, one JSONL file per rating aspect.
relevance_jsonl = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\output_all_320_temperature_0.3\output_Relevance_all_320.jsonl"
arousal_jsonl = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\output_all_320_temperature_0.3\output_Arousal_all_320.jsonl"
valence_jsonl = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\output_all_320_temperature_0.3\output_Valence_all_320.jsonl"

# Destination workbooks: every extracted rating, and the per-image averages.
all_ratings_xlsx = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\parsed_output_all_320_temperature_0.3\Output_LLaVa_Ratings_all_320.xlsx"
avg_ratings_xlsx = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\parsed_output_all_320_temperature_0.3\Output_LLaVa_Avg_Ratings_all_320.xlsx"

# Column names are listed in the same order as the JSONL files above.
aspect_columns = ["Relevance", "Arousal", "Valence"]
aspect_avg_columns = ["Avg_Relevance", "Avg_Arousal", "Avg_Valence"]

extract_ratings(
    [relevance_jsonl, arousal_jsonl, valence_jsonl],
    all_ratings_xlsx,
    avg_ratings_xlsx,
    aspect_columns,
    aspect_avg_columns,
)

Results saved to C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\parsed_output_all_320_temperature_0.3\Output_LLaVa_Ratings_all_320.xlsx and C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\parsed_output_all_320_temperature_0.3\Output_LLaVa_Avg_Ratings_all_320.xlsx


Merge the average ratings file with the human-encoded ratings file

In [1]:
import pandas as pd

def merge_xlsx_columns(avg_ratings_file, raw_data_file, output_file):
    """
    Append the human-encoded ratings to the model-generated average ratings.

    The second, third and fourth columns of the raw data sheet are copied, row
    by row, into three new columns of the model sheet, and the result is saved.

    :param avg_ratings_file: xlsx file with the model-generated data
    :param raw_data_file: xlsx file with the raw data (human encoded)
    :param output_file: path of the merged xlsx file to write

    :return: None; a merged xlsx file is written to ``output_file``.
    :raises ValueError: if the two sheets do not have the same number of rows.
    """
    human_columns = ["Raw_Avg_Relevance", "Raw_Avg_Arousal", "Raw_Avg_Valence"]

    model_df = pd.read_excel(avg_ratings_file)
    human_df = pd.read_excel(raw_data_file)

    # The copy below pairs rows purely by position, so the row counts must agree.
    # NOTE(review): this presumably also relies on both sheets listing the images
    # in the same order - confirm against the raw data file.
    if len(model_df) != len(human_df):
        raise ValueError("No match.")

    # Columns 1..3 (0-based) of the raw sheet become the three "Raw_Avg_*" columns.
    model_df[human_columns] = human_df.iloc[:, 1:4].to_numpy()

    # Save
    model_df.to_excel(output_file, index=False)
    print(f"Merged file saved as {output_file}")


In [2]:
# Inputs: the model's per-image averages and the human-encoded ratings workbook.
model_avg_xlsx = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\parsed_output_all_320_temperature_0.3\Output_LLaVa_Avg_Ratings_all_320.xlsx"
human_ratings_xlsx = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\Raw\aci_database_image_ratings.xlsx"

# Output: model and human averages side by side.
merged_xlsx = r"C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\merged_avg_ratings_llava_human_all_320.xlsx"

merge_xlsx_columns(model_avg_xlsx, human_ratings_xlsx, merged_xlsx)

Merged file saved as C:\Users\86158\Desktop\Research\LlaVa_Image_Encoder_Eval\Data\with_temperature\merged_avg_ratings_llava_human_all_320.xlsx
