In [1]:
def get_all_values_df(model_answers_folder_path, 
                      cosine_sim_folder_path, 
                      start_day='2020-01-01', 
                      start_hour='00:00:00', 
                      end_day='2030-12-31', 
                      end_hour='23:59:59'):
    """
    Build one DataFrame of model answers enriched with cosine similarity values.

    The JSONL files of both folders whose names fall inside the time interval
    are loaded and stacked; the last column of the stacked cosine similarity
    data is then copied into the model answers DataFrame as
    'cosine_similarity_between_answers'.

    Parameters:
        model_answers_folder_path (str): Folder holding the model answers JSONL files.
        cosine_sim_folder_path (str): Folder holding the cosine similarity JSONL files.
        start_day (str): First day of the interval, 'YYYY-MM-DD' (default '2020-01-01').
        start_hour (str): Time on the first day, 'HH:MM:SS' (default '00:00:00').
        end_day (str): Last day of the interval, 'YYYY-MM-DD' (default '2030-12-31').
        end_hour (str): Time on the last day, 'HH:MM:SS' (default '23:59:59').

    Returns:
        pd.DataFrame: The stacked model answers with the extra
        'cosine_similarity_between_answers' column.
    """
    # get_files_in_interval expects (folder, start_day, end_day, start_hour, end_hour)
    interval = (start_day, end_day, start_hour, end_hour)

    # Stack every model answers file of the interval
    model_answers_files = get_files_in_interval(model_answers_folder_path, *interval)
    all_model_answers_df = concatenate_df(model_answers_files)

    # Stack every cosine similarity file of the interval
    cosine_sim_files = get_files_in_interval(cosine_sim_folder_path, *interval)
    all_cosine_sim_df = concatenate_df(cosine_sim_files)

    # Both frames carry a fresh 0..n-1 index, so this assignment pairs rows by position
    cosine_values = all_cosine_sim_df.iloc[:, -1]
    all_model_answers_df['cosine_similarity_between_answers'] = cosine_values

    return all_model_answers_df

In [2]:
# Function for concatenating DataFrames from multiple JSONL files into a single DataFrame
def concatenate_df(jsonl_paths_list):
    """
    Load each JSONL file of a list and stack the results into one DataFrame.

    Parameters:
        jsonl_paths_list (list): Paths of the JSONL files, stacked in list order.

    Returns:
        pd.DataFrame: All rows of all files, re-indexed from 0.
    """
    frames = []
    for path in jsonl_paths_list:
        # One DataFrame per file
        frames.append(convert_jsonl_into_df(path))

    # Stack the frames, then discard the per-file indexes
    stacked = pd.concat(frames)
    return stacked.reset_index(drop=True)

In [3]:
# Function to retrieve all files in a timestamp range
def get_files_in_interval(folder_path, start_day, end_day, start_hour, end_hour):
    """
    List the files of a folder whose name carries a timestamp inside an interval.

    A file is considered only if its name contains a
    'YYYY-MM-DD_HH:MM:SS' timestamp; the first such occurrence is used.

    Parameters:
        folder_path (str): Folder to scan (sub-folders are not explored).
        start_day (str): First day of the interval, 'YYYY-MM-DD'.
        end_day (str): Last day of the interval, 'YYYY-MM-DD'.
        start_hour (str): Time on the first day, 'HH:MM:SS'.
        end_hour (str): Time on the last day, 'HH:MM:SS'.

    Returns:
        list: Paths (folder_path joined with the file name) of the files whose
        timestamp lies between the two bounds, both included. The list is
        sorted by timestamp, then by file name.

    Raises:
        ValueError: If a bound does not follow the expected format.
    """
    # Convert start and end strings to datetime objects
    start_datetime = datetime.strptime(f'{start_day} {start_hour}', '%Y-%m-%d %H:%M:%S')
    end_datetime = datetime.strptime(f'{end_day} {end_hour}', '%Y-%m-%d %H:%M:%S')

    # Compile regex pattern to match date and time in file names
    pattern = re.compile(r'\d{4}-\d{2}-\d{2}_\d{2}:\d{2}:\d{2}')

    timestamped_files = []
    for file_name in os.listdir(folder_path):
        match = pattern.search(file_name)
        if match is None:
            continue

        try:
            file_datetime = datetime.strptime(match.group(), '%Y-%m-%d_%H:%M:%S')
        except ValueError:
            # The digits have the right layout but are not a real date/time
            # (e.g. month 13): such a file cannot belong to the interval
            continue

        if start_datetime <= file_datetime <= end_datetime:
            timestamped_files.append((file_datetime, file_name))

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible, so files stacked from two folders line up the same way
    timestamped_files.sort()

    return [os.path.join(folder_path, file_name) for _, file_name in timestamped_files]

In [4]:
# Function to convert a jsonl file into a pandas DataFrame
def convert_jsonl_into_df(jsonl_path):
    """
    Read a JSONL file (one JSON object per line) into a flat DataFrame.

    Parameters:
        jsonl_path (str): Path of the JSONL file, read as UTF-8.

    Returns:
        pd.DataFrame: One row per non-blank line. Nested objects are flattened
        by pd.json_normalize (nested keys become dot-separated column names).
        An empty file yields an empty DataFrame.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_path, encoding='utf-8') as f:
        # Each line is decoded exactly once; blank lines (for instance a
        # trailing empty line) carry no record and are skipped
        records = [json.loads(line) for line in f if line.strip()]

    # Json normalize will convert any semi-structured json data into a flat table
    return pd.json_normalize(records)

In [5]:
def answers_comparison_GPT_35_turbo(deployment_name, client, df_before_GPT, model_name, dataset_name, experience_timestamps):

    """
    Ask a chat model whether each model answer means the same as the dataset answer.

    For every row, a prompt is built from the context, the question and the two
    answers, and sent through `client`. The judgment ("Yes", "No", or "Error"
    when the request fails) is stored in the DataFrame and appended, together
    with the row data, as one JSON line to
    ./streamed_results/<simplified_model_name>/GPT_35_comparison/.

    Args:
        deployment_name: name of the model used for the answer comparison.
        client: object exposing `chat.completions.create(model=..., messages=..., max_tokens=...)`.
        df_before_GPT: DataFrame with the columns 'timestamp', 'dataset_context',
            'dataset_question', 'dataset_answer', 'model_answer', 'n_input_token',
            'n_output_token' and 'generation_time'. Rows are read by the labels
            0..len-1, so a default integer index is expected.
        model_name: name of the model that produced 'model_answer'; inserted in the prompt.
        dataset_name: currently unused by this function.
        experience_timestamps: (start_day, start_hour, end_day, end_hour), used to
            name the output JSONL file.

    Returns:
        The same DataFrame object as `df_before_GPT`, modified in place with a
        'GPT_35_judgment' column.

    Notes:
        The output folder is built from `simplified_model_name`, which is not a
        parameter and must exist at module level.
        NOTE(review): confirm this is intended rather than `model_name`.
    """

    # Record start time for the entire cycle
    start_time = time.time()

    # Get jsonl experience path containing model_answers in order to save GPT judgments in streaming
    start_day, start_hour, end_day, end_hour = experience_timestamps
    directory = os.path.dirname(f"./streamed_results/{simplified_model_name}/GPT_35_comparison/")
    # Create the directory if it doesn't already exist
    os.makedirs(directory, exist_ok=True)
    gpt_35_judgment_jsonl_path = f"./streamed_results/{simplified_model_name}/GPT_35_comparison/GPT_35_comparison_{start_day}_{start_hour}_{end_day}_{end_hour}_experience.jsonl"
    
    # Initialization of the prompt template
    prompt_template = (
        "Based on the provided context and the answers' relation to the question, "
        "determine if the two answers mean the same thing."
        "If they do, respond just with 'Yes'. If they do not, respond just with 'No'.\n"
        "Context: {dataset_context}\n"
        "Question: {dataset_question}\n"
        "Answer from dataset: {dataset_answer}\n"
        "Answer from {model_name}: {model_answer}\n"
        "Judgment:"
    )

    # Open the recording jsonl file (new lines are appended to an existing file)
    with open(gpt_35_judgment_jsonl_path, 'a+', encoding='utf-8') as f:
        
        # For index corresponding to the rows in the dataset
        for idx in range(len(df_before_GPT)):
    
            # Retrieving all values
            timestamp = df_before_GPT['timestamp'][idx]
            dataset_context = df_before_GPT['dataset_context'][idx]
            dataset_question = df_before_GPT['dataset_question'][idx]
            dataset_answer = df_before_GPT['dataset_answer'][idx]
            model_answer = df_before_GPT['model_answer'][idx]
            n_input_tokens = df_before_GPT['n_input_token'][idx]
            n_output_tokens = df_before_GPT['n_output_token'][idx]
            output_generation_time = df_before_GPT['generation_time'][idx]
    
            # Formatting the prompt template
            comparison_prompt = prompt_template.format(
                dataset_context=dataset_context,
                dataset_question=dataset_question,
                dataset_answer=dataset_answer,
                model_answer=model_answer,
                model_name=model_name
            )
    
            # Creating the message to pass as prompt to GPT 3.5 turbo
            message = [
                {
                    "role": "system",
                    "content": "You are a friendly chatbot who always work out its own judgment before rushing to a conclusion."
                },
                {"role": "user", "content": comparison_prompt},
             ]
    
            # Checking if GPT-3.5 turbo consider both answer to be similar or not
            try:
                response = client.chat.completions.create(model=deployment_name, messages=message, max_tokens=1000)
                response_text = response.choices[0].message.content
                # Anything that does not start with "Yes" counts as "No"
                gpt_35_judgment = "Yes" if re.match(r'^Yes', response_text) else "No"
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C can still stop the loop;
                # a failed request is recorded and the remaining rows are processed
                gpt_35_judgment = "Error"
                print(f"Error on row {idx}: {exc}")

            # Adding the result to the df
            df_before_GPT.at[idx, 'GPT_35_judgment'] = gpt_35_judgment
    
            # Collecting data for a single line in the jsonl
            model_line_jsonl = {
                 "timestamp":timestamp,
                 "generation_time":output_generation_time,
                 "n_input_token":str(n_input_tokens), 
                 "n_output_token":str(n_output_tokens), 
                 "dataset_context":dataset_context,
                 "dataset_question":dataset_question, 
                 "dataset_answer":dataset_answer,
                 "model_answer":model_answer,
                 "gpt_35_judgment":gpt_35_judgment
            }
    
            json.dump(model_line_jsonl, f)
            f.write('\n')
            # Push the line to disk now so completed judgments survive an interruption
            f.flush()

    # Record end time for the entire cycle
    end_time = time.time()
    total_judgment_time = end_time - start_time
    print(f"Total judgment time for GPT 3.5 turbo: {time_conversion(total_judgment_time)}")

    # Returning the df containing all information
    return df_before_GPT

In [6]:
# Function to retrieve all files in a timestamp range
def get_files_in_interval(folder_path, start_day, end_day, start_hour, end_hour):
    """
    List the files of a folder whose name carries a timestamp inside an interval.

    A file is considered only if its name contains a
    'YYYY-MM-DD_HH:MM:SS' timestamp; the first such occurrence is used.

    Parameters:
        folder_path (str): Folder to scan (sub-folders are not explored).
        start_day (str): First day of the interval, 'YYYY-MM-DD'.
        end_day (str): Last day of the interval, 'YYYY-MM-DD'.
        start_hour (str): Time on the first day, 'HH:MM:SS'.
        end_hour (str): Time on the last day, 'HH:MM:SS'.

    Returns:
        list: Paths (folder_path joined with the file name) of the files whose
        timestamp lies between the two bounds, both included. The list is
        sorted by timestamp, then by file name.

    Raises:
        ValueError: If a bound does not follow the expected format.
    """
    # Convert start and end strings to datetime objects
    start_datetime = datetime.strptime(f'{start_day} {start_hour}', '%Y-%m-%d %H:%M:%S')
    end_datetime = datetime.strptime(f'{end_day} {end_hour}', '%Y-%m-%d %H:%M:%S')

    # Compile regex pattern to match date and time in file names
    pattern = re.compile(r'\d{4}-\d{2}-\d{2}_\d{2}:\d{2}:\d{2}')

    timestamped_files = []
    for file_name in os.listdir(folder_path):
        match = pattern.search(file_name)
        if match is None:
            continue

        try:
            file_datetime = datetime.strptime(match.group(), '%Y-%m-%d_%H:%M:%S')
        except ValueError:
            # The digits have the right layout but are not a real date/time
            # (e.g. month 13): such a file cannot belong to the interval
            continue

        if start_datetime <= file_datetime <= end_datetime:
            timestamped_files.append((file_datetime, file_name))

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible, so files stacked from two folders line up the same way
    timestamped_files.sort()

    return [os.path.join(folder_path, file_name) for _, file_name in timestamped_files]

In [7]:
# This function reads a file containing test specifics, extracts the start and end times, and returns them
def get_start_end_times(tests_specifics_path):
    """
    Extract the most recent start and end timestamps from a test specifics file.

    The file is scanned from the bottom: the last line starting with "Until"
    gives the end, the last line starting with "Test on" gives the start. Each
    of these lines is expected to look like '<label> = <day> <hour>'.

    Parameters:
        tests_specifics_path (str): Path of the test specifics file, read as UTF-8.

    Returns:
        tuple: (start_day, start_hour, end_day, end_hour). A pair is
        (None, None) when no line with the corresponding prefix exists.

    Raises:
        IndexError, ValueError: If a matching line does not follow the
        '<label> = <day> <hour>' layout.
    """
    # Same encoding as the one used when the score is appended to this file
    with open(tests_specifics_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    def last_day_and_hour(prefix):
        # Walk the file backwards so the most recent entry is found first
        for line in reversed(lines):
            if line.startswith(prefix):
                day, hour = line.split(" = ")[1].split()
                return day, hour
        return None, None

    # Find end day and hour
    end_day, end_hour = last_day_and_hour("Until")

    # Find start day and hour
    start_day, start_hour = last_day_and_hour("Test on")

    # Return start and end times
    return start_day, start_hour, end_day, end_hour


In [8]:
def save_plots(start_day, end_day, start_hour, end_hour, simplified_model_name, df_final):
    """
    Save plots related to the test experience.

    Four PNG files are written to
    ./streamed_results/<simplified_model_name>/<start_day>_<start_hour>_<end_day>_<end_hour>_experience/:
        - cos_sim_distribution.png: histogram of all cosine similarity values.
        - cos_sim_distrib_on_GPT35_data.png: density of the values for "Yes" vs "No" judgments.
        - cos_sim_outliers_with_respect_GPT35.png: boxplot per judgment, annotated with
          the number of "No" rows that are upper outliers and above 0.9.
        - cos_sim_distrib_slices.png: histograms per judgment for the slices
          [0.7, 0.8), [0.8, 0.9) and [0.9, 1), rows judged "Error" excluded.

    Parameters:
        start_day: Start day of the test.
        end_day: End day of the test.
        start_hour: Start hour of the test.
        end_hour: End hour of the test.
        simplified_model_name (str): Simplified name of the model.
        df_final (pd.DataFrame): DataFrame containing the final data; the columns
            'cosine_similarity_between_answers' and 'GPT_35_judgment' are read.
    """
    similarity_col = 'cosine_similarity_between_answers'
    judgment_col = 'GPT_35_judgment'

    # Create directory for saving test plots (no error if it already exists)
    output_dir = f'./streamed_results/{simplified_model_name}/{start_day}_{start_hour}_{end_day}_{end_hour}_experience/'
    os.makedirs(output_dir, exist_ok=True)

    # Plotting the distribution of all cosine similarity values
    plt.figure(figsize=(8, 6))
    plt.hist(df_final[similarity_col], bins=30, color='skyblue', edgecolor='black')
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.title('Distribution of Cosine Similarity Values')
    plt.savefig(f'{output_dir}cos_sim_distribution.png')

    # Filter data where GPT 3.5 judgment is "Yes" or "No"
    yes_df = df_final[df_final[judgment_col] == "Yes"]
    no_df = df_final[df_final[judgment_col] == "No"]

    # Set up the matplotlib figure
    plt.figure(figsize=(10, 5))

    # Plot the distribution of cosine similarity values when GPT 3.5 says "Yes"
    sns.histplot(yes_df[similarity_col], color="blue", label="Yes", kde=True, element="step", stat="density")

    # Plot the distribution of cosine similarity values when GPT 3.5 says "No"
    sns.histplot(no_df[similarity_col], color="red", label="No", kde=True, element="step", stat="density")

    # Add some plot aesthetics
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Density')
    plt.title('Cosine Similarity Value Distributions for GPT 3.5 Judgments "Yes" and "No"')
    plt.legend()
    plt.savefig(f'{output_dir}cos_sim_distrib_on_GPT35_data.png')

    # Upper outliers (above Q3 + 1.5 * IQR) among the "No" rows that also exceed 0.9
    Q1 = no_df[similarity_col].quantile(0.25)
    Q3 = no_df[similarity_col].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    outliers = no_df[(no_df[similarity_col] > upper_bound) & (no_df[similarity_col] > 0.9)]
    number_of_outliers = len(outliers)

    # Calculate total number of judgments when GPT 3.5 says "No"
    total_no_judgments = no_df.shape[0]

    # Ratio between outliers over sim_cos 0.9 and total "No" answers by GPT 3.5;
    # with no "No" answer at all there is nothing to divide by, so report 0%
    if total_no_judgments:
        percentage = number_of_outliers * 100 / total_no_judgments
    else:
        percentage = 0.0
    
    # Draw the boxplot
    plt.figure(figsize=(10, 8))
    sns.boxplot(x=judgment_col, y=similarity_col, data=df_final)
    plt.xlabel(f"GPT-3.5 judgment\nOutliers when GPT-3.5 says 'No' with Cosine Similarity > 0.9: {number_of_outliers}/{total_no_judgments} ({round(percentage,2)}%)")
    plt.ylabel("Cosine Similarity")
    plt.title('Distribution of Cosine Similarity by GPT-3.5 Judgment')
    plt.tight_layout()
    plt.savefig(f'{output_dir}cos_sim_outliers_with_respect_GPT35.png')

    # Filter out rows with 'Error' in the judgment column
    filtered_df = df_final[df_final[judgment_col] != 'Error']
    
    # Define the slices
    slices = [(0.7, 0.8), (0.8, 0.9), (0.9, 1)]
    
    # Create subplots
    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
    
    # Loop through each slice and plot histogram
    for ax, (lower, upper) in zip(axs, slices):
        
        # Keep the rows whose cosine similarity lies in [lower, upper)
        in_slice = (filtered_df[similarity_col] >= lower) & (filtered_df[similarity_col] < upper)
        cosine_slice = filtered_df[in_slice]
        
        # Grouping the data by GPT-3.5 judgment
        grouped_df = cosine_slice.groupby(judgment_col)[similarity_col]
        
        # Plotting the distribution of cosine similarity for each GPT-3.5 judgment category
        grouped_df.plot(kind='hist', alpha=0.5, bins=20, ax=ax, legend=True)
        ax.set_xlabel('Cosine Similarity')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution of Cosine Similarity for {lower} <= cos sim < {upper}')
        ax.axvline(x=lower, color='red', linestyle='--', linewidth=1, label=f'Cosine threshold ({lower})')
        ax.axvline(x=upper, color='red', linestyle='--', linewidth=1, label=f'Cosine threshold ({upper})')
        ax.legend()
        ax.grid(True)
    
    plt.savefig(f'{output_dir}cos_sim_distrib_slices.png')

In [9]:
# Function for appending the model score to the test specifics file
def write_test_experience_score(tests_specifics_path, df_final):
    """
    Append the model score, followed by a separator line, to the test specifics file.

    Parameters:
        tests_specifics_path (str): Path of the file to append to (UTF-8).
        df_final (pd.DataFrame): Final data, passed to get_model_performance
            to obtain the score.
    """
    # Compute the score first so that nothing is written if it fails
    score = get_model_performance(df_final)

    # Write to the path received as argument
    with open(tests_specifics_path, "a", encoding='utf-8') as f:
        f.write(f"Model score: {score}\n")
        f.write("_________________________________________________________________________________________________________________________\n")

In [10]:
# Function to calculate the model performance
def get_model_performance(df_final, similarity_threshold=0.85):
    """
    Compute the share of answers considered correct.

    An answer counts as correct when its cosine similarity reaches the
    threshold, or when it is below the threshold but judged "Yes".

    Parameters:
        df_final (pd.DataFrame): Data with the columns
            'cosine_similarity_between_answers' and 'GPT_35_judgment'.
        similarity_threshold (float): Minimum cosine similarity accepted
            without looking at the judgment (default is 0.85).

    Returns:
        float: Number of correct answers divided by the number of rows.
    """
    similarity = df_final['cosine_similarity_between_answers']

    # Calculate total number of samples
    total_values = len(df_final)

    # Count the number of samples with high similarity
    high_similarity_count = (similarity >= similarity_threshold).sum()

    # Count the number of low similarity samples with 'Yes' judgment
    low_similarity_judgments = df_final.loc[similarity < similarity_threshold, 'GPT_35_judgment']
    remaining_yes_count = low_similarity_judgments.eq('Yes').sum()

    # Calculate the performance score
    return (high_similarity_count + remaining_yes_count) / total_values

In [11]:
# Function for visualizing all statistical pngs saved in a folder
def visualize_images(folder_path):
    """
    Display every PNG file found directly inside a folder.

    Parameters:
        folder_path (str): Folder containing the PNG files.
    """
    # glob returns files in arbitrary order; sort for a reproducible display order
    image_files = sorted(glob.glob(folder_path + '/*.png'))

    # Visualize each image
    for img_path in image_files:
        # The context manager closes the underlying file once the image is shown
        with Image.open(img_path) as img:
            # Before calling show(), convert the image to 'RGB' if it has an alpha channel ('RGBA')
            displayable = img.convert('RGB') if img.mode == 'RGBA' else img
            displayable.show()

In [12]:
# Function to convert a duration in seconds into the format days:hours:minutes:seconds
def time_conversion(seconds):
    """
    Format a duration given in seconds as 'days:hours:minutes:seconds'.

    Parameters:
        seconds: Duration in seconds.

    Returns:
        str: Whole days, hours and minutes, then the remaining seconds
        rounded to two decimals, separated by colons.
    """
    # Peel off each unit in turn, carrying the remainder to the next one
    whole_days, remainder = divmod(seconds, 24 * 3600)
    whole_hours, remainder = divmod(remainder, 3600)
    whole_minutes, remainder = divmod(remainder, 60)

    # Whatever is left is below one minute
    leftover_seconds = round(remainder, 2)

    return f"{int(whole_days)}:{int(whole_hours)}:{int(whole_minutes)}:{leftover_seconds}"