#Libraries

In [None]:
import pandas as pd
import numpy as np
import json, ast, time
import sys
from tqdm.notebook import tqdm
from scipy.stats import pearsonr, spearmanr,kendalltau
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

data_path = "<Path of the Excel Files>"

#Reading Dataset

In [None]:
'''
LLM1- PalM2
LLM2- GPT3.5
LLM3- Llama2
'''
# do the same for all LLMs
# The workbook name is appended directly to `data_path`, so `data_path` has to
# end with a path separator.
dataset_file = data_path+"llm1_dataset.xlsx"
data = pd.read_excel(dataset_file)
data.head()

#GPT Evaluation (Macro)

In [None]:
%%capture
!pip install openai==0.28

In [None]:
import openai

# openai.api_key ="<OPENAI-KEY>"  uncomment it


def get_gpt_output(inp, role='user', temp=0.0, model="gpt-4o",
                   max_api_errors=10, retry_delay=5):
    """Send one chat message to the OpenAI API and return the reply text.

    Uses the legacy ``openai.ChatCompletion`` interface (openai==0.28).
    Failed requests are retried after ``retry_delay`` seconds:

    * rate-limit, service-unavailable and timeout errors are retried
      without an upper bound;
    * ``openai.error.APIError`` is re-raised once it has occurred more than
      ``max_api_errors`` times.

    Args:
        inp: Text placed in the ``content`` field of the single message.
        role: Chat role attached to the message.
        temp: Sampling temperature passed to the API.
        model: Name of the chat model to query.
        max_api_errors: Number of ``APIError`` failures tolerated before the
            error is propagated.
        retry_delay: Seconds to sleep before every retry.

    Returns:
        The ``content`` of the first choice in the completion.

    Raises:
        openai.error.APIError: after more than ``max_api_errors`` API errors.
    """
    message = [{'role': role, 'content': inp}]
    fail_count = 0

    while True:
        try:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=message,
                temperature=temp,
            )
            break
        except (
            openai.error.RateLimitError,
            openai.error.ServiceUnavailableError,
            # The timeout class in openai==0.28 is `Timeout`; the module has
            # no `TimeoutError` attribute, so naming it in an except clause
            # raises AttributeError as soon as that clause is evaluated.
            openai.error.Timeout,
        ) as e:
            print(e)
            time.sleep(retry_delay)
        except openai.error.APIError as e:
            print(e)
            fail_count += 1
            if fail_count > max_api_errors:
                raise
            time.sleep(retry_delay)

    return completion.choices[0].message.content


In [None]:
get_gpt_output("Tell me about yourself")

"Sure! I'm an AI language model created by OpenAI, designed to assist with a wide range of tasks by generating human-like text based on the input I receive. I can help answer questions, provide explanations, generate creative content, and much more. My knowledge is based on a diverse range of sources up until my last update in October 2023. How can I assist you today?"

In [None]:
## TELR Prompts Level 1 to 4
#
# The prompts are plain string templates: "{r1}", "{r2}" and "{r3}" are
# literal placeholders for the three reviews. They are deliberately NOT
# f-strings, because an f-string would be evaluated right here, where no
# review variables exist yet (NameError on a fresh kernel) or, worse, would
# freeze whatever stale reviews happen to be in scope into every prompt.
# Call `prompt_N.format(r1=..., r2=..., r3=...)` to fill in concrete reviews.

prompt_1 = """
        Using 3 reviews given below draft a meta review:
        1.Review#1\n< {r1} >\n
        2.Review#2\n< {r2} >\n
        3.Review#3\n< {r3} >

        """

prompt_2 = """
        Using the three reviews, generate a meta review by incorporat-
        ing core contributions, common strengths, common weaknesses,
        common suggestions for improvement, and missing references.
        Common strengths, Common weaknesses, and Common sugges-
        tions are strengths, weaknesses, and suggestion for improvement
        respectively that are common in at least 2 reviews. Three reviews
        are as follows:\n
        1.Review#1\n< {r1} >\n
        2.Review#2\n< {r2} >\n
        3.Review#3\n< {r3} >

        """

prompt_3 = """
      You are a meta-review assistant. Using three reviews given as
      Review#1, Review#2, and Review#3, give me a meta-review by
      answering:
      (a) Mention core contributions with common contributions
      first.
      (b) A common strength is a strength that is mentioned in at
      least 2 reviews as a strength. Mention common strengths.
      (c) A common weakness is a weakness that is mentioned
      in at least 2 reviews as a weakness. Mention common
      weaknesses.
      (d) A common suggestion for improvement is a suggestion
      that is mentioned in at least 2 reviews as a suggestion for
      improvement. Mention common improvements suggested.
      (e) State whether reviews refer to missing references or not.
      A listing of missing references is not required.
      Three reviews are as follows:\n
      1.Review#1\n< {r1}>\n
      2.Review#2\n< {r2} >\n
      3.Review#3\n< {r3} >

"""


prompt_4 = """
      Using three reviews given as Review#1, Review#2, and Review#3,
      as a meta-reviewer, your task is to draft a meta review
      by answering the following bulleted questions:
      - What is the summary of core contributions? Provide answer
      with supporting evidence.
      - Which common strengths are referred to in the reviews? A
      common strength is a strength that is mentioned in at least 2
      reviews as a strength. Support your answer with explanation.
      - What common weaknesses are described in the reviews? A
      common weakness is a weakness that is mentioned in at least 2
      reviews as a weakness. Give evidence in support of the reply.
      - What suggestions for improvement have been provided by
      three reviews? A common suggestion for improvement is a
      suggestion that is mentioned in at least 2 reviews as a suggestion
      for improvement. Explain the basis for the answer.
      - Do the reviews mention about missing references? Answer with
      explanation is desirable but listing of missing references is not
      required.
      Reviews are as below:\n
      1.Review#1\n< {r1} >\n
      2.Review#2\n< {r2} >\n
      3.Review#3\n< {r3} >

"""

# Ordered by prompt level: prompts[0] is level 1, prompts[3] is level 4.
prompts = [prompt_1, prompt_2, prompt_3, prompt_4]

In [None]:
#Here, we will generate the ratings for a particular LLM

# Prefix of the generated score columns (e.g. 'LLM3P1[S1]'). Change it to the
# identifier of the LLM whose outputs are currently loaded in `data`.
llm_label = 'LLM3'


def _parse_ratings(response, expected_len=3):
    """Turn the judge's reply into a list of `expected_len` scores.

    Only the outermost bracketed span of the reply is parsed, so code fences
    or stray text around the list do not abort a long (and paid) run. A reply
    without brackets is parsed as a whole.

    Raises:
        ValueError: if the parsed value is not a sequence of `expected_len`
            items. Errors from `ast.literal_eval` propagate unchanged.
    """
    start, end = response.find('['), response.rfind(']')
    candidate = response[start:end + 1] if 0 <= start < end else response.strip()
    parsed = ast.literal_eval(candidate)
    if not isinstance(parsed, (list, tuple)) or len(parsed) != expected_len:
        raise ValueError(
            f"Expected a list of {expected_len} scores, got: {response!r}"
        )
    return list(parsed)


all_dataframes = list()

# One pass per prompt level; `p` is the 1-based level and selects the
# `res_prompt{p}` column that holds the meta-review generated at that level.
for p, prompt in enumerate(prompts, start=1):
    ratings = list()
    for i in range(len(data)):
        # Positional access, so the loop does not depend on the index labels.
        r1 = data['Review1'].iloc[i]
        r2 = data['Review2'].iloc[i]
        r3 = data['Review3'].iloc[i]
        gmr = data[f'res_prompt{p}'].iloc[i]
        amr = data['Meta_Review'].iloc[i]

        # `prompt` is inserted verbatim; the reviews themselves are passed
        # separately in the "Reviews" section of the evaluation prompt.
        # NOTE(review): the output-format line says "MPSs" where the criteria
        # say "Meta-Reviews"; the text is left unchanged so that previously
        # generated scores stay comparable.
        evaluation_prompt = f"""
        Imagine you are a human annotator and you are tasked with evaluating the performance of a model in generating a meta-review based on three individual reviews. You are provided with the following:

        - The **prompt** that guided the generation of the meta-review.
        - The **three individual reviews** (r1, r2, r3) which the model used to generate the meta-review.
        - The **generated meta-review** (gmr) created by the model.
        - The **actual expert-written meta-review** (amr) that serves as a reference for what the meta-review should ideally look like.

        Please carefully examine all of this information and evaluate the model's performance based on the following three criteria. Provide a score on a Likert scale from 1 (poor) to 5 (excellent) for each aspect:

        1. **Adherence to instructions**: How well did the model follow the specific instructions given in the prompt? Consider how accurately the model addressed the tasks or questions posed in the prompt when generating the meta-review.
          - Score: [1-5]

        2. **Ability to create useful Meta-Reviews **: Evaluate the usefulness of the generated meta-review in terms of its practicality and effectiveness for someone preparing a comprehensive meta-review. Consider whether the generated content helps in synthesizing the individual reviews into a coherent and insightful summary.
          - Score: [1-5]

        3. **Matching against actual expert-written meta-reviews**: Assess the extent to which the model-generated meta-review aligns with the expert-written meta-review. This involves comparing the content, tone, insights, and overall quality.
          - Score: [1-5]

        ### Input
        **Prompt:**
        {prompt}

        **Reviews:**
        - Review 1: {r1}
        - Review 2: {r2}
        - Review 3: {r3}

        **Generated Meta-review:**
        {gmr}

        **Actual Meta-review:**
        {amr}

        ### Output

        ### Please provide your evaluation in numerical format in a list:

        [Adherence to instructions,Ability to create useful MPSs, Matching against actual expert-written meta-reviews ]


        Please dont include any other comments or explanations.
        """

        response = get_gpt_output(evaluation_prompt)
        # The reply is expected to be a list literal with the three scores.
        ratings.append(_parse_ratings(response))
        print(f"prompt:{p}", f"paper: {i+1}", response)

    # One row per paper, one column per evaluated statement (S1..S3).
    d = pd.DataFrame(ratings)
    d.columns = [f'{llm_label}P{p}[S1]', f'{llm_label}P{p}[S2]', f'{llm_label}P{p}[S3]']
    all_dataframes.append(d)

In [None]:
# Put the per-prompt score tables side by side (one block of S1..S3 columns
# per prompt level), keeping the original column names.
final_df = pd.concat(all_dataframes, axis='columns')
final_df

In [None]:
import os

## Change `sheet_name` to the LLM that is currently being evaluated.
# Writing straight to the path recreates the workbook and would drop the
# sheets already stored for other LLMs, so an existing workbook is opened in
# append mode and only the sheet of the current LLM is (re)written.
macro_scores_path = data_path+'Macro_GPT_score.xlsx'
if os.path.exists(macro_scores_path):
    with pd.ExcelWriter(macro_scores_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        final_df.to_excel(writer, sheet_name = 'palm2', index = False)
else:
    final_df.to_excel(macro_scores_path, sheet_name = 'palm2', index = False)

In [None]:
# file_name = data_path+'Macro_GPT_score.xlsx'
# # Add a new sheet to an existing Excel file
# with pd.ExcelWriter(file_name, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
#     # Save the modified DataFrame to a new sheet
#     final_df.to_excel(writer, sheet_name = 'llama2', index = False)

## Normalizing Scores for Prompt Levels

We have human annotations for four aggregated samples; therefore, we need to average the four paper scores. S1, S2, and S3 represent Statement 1, Statement 2, and Statement 3, respectively.

In [None]:
# Raw per-paper GPT scores of one LLM; the sheet name selects which LLM's
# scores get normalised in the cells below.
score_workbook = data_path+'Macro_GPT_score.xlsx'
score = pd.read_excel(score_workbook, sheet_name='llama2')
score

In [None]:
# Number of consecutive rows that are averaged into one sample.
rows_per_sample = 4

# Initialize a new DataFrame to store the averaged values
new_score = pd.DataFrame()

# Loop through each column in the DataFrame
for column in score.columns:
    # Mean of every block of `rows_per_sample` consecutive rows. The range is
    # derived from the actual number of rows, so sheets longer than 40 rows
    # are not silently truncated and shorter ones do not produce NaN rows.
    new_score[column] = [
        score[column].iloc[start:start + rows_per_sample].mean()
        for start in range(0, len(score), rows_per_sample)
    ]

# new_score

In [None]:
# Rounding helper for the averaged scores
def custom_round(value):
    """Round ``value`` to a whole number, going up only above the half-way point.

    The fractional part is computed as ``value % 1``. If it is strictly
    greater than 0.5 the result is ``np.ceil(value)``; in every other case,
    including a fractional part of exactly 0.5, it is ``np.floor(value)``.

    NOTE(review): an earlier comment described the threshold as "0.50 or
    greater"; the code has always used a strict ``>`` comparison - confirm
    which one is intended.
    """
    fractional_part = value % 1
    rounds_up = fractional_part > 0.5
    return np.ceil(value) if rounds_up else np.floor(value)

# Apply custom rounding function to the DataFrame
# (`applymap` calls `custom_round` once per cell, so every averaged score is
# replaced by the value `custom_round` returns for it).
new_score = new_score.applymap(custom_round)
# new_score

In [None]:
# Bucket the column names by the tag inside their square brackets, i.e. the
# text after the last '[' with any ']' stripped from its ends.
suffix_groups = {}
for col in new_score.columns:
    tag = col.split('[')[-1].strip(']')
    suffix_groups.setdefault(tag, []).append(col)

# One output column per tag: the row-wise mean over all columns that share it.
final_score = pd.DataFrame({
    f'LLM1[{tag}]': new_score[members].mean(axis=1)
    for tag, members in suffix_groups.items()
})

final_score

In [None]:
# Rounding helper for the final scores: always rounds down.
# NOTE: this definition replaces any `custom_round` defined earlier in the
# session.
def custom_round(value):
    """Return ``np.floor(value)``."""
    return np.floor(value)

# Round every final score down. `np.floor` is applied to the whole DataFrame
# at once, which yields the same values as calling `custom_round` per cell.
final_score = np.floor(final_score)
final_score

Unnamed: 0,LLM1[S1],LLM1[S2],LLM1[S3]
0,4.0,4.0,3.0
1,4.0,3.0,2.0
2,4.0,3.0,2.0
3,4.0,4.0,3.0
4,3.0,3.0,2.0
5,4.0,4.0,3.0
6,4.0,4.0,3.0
7,4.0,4.0,3.0
8,4.0,3.0,2.0
9,4.0,4.0,3.0


In [None]:
# Store the normalised scores as an extra sheet of the existing workbook:
# mode='a' opens the file for appending, and a sheet that already carries the
# target name is replaced.
file_name = data_path+'Macro_GPT_score.xlsx'
writer_options = {'engine': 'openpyxl', 'mode': 'a', 'if_sheet_exists': 'replace'}
with pd.ExcelWriter(file_name, **writer_options) as writer:
    final_score.to_excel(writer, sheet_name='llama2_step1', index=False)

## Correlation

In [None]:
# Load the human vs. averaged GPT-4 evaluations.
# The human and GPT-4 annotations were merged into the *_step2 sheets manually.
# Passing a list of sheet names makes pandas parse the workbook once and
# return a dict that maps each sheet name to its DataFrame.
step2_sheets = pd.read_excel(
    data_path+'Macro_GPT_score.xlsx',
    sheet_name=['palm2_step2', 'gpt3.5_step2', 'llama2_step2'],
)
palm2_score = step2_sheets['palm2_step2']
gpt_score = step2_sheets['gpt3.5_step2']
llama2_score = step2_sheets['llama2_step2']


In [None]:
def get_correlation(new_df):
    """Correlate human and GPT scores column by column.

    Rows of ``new_df`` whose ``Eval_Type`` is ``'Human'`` are paired, by
    position, with the rows whose ``Eval_Type`` is ``'GPT'``. Every column
    other than ``Eval_Type`` and ``Paper#`` is treated as a score column.

    Args:
        new_df: DataFrame with the columns ``Eval_Type`` and ``Paper#`` plus
            one or more score columns.

    Returns:
        DataFrame indexed by score-column name with the columns
        ``Pearson``, ``Spearman`` and ``Kendall``.

    Raises:
        ValueError: if the human and GPT rows do not list the same papers in
            the same order.
    """
    human_rows = new_df[new_df['Eval_Type'] == 'Human']
    gpt_rows = new_df[new_df['Eval_Type'] == 'GPT']

    # The scores are paired by position, so both groups must list the same
    # papers in the same order. Comparing plain lists also copes with groups
    # of different length, and an explicit raise survives `python -O`.
    if list(human_rows['Paper#']) != list(gpt_rows['Paper#']):
        raise ValueError(
            "Human and GPT rows must cover the same papers in the same order"
        )

    human_scores = human_rows.drop(columns=['Eval_Type', 'Paper#'])
    auto_scores = gpt_rows.drop(columns=['Eval_Type', 'Paper#'])

    metric_names = ['Pearson', 'Spearman', 'Kendall']
    rows = []
    for column in human_scores.columns:
        human_column_scores = human_scores[column]
        auto_column_scores = auto_scores[column]
        # Each scipy call returns (statistic, p-value); only the statistic
        # is kept.
        pearson_corr, _ = pearsonr(human_column_scores, auto_column_scores)
        spearman_corr, _ = spearmanr(human_column_scores, auto_column_scores)
        kendall_corr, _ = kendalltau(human_column_scores, auto_column_scores)
        rows.append((pearson_corr, spearman_corr, kendall_corr))

    # Built from explicit rows so the result keeps its three named columns
    # even when there are no score columns at all.
    return pd.DataFrame(rows, index=list(human_scores.columns), columns=metric_names)

In [None]:
# Correlation for PalM2
palm2_crr = get_correlation(palm2_score)
print(palm2_crr)

           Pearson  Spearman   Kendall
LLM1[S1]  0.102062  0.102062  0.102062
LLM1[S2]  0.356348  0.356348  0.356348
LLM1[S3] -0.128037 -0.094491 -0.089803


In [None]:
# Correlation for GPT-3.5
gpt_crr = get_correlation(gpt_score)
print(gpt_crr)

          Pearson  Spearman   Kendall
LLM1[S1]      NaN       NaN       NaN
LLM1[S2]   -0.212 -0.185695 -0.174078
LLM1[S3]    0.000  0.062770  0.058926


In [None]:
# Correlation for Llama2
llama2_crr = get_correlation(llama2_score)
print(llama2_crr)

           Pearson  Spearman   Kendall
LLM1[S1]  0.534522  0.496904  0.471405
LLM1[S2]  0.442807  0.395285  0.383065
LLM1[S3]  0.263523  0.263523  0.248734
