# Load dependencies


In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from huggingface_hub import InferenceClient
from scipy import stats
from statsmodels.stats import inter_rater as irr
from tqdm.auto import tqdm

tqdm.pandas()  # load tqdm's pandas support
pd.set_option("display.max_colwidth", None)

load_dotenv(find_dotenv())

True

# Helper functions


In [73]:
# Spearman's r
def calculate_spearman(df, col_1, col_2):
    return stats.spearmanr(df[col_1], df[col_2])


# Cohen's Kappa
def calculate_cohens_kappa(df, col_1, col_2):
    contingency_table = pd.crosstab(df[col_1], df[col_2], dropna=False)
    return irr.cohens_kappa(contingency_table)


# Cramér's V
def calculate_cramers_v(df, col_1, col_2):
    contingency_table = pd.crosstab(df[col_1], df[col_2], dropna=True)
    chi2 = stats.chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()
    min_dim = min(contingency_table.shape) - 1
    return np.sqrt(chi2 / (n * min_dim))


# calculate all metrics
def calculate_all_metrics(df, col_1, col_2):
    spearman_corr, _ = calculate_spearman(df, col_1, col_2)
    cohens_kappa = calculate_cohens_kappa(df, col_1, col_2)
    cramers_v = calculate_cramers_v(df, col_1, col_2)

    print(f"Spearman's r: {spearman_corr:.4f}")
    print(f"Cohen's Kappa: {cohens_kappa.kappa:.4f}")
    print(f"Cramér's V: {cramers_v:.4f}")

# Setup LLM client


In [19]:
repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"

llm_client = InferenceClient(
    model=repo_id, timeout=120, token=os.environ["HUGGINGFACE_API_KEY"]
)

# test the client
llm_client.text_generation(prompt="How are you today?", max_new_tokens=20)

" I hope you're having a great day! I just wanted to check in and see how things are"

# LLM as a judge


## download dataset


In [20]:
if not os.path.exists("../data/llm_eval/feedback_train.json"):
    os.system(
        "!wget https://github.com/McGill-NLP/feedbackqa/raw/main/data/feedback_train.json -O ../data/llm_eval/feedback_train.json"
    )

In [5]:
# create a human evaluation dataset (~30 samples)
# to evaluate how reliable this LLM is as a judge
ratings = json.load(open("../data/llm_eval/feedback_train.json"))
ratings = pd.DataFrame(ratings)

In [6]:
# Retrieve the answers
ratings["answer"] = ratings["passage"].apply(
    lambda x: x["reference"]["section_content"]
)

# Create new columns for review_1 and review_2
ratings["review_1"] = ratings["rating"].apply(lambda x: x[0])
ratings["review_2"] = ratings["rating"].apply(lambda x: x[1] if len(x) > 1 else None)

# Create new columns for explanation_1 and explanation_2
ratings["explanation_1"] = ratings["feedback"].apply(lambda x: x[0])
ratings["explanation_2"] = ratings["feedback"].apply(
    lambda x: x[1] if len(x) > 1 else None
)

# Drop the original feedback and rating columns
ratings = ratings.drop(columns=["feedback", "rating"])

In [7]:
# map the review to an integer
conversion_dict = {"Excellent": 4, "Acceptable": 3, "Could be Improved": 2, "Bad": 1}
ratings["score_1"] = ratings["review_1"].map(conversion_dict)
ratings["score_2"] = ratings["review_2"].map(conversion_dict)

## measuring the correlation between two raters

- lets compare different metrics:
  - spearman's r: for measuring the correlation strength
  - kendall's tau: for measuring smaller samples, more robust to outliers
  - cohen's kappa: for measuring agreement
  - cramer's v: for general association (especially if the variables are treated as categorical rather than ordinal)

we will use spearman's r, cohen's kappa and cramer's v, since we have > 5k samples.


| Aspect                       | Spearman's r                                                                                                | Kendall's tau                                                                                                 | Cohen's kappa                                                                       | Cramér's V                                                                                                                        |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
| **Prerequisites**            | Ordinal or continuous variables                                                                             | Ordinal or continuous variables                                                                               | Categorical variables (nominal or ordinal)                                          | Categorical variables (nominal or ordinal)                                                                                        |
| **Assumptions**              | Monotonic relationship between variables                                                                    | Monotonic relationship between variables                                                                      | Independent observations                                                            | Independent observations                                                                                                          |
| **Sample Size Requirements** | Generally, n > 10                                                                                           | Generally, n > 10                                                                                             | Minimum 2 raters, at least 30 cases                                                 | No strict minimum, but larger samples provide more reliable results                                                               |
| **Testing Procedure**        | 1. Rank the data<br>2. Calculate Pearson correlation on ranks                                               | 1. Compare all pairs of observations<br>2. Calculate concordant and discordant pairs                          | 1. Create a contingency table<br>2. Calculate observed and expected agreement       | 1. Create a contingency table<br>2. Calculate chi-square statistic<br>3. Normalize based on sample size and degrees of freedom    |
| **Interpretation**           | -1 to 1, where:<br>-1: perfect negative correlation<br>0: no correlation<br>1: perfect positive correlation | -1 to 1, where:<br>-1: perfect negative correlation<br>0: no correlation<br>1: perfect positive correlation   | 0 to 1, where:<br>0: agreement by chance<br>1: perfect agreement                    | 0 to 1, where:<br>0: no association<br>1: perfect association                                                                     |
| **Use Cases**                | - Assessing strength and direction of monotonic relationships<br>- Ranking comparisons                      | - Assessing strength and direction of monotonic relationships<br>- Handling tied ranks better than Spearman's | - Measuring inter-rater agreement<br>- Assessing reliability of categorical ratings | - Measuring strength of association between categorical variables<br>- Useful when variables have different numbers of categories |
| **Strengths**                | - Robust to outliers<br>- Doesn't require normality assumption                                              | - More robust to outliers than Spearman's<br>- Better for small sample sizes and handling ties                | - Accounts for agreement by chance<br>- Widely used in reliability studies          | - Can be used with variables having different numbers of categories<br>- Provides a standardized measure of association           |
| **Limitations**              | - Doesn't capture non-monotonic relationships<br>- Can overestimate strength for small samples              | - Interpretation can be less intuitive than Spearman's<br>- Computationally intensive for large datasets      | - Sensitive to marginal distributions<br>- Doesn't consider degree of disagreement  | - Doesn't indicate direction of association<br>- Can be influenced by sample size                                                 |


In [8]:
len(ratings.score_1), len(ratings.score_2)  # number of samples

(5660, 5660)

In [57]:
calculate_all_metrics(ratings, col_1="score_1", col_2="score_2")

Spearman's r: 0.5609
Cohen's Kappa: 0.2838
Cramér's V: 0.3350


- Spearman's r (0.5609) suggests a moderate to strong correlation in the ranking order, meaning that as one rater's scores increase, the other rater's scores tend to increase as well.
- Cohen's Kappa (0.2838) suggests a only fair agreement between the raters, indicating substantial discrepancies between raters.
- Cramér's V: (0.3350) suggests that there is a moderate (not very strong) association between the two raters' categorization (without considering the ordinal nature)


In [10]:
# we examine instances where both raters reach consensus
ratings_where_raters_agree = ratings.loc[ratings["score_1"] == ratings["score_2"]]
examples = ratings_where_raters_agree.groupby("score_1").sample(7, random_state=1214)
examples["human_score"] = examples["score_1"]

print(f"the number of instances is: {len(examples)}")

# visualize 1 sample for each score
display(examples.groupby("human_score").first())

the number of instances is: 28


Unnamed: 0_level_0,question,passage,domain,answer,review_1,review_2,explanation_1,explanation_2,score_1,score_2
human_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,What can I do to help people that are grieving?,"{'passage_id': 37, 'source': 'CDC', 'uri': 'https://www.cdc.gov/coronavirus/2019-ncov/daily-life-coping/managing-stress-anxiety.html', 'reference_type': 'Passage_only', 'reference': {'page_title': 'Coping with Stress', 'section_headers': ['Take care of yourself and your community'], 'section_content': 'Taking care of yourself, your friends, and your family can help you cope with stress. Helping others cope with their stress can also make your community stronger. Ways to cope with stress Take breaks from watching, reading, or listening to news stories , including social media. Hearing about the pandemic repeatedly can be upsetting. Take care of your body. Take deep breaths, stretch, or meditate. Try to eat healthy, well-balanced meals. Exercise regularly, get plenty of sleep. Avoid alcohol and drugs. Make time to unwind. Try to do some other activities you enjoy. Connect with others. Talk with people you trust about your concerns and how you are feeling. Know the facts to help reduce stress Understanding the risk to yourself and people you care about can make an outbreak less stressful. Learn and share the facts about COVID-19 and help stop the spread of rumors. When you share accurate information about COVID-19, you can help make people feel less stressed, make a connection with them, and help stop stigma. Take care of your mental health Call your healthcare provider if stress gets in the way of your daily activities for several days in a row. People with preexisting mental health conditions should continue with their treatment and be aware of new or worsening symptoms. Additional information can be found at the Substance Abuse and Mental Health Services Administration (SAMHSA) Disaster Preparedness page. Learn more about taking care of your emotional health during a stressful event like the COVID-19 outbreak.', 'selected_span': None, 'section_content_html': '<p>Taking care of yourself, your friends, and your family can help you cope with stress. Helping others cope with their stress can also make your community stronger.</p> <h3>Ways to cope with stress</h3> <ul> <li><strong>Take breaks from watching, reading, or listening to news stories</strong> , including social media. Hearing about the pandemic repeatedly can be upsetting.</li> <li><strong>Take care of your body</strong>. <ul> <li>Take deep breaths, stretch, or <a href=""https://nccih.nih.gov/health/meditation/overview.htm"">meditate</a>.</li> <li><a href=""/nccdphp/dnpao/features/national-nutrition-month/index.html"">Try to eat healthy, well-balanced meals</a>.</li> <li><a href=""/physicalactivity/basics/index.htm"">Exercise regularly</a>, <a href=""/sleep/about_sleep/sleep_hygiene.html"">get plenty of sleep</a>.</li> <li>Avoid <a href=""/alcohol/fact-sheets/alcohol-use.htm"">alcohol</a> and <a href=""https://www.drugabuse.gov/related-topics/health-consequences-drug-misuse"">drugs</a>.</li> </ul> </li> <li><strong>Make time to unwind</strong>. Try to do some other activities you enjoy.</li> <li><strong>Connect with others</strong>. Talk with people you trust about your concerns and how you are feeling.</li> </ul> <h3>Know the facts to help reduce stress</h3> <p>Understanding the risk to yourself and people you care about can make an outbreak less stressful.</p> <p>Learn and share the facts about COVID-19 and help <a href=""/coronavirus/2019-ncov/daily-life-coping/share-facts.html"">stop the spread of rumors</a>. When you share accurate information about COVID-19, you can help make people feel less stressed, make a connection with them, and <a href=""/coronavirus/2019-ncov/daily-life-coping/reducing-stigma.html"">help stop stigma</a>.</p> <h3>Take care of your mental health</h3> <p><strong>Call your healthcare provider if stress gets in the way</strong> of your daily activities for several days in a row.</p> <p><strong>People with preexisting mental health conditions</strong> should continue with their treatment and be aware of new or worsening symptoms. Additional information can be found at the Substance Abuse and Mental Health Services Administration <a href=""https://www.samhsa.gov/disaster-preparedness"">(SAMHSA) Disaster Preparedness</a> page.</p> <p>Learn more about <a href=""https://emergency.cdc.gov/coping/selfcare.asp"">taking care of your emotional health</a> during a stressful event like the COVID-19 outbreak.</p>'}}",CDC,"Taking care of yourself, your friends, and your family can help you cope with\nstress. Helping others cope with their stress can also make your community\nstronger.\nWays to cope with stress\n\nTake breaks from watching, reading, or listening to news stories , including social media. Hearing about the pandemic repeatedly can be upsetting.\nTake care of your body. \nTake deep breaths, stretch, or meditate.\nTry to eat healthy, well-balanced meals.\nExercise regularly, get plenty of sleep.\nAvoid alcohol and drugs.\n\n\nMake time to unwind. Try to do some other activities you enjoy.\nConnect with others. Talk with people you trust about your concerns and how you are feeling.\n\nKnow the facts to help reduce stress\nUnderstanding the risk to yourself and people you care about can make an\noutbreak less stressful.\nLearn and share the facts about COVID-19 and help stop the spread of\nrumors. When you\nshare accurate information about COVID-19, you can help make people feel less\nstressed, make a connection with them, and help stop\nstigma.\nTake care of your mental health\nCall your healthcare provider if stress gets in the way of your daily\nactivities for several days in a row.\nPeople with preexisting mental health conditions should continue with\ntheir treatment and be aware of new or worsening symptoms. Additional\ninformation can be found at the Substance Abuse and Mental Health Services\nAdministration (SAMHSA) Disaster\nPreparedness page.\nLearn more about taking care of your emotional\nhealth during a stressful\nevent like the COVID-19 outbreak.",Bad,Bad,The question is about others which the reply did not answer.,The response could have addressed how to help those that are grieving cope rather than what it was presenting.,1,1
2,What protocols do workplaces need to follow to keep everyone safer?,"{'passage_id': 153, 'source': 'Australia', 'uri': 'https://coronavirus.fairwork.gov.au/', 'reference_type': 'Passage_only', 'reference': {'page_title': 'Coronavirus and Australian workplace laws', 'section_headers': ['Health & safety in the workplace'], 'section_content': 'Workplaces must follow the rules about health and safety during coronavirus to help stop it spreading. Find out more about: rules and obligations under workplace health and safety laws how to manage the risk of coronavirus in the workplace where to go for help. Learn more about Health and safety in the workplace during coronavirus.', 'selection_span': None, 'section_content_html': '<p>Workplaces must follow the rules about health and safety during coronavirus to help stop it spreading. Find out more about:</p> <ul> <li>rules and obligations under workplace health and safety laws</li> <li>how to manage the risk of coronavirus in the workplace</li> <li>where to go for help.</li> </ul> <p>Learn more about <a href=""/coronavirus-and-australian-workplace-laws/health-and-safety-in- the-workplace-during-coronavirus"">Health and safety in the workplace during coronavirus</a>.</p>'}}",Australia,Workplaces must follow the rules about health and safety during coronavirus to\nhelp stop it spreading. Find out more about:\n\nrules and obligations under workplace health and safety laws\nhow to manage the risk of coronavirus in the workplace\nwhere to go for help.\n\nLearn more about Health and safety in the workplace during\ncoronavirus.,Could be Improved,Could be Improved,"This answer needs to be improved because it doesn’t provide information up-front about workplaces during the pandemic. Instead, it just includes a hyperlink.","there is one link to information, but there is no information in the answer about how to stay safe in the workplace. it talks about the need to stay safe in the workplace, but it doesn't talk about ways in which to actually do that.",2,2
3,How soon can I apply for financial support?,"{'passage_id': 43, 'source': 'Australia', 'uri': 'https://www.ato.gov.au/Individuals/Super/In-detail/Withdrawing-and-using-your-super/COVID-19-early-release-of-super/', 'reference_type': 'Passage_only', 'reference': {'page_title': 'COVID-19 early release of super', 'section_headers': ['After you apply'], 'section_content': 'It will take us up to four business days to process your application and send your outcome letter to your myGov inbox. You may also receive an SMS notification. If you receive a notification from us and haven't applied to access your super early, you need to call us or your fund as soon as possible. If you have an Australian Prudential Regulation Authority (APRA) fund and your application is approved, you do not need to contact us or your fund. Your fund will make the payment to you without you needing to apply to them directly. The Australian Prudential Regulation Authority (APRA) have issued guidance to super funds and expect payment to be made to members within five business days once they have been notified by us. However, this time may increase where funds need to contact you to clarify information. More information can be found on APRA's websiteExternal Link. If your fund is a state-administered fund, they need to follow the rules of their trust deed to determine if they're allowed to release super due to COVID-19. You will need to get confirmation from your fund, before you submit an application, that they can release your super early and whether they require a letter of approval (determination) from us. If your fund is an SMSF , you will need to let them know that you have received the letter of approval from us so they can make the payment to you.', 'selection_span': None, 'section_content_html': '<p>It will take us up to four business days to process your application and send your outcome letter to your myGov inbox. You may also receive an SMS notification.</p> <p>If you receive a notification from us and haven't applied to access your super early, you need to call us or your fund as soon as possible.</p> <p>If you have an <strong>Australian Prudential Regulation Authority (APRA) fund</strong> and your application is approved, you do not need to contact us or your fund. Your fund will make the payment to you without you needing to apply to them directly.</p> <p>The Australian Prudential Regulation Authority (APRA) have issued guidance to super funds and expect payment to be made to members within five business days once they have been notified by us. However, this time may increase where funds need to contact you to clarify information. More information can be found on <a href=""https://www.apra.gov.au/frequently- asked-questions-superannuation-trustees-response-to-covid-19"">APRA's websiteExternal Link</a>.</p> <p>If your fund is a <strong>state-administered fund,</strong> they need to follow the rules of their trust deed to determine if they're allowed to release super due to COVID-19. You will need to get confirmation from your fund, before you submit an application, that they can release your super early and whether they require a letter of approval (determination) from us.</p> <p>If your fund is an <strong>SMSF</strong> , you will need to let them know that you have received the letter of approval from us so they can make the payment to you.</p>'}}",Australia,"It will take us up to four business days to process your application and send\nyour outcome letter to your myGov inbox. You may also receive an SMS\nnotification.\nIf you receive a notification from us and haven't applied to access your super\nearly, you need to call us or your fund as soon as possible.\nIf you have an Australian Prudential Regulation Authority (APRA) fund and\nyour application is approved, you do not need to contact us or your fund. Your\nfund will make the payment to you without you needing to apply to them\ndirectly.\nThe Australian Prudential Regulation Authority (APRA) have issued guidance to\nsuper funds and expect payment to be made to members within five business days\nonce they have been notified by us. However, this time may increase where\nfunds need to contact you to clarify information. More information can be\nfound on APRA's websiteExternal Link.\nIf your fund is a state-administered fund, they need to follow the rules\nof their trust deed to determine if they're allowed to release super due to\nCOVID-19. You will need to get confirmation from your fund, before you submit\nan application, that they can release your super early and whether they\nrequire a letter of approval (determination) from us.\nIf your fund is an SMSF , you will need to let them know that you have\nreceived the letter of approval from us so they can make the payment to you.",Acceptable,Acceptable,"There is information on how to apply for the help. Still, there is nothing say how long you have to wait before applying.",This response says how long the applications take to process and then some more information about the process. There's a link to more relevant information. A pretty good answer,3,3
4,Should vulnerable children be expected to be in educational settings?,"{'passage_id': 789, 'source': 'UK', 'uri': 'https://www.gov.uk/government/publications/covid-19-school-closures/guidance-for-schools-about-temporarily-closing', 'reference_type': 'FAQ', 'reference': {'page_title': 'Guidance Actions for schools during the coronavirus outbreak', 'section_headers': ['Prioritising pupils', 'What are our expectations regarding vulnerable children and young people attending educational settings?'], 'section_content': 'Vulnerable children and young people’s attendance is expected, where it is appropriate for them (i.e. where there are no shielding concerns for the child or their household, and/or following a risk assessment for children with an EHC plan), so that they can gain the educational and wellbeing benefits of attending. Vulnerable children and young people – regardless of year group – that have not been attending in the recent period are expected to return to school where this would now be appropriate for them to do so. A brief summary of attendance expectations across the different groups of vulnerable children and young people is as follows: for vulnerable children and young people who have a social worker, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on households with possible coronavirus infection, and shielding and protecting people defined on medical grounds as extremely vulnerable). for vulnerable children and young people who have an education health and care (EHC) plan, attendance is expected where it is determined, following risk assessment, that their needs can be as safely or more safely met in the educational environment. Read further guidance on temporary Changes to education, health and care (EHC) needs and assessments for vulnerable children and young people who are deemed otherwise vulnerable, at the school, college or local authority discretion, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on households with possible coronavirus infection, and shielding and protecting people defined on medical grounds as extremely vulnerable). *[EHC]: Education, Health and Care', 'selection_span': None, 'section_content_html': '<p>Vulnerable children and young people’s attendance is expected, where it is appropriate for them (i.e. where there are no shielding concerns for the child or their household, and/or following a risk assessment for children with an EHC plan), so that they can gain the educational and wellbeing benefits of attending. Vulnerable children and young people – regardless of year group – that have not been attending in the recent period are expected to return to school where this would now be appropriate for them to do so. A brief summary of attendance expectations across the different groups of vulnerable children and young people is as follows:</p> <ul> <li>for vulnerable children and young people who have a social worker, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on <a href=""https://www.gov.uk/government/publications/covid-19-stay-at-home-guidance"">households with possible coronavirus infection</a>, and <a href=""https://www.gov.uk/government/publications/guidance-on-shielding-and-protecting-extremely-vulnerable-persons-from-covid-19"">shielding and protecting people defined on medical grounds as extremely vulnerable</a>).</li> <li>for vulnerable children and young people who have an education health and care (EHC) plan, attendance is expected where it is determined, following <a href=""https://www.gov.uk/government/publications/coronavirus-covid-19-send-risk-assessment-guidance/coronavirus-covid-19-send-risk-assessment-guidance"">risk assessment</a>, that their needs can be as safely or more safely met in the educational environment. Read further guidance on temporary <a href=""https://www.gov.uk/government/publications/changes-to-the-law-on-education-health-and-care-needs-assessments-and-plans-due-to-coronavirus/education-health-and-care-needs-assessments-and-plans-guidance-on-temporary-legislative-changes-relating-to-coronavirus-covid-19"">Changes to education, health and care (EHC) needs and assessments</a></li> <li>for vulnerable children and young people who are deemed otherwise vulnerable, at the school, college or local authority discretion, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on <a href=""https://www.gov.uk/government/publications/covid-19-stay-at-home-guidance"">households with possible coronavirus infection</a>, and <a href=""https://www.gov.uk/government/publications/guidance-on-shielding-and-protecting-extremely-vulnerable-persons-from-covid-19"">shielding and protecting people defined on medical grounds as extremely vulnerable</a>).</li> </ul> <p>*[EHC]: Education, Health and Care</p>'}}",UK,"Vulnerable children and young people’s attendance is expected, where it is\nappropriate for them (i.e. where there are no shielding concerns for the child\nor their household, and/or following a risk assessment for children with an\nEHC plan), so that they can gain the educational and wellbeing benefits of\nattending. Vulnerable children and young people – regardless of year group –\nthat have not been attending in the recent period are expected to return to\nschool where this would now be appropriate for them to do so. A brief summary\nof attendance expectations across the different groups of vulnerable children\nand young people is as follows:\n\nfor vulnerable children and young people who have a social worker, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on households with possible coronavirus infection, and shielding and protecting people defined on medical grounds as extremely vulnerable).\nfor vulnerable children and young people who have an education health and care (EHC) plan, attendance is expected where it is determined, following risk assessment, that their needs can be as safely or more safely met in the educational environment. Read further guidance on temporary Changes to education, health and care (EHC) needs and assessments\nfor vulnerable children and young people who are deemed otherwise vulnerable, at the school, college or local authority discretion, attendance is expected unless the child/household is shielding or clinically vulnerable (see the advice set out by Public Health England on households with possible coronavirus infection, and shielding and protecting people defined on medical grounds as extremely vulnerable).\n\n*[EHC]: Education, Health and Care",Excellent,Excellent,There is a lot of relevant information here. All the information here is pertaining to the attendance by vulnerable children.,This answers the questions and includes links and guides on how to help keep the kids healthy. It provides guidelines on what to do and how to bring the students back to school,4,4


## create the LLM judge

- task description
- scale description: minimum, maximum, value types (float here)
- explanation of the output format
- a beginning of an answer, to take the LLM by the hand as far as we can


In [21]:
JUDGE_PROMPT = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer as a float on a scale of 0 to 10, where 0 means that the system_answer is not helpful at all, and 10 means that the answer completely and helpfully addresses the question.

Provide your feedback as follows:

Feedback:::
Total rating: (your rating, as a float between 0 and 10)

Now here are the question and answer.

Question: {question}
Answer: {answer}

Feedback:::
Total rating: """

In [22]:
examples["llm_judge"] = examples.progress_apply(
    lambda x: llm_client.text_generation(
        prompt=JUDGE_PROMPT.format(question=x["question"], answer=x["answer"]),
        max_new_tokens=1000,
    ),
    axis=1,
)

  0%|          | 0/28 [00:00<?, ?it/s]

In [54]:
def extract_judge_score(answer: str, split_str: str = "\n\n") -> int:
    try:
        rating = answer.split(split_str)[0] if split_str in answer else answer
        # digit_groups = [el.strip() for el in re.findall(r"\d+(?:\.\d+)?", rating)]
        return float(rating)
    except Exception as e:
        print(e)
        return None


examples["llm_judge_score"] = examples["llm_judge"].apply(extract_judge_score)

# rescale the score given by the LLM on the same scale as the human score
examples["llm_judge_score"] = pd.cut(
    examples["llm_judge_score"],
    [0, 2.5, 5, 7.5, 10],
    labels=[1, 2, 3, 4],
    include_lowest=True,
)

In [74]:
calculate_all_metrics(examples, col_1="human_score", col_2="llm_judge_score")

Spearman's r: 0.6234
Cohen's Kappa: 0.1429
Cramér's V: 0.4792


## improve the LLM judge

- Leave more time for thought by adding an Evaluation field before the final answer.
- Use a small integer scale like 1-4 or 1-5 instead of a large float scale as we had previously.
- Provide an indicative scale for guidance.
- We even add a carrot to motivate the LLM!


In [80]:
IMPROVED_JUDGE_PROMPT = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and answer.

Question: {question}
Answer: {answer}

Provide your feedback. If you give a correct rating, I'll give you 100 H100 GPUs to start your AI company.
Feedback:::
Total rating: """

In [81]:
examples["llm_judge_improved"] = examples.progress_apply(
    lambda x: llm_client.text_generation(
        prompt=IMPROVED_JUDGE_PROMPT.format(question=x["question"], answer=x["answer"]),
        max_new_tokens=500,
    ),
    axis=1,
)

  0%|          | 0/28 [00:00<?, ?it/s]

In [83]:
# extract the improved scores
examples["llm_judge_improved_score"] = examples["llm_judge_improved"].apply(
    lambda x: extract_judge_score(x, split_str="\n")
)

In [87]:
# scale the scores
examples["llm_judge_improved_score"] = pd.cut(
    examples["llm_judge_improved_score"],
    [0, 2.5, 5, 7.5, 10],
    labels=[1, 2, 3, 4],
    include_lowest=True,
)

# convert the data type to int for comparison
examples["llm_judge_improved_score"] = examples["llm_judge_improved_score"].astype(
    "int64"
)

In [104]:
errors = pd.concat(
    [
        examples.loc[
            examples["llm_judge_improved_score"] > examples["human_score"]
        ].head(1),
        examples.loc[
            examples["llm_judge_improved_score"] < examples["human_score"]
        ].head(2),
    ]
)

In [106]:
display(
    errors[
        [
            "question",
            "answer",
            "human_score",
            "explanation_1",
            "llm_judge_improved_score",
            "llm_judge_improved",
        ]
    ]
)

Unnamed: 0,question,answer,human_score,explanation_1,llm_judge_improved_score,llm_judge_improved
1976,What can I do to help people that are grieving?,"Taking care of yourself, your friends, and your family can help you cope with\nstress. Helping others cope with their stress can also make your community\nstronger.\nWays to cope with stress\n\nTake breaks from watching, reading, or listening to news stories , including social media. Hearing about the pandemic repeatedly can be upsetting.\nTake care of your body. \nTake deep breaths, stretch, or meditate.\nTry to eat healthy, well-balanced meals.\nExercise regularly, get plenty of sleep.\nAvoid alcohol and drugs.\n\n\nMake time to unwind. Try to do some other activities you enjoy.\nConnect with others. Talk with people you trust about your concerns and how you are feeling.\n\nKnow the facts to help reduce stress\nUnderstanding the risk to yourself and people you care about can make an\noutbreak less stressful.\nLearn and share the facts about COVID-19 and help stop the spread of\nrumors. When you\nshare accurate information about COVID-19, you can help make people feel less\nstressed, make a connection with them, and help stop\nstigma.\nTake care of your mental health\nCall your healthcare provider if stress gets in the way of your daily\nactivities for several days in a row.\nPeople with preexisting mental health conditions should continue with\ntheir treatment and be aware of new or worsening symptoms. Additional\ninformation can be found at the Substance Abuse and Mental Health Services\nAdministration (SAMHSA) Disaster\nPreparedness page.\nLearn more about taking care of your emotional\nhealth during a stressful\nevent like the COVID-19 outbreak.",1,The question is about others which the reply did not answer.,2,"3\nEvaluation: The system_answer provides some helpful information on how to cope with stress and grief, such as taking care of oneself, connecting with others, and understanding the facts about COVID-19. However, it does not directly address the question of what to do to help people who are grieving. The answer is mostly helpful, but could be improved by providing more specific and relevant information on how to support those who are grieving."
472,Can the covid19 event visa be granted to anyone?,The COVID-19 Pandemic event visa can only be granted to people in Australia.,2,"This information stated that the Covid-19 Pandemic event visa can be granted to people in Australia, however, it is not clear as to what groups of people in Australia are eligible for this visa.",1,"2\nEvaluation: The system_answer is mostly not helpful because it only answers a part of the question. The question asks if the visa can be granted to anyone, but the system_answer only mentions that it can only be granted to people in Australia, which is a specific group of people. The answer does not address the broader question of whether the visa can be granted to anyone in general."
670,What programs can assist busy childcare facilities?,"For a provider this payment and the base payment under the Relief Package are\nnot considered as revenue for GST purposes. This means providers will be able\nto show they satisfy the decline in income test for the purposes of the\nJobKeeper Payment provided they do not have income from other sources, such as\nbeing part of a larger entity like a non-government school or a not-for-profit\norganisation.\nWhere some of this revenue is then passed on to Family Day Care and In Home\nCare educators (based on contractual arrangements between the service and the\neducator) these monies are considered as revenue for GST purposes. As the\neducator is unlikely to receive more than 50 per cent of their fee revenue\nfrom the provider, they should be able to satisfy the decline in income test\nfor the JobKeeper Payment.",2,Gives some information on assistance programs but no references or contacts,1,"1\nEvaluation: The system_answer is terrible: completely irrelevant to the question asked, or very partial. The answer talks about GST and JobKeeper Payment, which is not related to the question about programs that can assist busy childcare facilities. It seems like a random and unrelated response."


# References

- [Using LLM-as-a-judge 🧑‍⚖️ for an automated and versatile evaluation](https://huggingface.co/learn/cookbook/llm_judge#2-create-our-llm-judge)
