In [32]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import re
import random
from tqdm.notebook import tqdm
import os
from claude_bedrock import call_claude_on_bedrock

In [61]:

fname = 'Llama_3_8b_inst_explanations'

In [62]:

df = pd.read_csv(f'{fname}.csv')
df.head()

Unnamed: 0,premise,hypothesis,gt_label,pred_label,shap_value,human_explanation,baseline_model_explanation,shap_model_explanation,cotfull_response,cot_explanations,cot_label
0,Two women are embracing while holding to go pa...,The men are fighting outside a deli.,contradiction,contradiction,"[('', [0.0, 0.0, 0.0]), ('Prem', [-0.587972005...",In the first sentence there is an action of af...,"This is an example of a visual contradiction, ...",**INPUT FORMAT**\n <premise>\n [premise]...,**SAMPLE INPUT**\n <premise>\n Two women...,The men are fighting outside a deli.,entailment
1,"Two young children in blue jerseys, one with t...",Two kids in numbered jerseys wash their hands.,entailment,entailment,"[('', [0.0, 0.0, 0.0]), ('Prem', [-0.7734375, ...",Young children are kids. Jerseys with number 9...,The premise directly states that two kids in n...,**INPUT FORMAT**\n <premise>\n [premise]...,**SAMPLE INPUT**\n <premise>\n Two young...,The hypothesis is entailed by the premise beca...,entailment
2,A man selling donuts to a customer during a wo...,A man selling donuts to a customer.,entailment,entailment,"[('', [0.0, 0.0, 0.0]), ('Prem', [-0.849609375...",A man selling donuts is selling donuts.,The sentence describes a specific action and s...,**INPUT FORMAT**\n <premise>\n [premise]...,**SAMPLE INPUT**\n <premise>\n A man sel...,The premise states that a man is selling donut...,entailment
3,Two young boys of opposing teams play football...,boys play football,entailment,entailment,"[('', [0.0, 0.0, 0.0]), ('Prem', [0.505859375,...",If the boys of opposing teams play football th...,The premise states that two young boys are pla...,**INPUT FORMAT**\n <premise>\n [premise]...,**SAMPLE INPUT**\n <premise>\n Two young...,Boys playing football is entailed by the premise.,entailment
4,Two young boys of opposing teams play football...,dog eats out of bowl,contradiction,contradiction,"[('', [0.0, 0.0, 0.0]), ('Prem', [-0.196451822...",Boys are not dogs.,The premise is a statement about two people pl...,**INPUT FORMAT**\n <premise>\n [premise]...,**SAMPLE INPUT**\n <premise>\n Two young...,The hypothesis is neutral because it does not ...,neutral


In [63]:
# JUDGE_PROMPT =  f"""
# You are given a premise, a hypothesis, few human-written explanations, and a model-generated explanation.

# The model has generated 
# Evaluate the model explanation based on:
# - Faithfulness to the premise and hypothesis
# - Clarity and ease of understanding
# - Completeness of reasoning
# - Overall similarity to human explanations

# Assign a score from 1 to 5:
# - 5: Excellent — Faithful, clear, complete, and closely matches human explanations.
# - 4: Good — Mostly correct with minor issues.
# - 3: Average — Partially correct but missing important reasoning.
# - 2: Poor — Significant mistakes or confusion.
# - 1: Very Poor — Wrong, misleading, or irrelevant.

# After scoring, briefly explain your reasoning in 2–4 sentences.

# ---

# Premise:
# {{premise}}


# Hypothesis:
# {{hypothesis}}


# Here are one or more Human Explanation/s:
# {{human_explanation}}


# Model Explanation:
# {{model_explanation}}

# ---

# Output Format:
# Wrap your answers using these tags:
# - <score> [1-5] </score>
# - <justification> [your reasoning in 2–4 sentences] </justification>
# """




In [64]:
JUDGE_PROMPT = f"""
You are given a *premise* and a *hypothesis*, followed by one or more human-written explanations and a model-generated explanation.

The model explanation is generated based on the premise and hypothesis — specifically, whether the hypothesis logically follows from or contradicts the premise.

Your task is to evaluate how well the model explanation aligns with human reasoning.

Assess the model explanation on:
- Faithfulness to the premise and hypothesis
- Similarity to the human-written explanations

Assign a score from 1 to 5:
- 5: Excellent — Faithful, clear, complete, and closely matches human explanations.
- 4: Good — Mostly correct with minor issues.
- 3: Average — Partially correct but missing important reasoning.
- 2: Poor — Significant mistakes or confusion.
- 1: Very Poor — Wrong, misleading, or irrelevant.

Then provide a justification in 2–4 sentences comparing the model's explanation to the human explanations.

---

Premise:
{{premise}}

Hypothesis:
{{hypothesis}}

Human Explanation(s):
{{human_explanation}}

Model Explanation (Generated based on the premise and hypothesis):
{{model_explanation}}

---

Output Format:
Wrap your answers using these tags:
- <score> [1-5] </score>
- <justification> [your reasoning in 2–4 sentences] </justification>
"""

def format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, model_explanation, human_explanation):
    if isinstance(human_explanation, list):
        human_explanation = "\n- " + "\n- ".join(human_explanation)
    
    filled_prompt = JUDGE_PROMPT.replace("{premise}", premise)\
                                 .replace("{hypothesis}", hypothesis)\
                                 .replace("{model_explanation}", model_explanation)\
                                 .replace("{human_explanation}", human_explanation)
    return filled_prompt.strip()

In [65]:
# JUDGE_PROMPT = f""" Get a score (1-5) from LLM comparing model explanation to human explanation. You are an expert evaluator of natural language explanations for machine learning model predictions.
# Compare the following two explanations for the same data point and assign a score from 0-5
# (5 being perfect alignment) based on how well the Model Explanation matches the Human Explanation.
# The relationship between the Premise: {{premise}} ; and
#         Hypothesis: {{hypothesis}} ; was classified as {{pred}}.


# **Human Explanation**: {{human_explanation}}
# **Model Explanation**: {{model_explanation}}
# Consider:
# 1. Clarity and coherence
# 2. Coverage of important points
# 3. Logical consistency
 
# Return ONLY the numerical score (1-5) with no additional text.
# Wrap your answers using these tags:
# - <score> [1-5] </score>
# - <justification> [your reasoning in 2–4 sentences] </justification> """


# def format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, pred,human_explanation,model_explanation):
#     if isinstance(human_explanation, list):
#         human_explanation = "\n- " + "\n- ".join(human_explanation)
    
#     filled_prompt = JUDGE_PROMPT.replace("{premise}", premise)\
#                                  .replace("{hypothesis}", hypothesis)\
#                                  .replace("{pred}", pred)\
#                                  .replace("{model_explanation}", model_explanation)\
#                                  .replace("{human_explanation}", human_explanation)
#     return filled_prompt.strip()

In [66]:
def claude_scorer(judge_prompt):
    system_prompt = "You are an expert judge for Explainable AI (XAI) systems. Your task is to evaluate and score language model explanations based on their faithfulness, clarity, completeness, and similarity to human-written explanations."
    response = call_claude_on_bedrock(system_prompt, judge_prompt)
    return response

def get_score(text):
    match = re.search(r"<score>(.*?)</score>", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

def get_justification(text):
    match = re.search(r"<justification>(.*?)</justification>", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


In [67]:


premises = df['premise'].to_list()
hypos =  df['hypothesis'].to_list()
labels = df['gt_label'].to_list()
preds = df['cot_label'].to_list()
model_exps =  df['cotfull_response'].to_list() 
human_exps =  df['human_explanation'].to_list() 
shap_exps = df['shap_model_explanation'].to_list()  
len(premises), len(hypos) , len(labels), len(preds), len(model_exps), len(human_exps), len(shap_exps)


(179, 179, 179, 179, 179, 179, 179)

In [68]:
df.columns

Index(['premise', 'hypothesis', 'gt_label', 'pred_label', 'shap_value',
       'human_explanation', 'baseline_model_explanation',
       'shap_model_explanation', 'cotfull_response', 'cot_explanations',
       'cot_label'],
      dtype='object')

In [69]:
model_exps[0], human_exps[0], shap_exps[0]

('**SAMPLE INPUT**\n    <premise>\n    Two women are embracing while holding to go packages.\n    </premise>\n    \n    <Hypothesis>\n    The men are fighting outside a deli.\n    </Hypothesis>\n\n    **SAMPLE OUTPUT**\n    <explanation>\n    The men are fighting outside a deli.\n    </explanation>\n    \n    <label>\n    entailment\n    </label>\n\n### Input Format\nThe input consists of a sequence of pairs of sentences. Each sentence is represented as a string of tokens.\n\n### Output Format\nThe output should consist of a single line containing the following three elements:\n\n1. A string representing the explanation for why you chose the given label.\n2. A string representing the label chosen.\n3. A newline character.',
 'In the first sentence there is an action of affection between women while on the second sentence there is a fight between men.',
 '**INPUT FORMAT**\n    <premise>\n    [premise]\n    </premise>\n    \n    <Hypothesis>\n    [hypothesis]\n    </Hypothesis>\n    \n  

In [70]:
results=[]
for i in tqdm(range(len(premises))):
    premise = str(premises[i])
    hypothesis = str(hypos[i])
    model_explanation = str(model_exps[i])
    shap_exp = str(shap_exps[i])
    human_explanation = str(human_exps[i])
    true_label = str(labels[i])
    predicted_label = str(preds[i])

    judge_prompt_base = format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, model_explanation, human_explanation) # format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, predicted_label,human_explanation,model_explanation) #
    response_base = claude_scorer(judge_prompt_base)
    score_base = get_score(response_base)
    just_base = get_justification(response_base)

    judge_prompt_shap = format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, shap_exp, human_explanation) #format_judge_prompt(JUDGE_PROMPT, premise, hypothesis, predicted_label,human_explanation,shap_exp) #
    response_shap = claude_scorer(judge_prompt_shap)
    score_shap = get_score(response_shap)
    just_shap = get_justification(response_shap)


    
    results.append({
        "premise": premise,
        "hypothesis": hypothesis,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "model_explanation": model_explanation,
        'shapbased_explanation' : shap_exp, 
        "human_explanations": human_explanation,
        "judge_prompt_shap": judge_prompt_shap,
        'score_reasoning':score_base,
        'justification_reasoning':just_base,
        'score_shap':score_shap,
        'justification_shap':just_shap
    })

     

  0%|          | 0/179 [00:00<?, ?it/s]

In [73]:
fname,f'{fname}_scored.json'

('Llama_3_8b_inst_explanations', 'Llama_3_8b_inst_explanations_scored.json')

In [74]:
[entry['score_reasoning'] for entry in results][:5] ,[entry['score_shap'] for entry in results][:5]

(['1', '2', '5', '2', '1'], ['1', None, '1', '1', None])

In [75]:
# with open(f'{fname}_scored.json','r') as f:
#     json.dump(f,indent=)
def getscoresvalues(data,key):
    ret=[]
    for entry in data:
        try :
            ret.append(int(entry[key]))
        except:
            ret.append(3)
    
    return ret

sb  = sum(getscoresvalues(results,'score_reasoning' ) ) / len(results)
shaps  = sum(getscoresvalues(results,'score_shap' ) ) / len(results)
sb, shaps

(2.0, 1.7988826815642458)

In [76]:
# import json
# name = f'{name}_base.json'
# with open(name,'w') as f:
#     json.dump(results,f,indent=4)
df = pd.DataFrame(results)

# Save to CSV
df.to_csv(f'{fname}_scored.csv', index=False)

In [77]:
d=results[1]
for key in ['premise', 'hypothesis', 'true_label', 'predicted_label', 'model_explanation', 'shapbased_explanation','human_explanations','score_shap', 'justification_shap']:
    print(f'{key} : {d[key]} \n')

premise : Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink. 

hypothesis : Two kids in numbered jerseys wash their hands. 

true_label : entailment 

predicted_label : entailment 

model_explanation : **SAMPLE INPUT**
    <premise>
    Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.
    </premise>
    
    <Hypothesis>
    Two kids in numbered jerseys wash their hands.
    </Hypothesis>

    **SAMPLE OUTPUT**
    <explanation>
    The hypothesis is entailed by the premise because it is true that two kids in numbered jerseys wash their hands.
    </explanation>
    
    <label>
    entailment
    </label> 

shapbased_explanation : **INPUT FORMAT**
    <premise>
    [premise]
    </premise>
    
    <Hypothesis>
    [hypothesis]
    </Hypothesis>
    
    <shap_toke

In [78]:
def getscores(data,key):
    ret=[]
    for entry in data:
        if entry[key] not in [None,'N/A']:
            ret.append(int(entry[key]))
        else:
            ret.append(0)
            

In [79]:
scores =getscores(results,'score_shap') 
scores
sum(scores)/len(scores)

TypeError: 'NoneType' object is not iterable

In [49]:
from collections import Counter
counts = Counter(scores)

print(counts)

NameError: name 'scores' is not defined

In [96]:
print([entry['true_label'] for entry in results])
print([entry['predicted_label'] for entry in results])

['contradiction', 'entailment', 'entailment', 'entailment', 'contradiction']
['contradiction', 'entailment', 'entailment', 'entailment', 'contradiction']
