In [3]:
!pip install datasets==1.18.3
!pip install bert_score
!pip install rouge_score
!pip install blanc
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import re

def calculate_metrics(true_labels, predicted_labels):
    """Score predicted labels against the gold labels.

    Args:
        true_labels: gold labels, one per example.
        predicted_labels: predicted labels, aligned with ``true_labels``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
        and F1 are computed with ``average='macro'``.
    """
    overall_accuracy = accuracy_score(true_labels, predicted_labels)
    # The three macro-averaged scorers share one call signature, so run them
    # in a fixed order (precision, recall, F1) and unpack the results.
    macro_precision, macro_recall, macro_f1 = (
        scorer(true_labels, predicted_labels, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return overall_accuracy, macro_precision, macro_recall, macro_f1

# Results file to evaluate; the code below reads its 'llama2_pred', 'Label'
# and 'Output' columns.
RESULTS_CSV_PATH = '/content/cpt_pred_expl_result.csv'
df = pd.read_csv(RESULTS_CSV_PATH)

def clean(text):
  """Map a free-text verdict to a coarse outcome code.

  Only the first 50 characters of ``text`` are inspected (lower-cased, with
  punctuation removed). Terms are matched as substrings, so a stem such as
  "dismiss" also covers "dismissed" or "dismissal".

  Args:
      text: the raw prediction string. Anything that is not a ``str``
          (e.g. the NaN pandas yields for an empty CSV cell) is treated as
          having no recognisable verdict.

  Returns:
      1 if only acceptance wording is found,
      0 if only rejection wording is found,
      2 if both are found,
      3 if neither is found or ``text`` is not a string.
  """
  # A missing cell used to raise TypeError on the slice below.
  if not isinstance(text, str):
    return 3

  positive_terms = ("succeeded", "succeeds", "succeed", "approved", "approve",
                    "affirmed", "accept", "allow", "allowed", "granted", "accepted")
  # "disallow" / "disapprove" are listed as stems so that every inflection
  # ("disallows", "disallowed", ...) counts as a rejection.
  negative_terms = ("dismiss", "reject", "remand", "denied", "rejected",
                    "disallow", "disapprove", "disapproved", "dismissed",
                    "revoked", "annulled", "invalidated", "disallowed", "revoke")

  cleaned = re.sub(r'[^\w\s]', '', text[:50].lower())

  # Longest alternatives first, so e.g. "disallowed" is consumed whole rather
  # than only its "disallow" prefix.
  negative_pattern = re.compile(
      '|'.join(re.escape(term)
               for term in sorted(negative_terms, key=len, reverse=True)))
  has_negative = negative_pattern.search(cleaned) is not None

  # Rejection words such as "disallowed" and "disapproved" contain the
  # acceptance stems "allow" / "approve". Blank the rejection hits out before
  # looking for acceptance wording, otherwise a plain rejection is reported
  # as "both" (2) and silently dropped from the metrics.
  remainder = negative_pattern.sub(' ', cleaned)
  has_positive = any(term in remainder for term in positive_terms)

  if has_positive and has_negative:
    return 2
  if has_positive:
    return 1
  if has_negative:
    return 0
  return 3

# NOTE(review): both columns are sliced with [1:], so the first data row is
# left out of the classification metrics - confirm this is intended.
pred_list = df['llama2_pred'].to_list()
actual = [int(label) for label in df['Label'].tolist()[1:]]
pred = [clean(raw_prediction) for raw_prediction in pred_list[1:]]

# Keep only the examples whose prediction resolved to a clear 0 or 1;
# codes 2 (mixed wording) and 3 (no verdict wording) are left out.
decided_pairs = [(gold, guess) for gold, guess in zip(actual, pred) if guess in (0, 1)]
a1 = [gold for gold, _ in decided_pairs]
p1 = [guess for _, guess in decided_pairs]

accuracy, precision, recall, f1 = calculate_metrics(a1, p1)
for caption, value in (
    ("Accuracy:", accuracy),
    ("Macro Precision:", precision),
    ("Macro Recall:", recall),
    ("Macro F1-score:", f1),
):
  print(caption, value)

from sklearn.metrics import confusion_matrix

def class_wise_accuracy(true_labels, predicted_labels):
    """Per-class accuracy: the diagonal of the confusion matrix over each row sum.

    Args:
        true_labels: gold labels, one per example.
        predicted_labels: predicted labels, aligned with ``true_labels``.

    Returns:
        A list of floats, one per row of ``confusion_matrix(true_labels,
        predicted_labels)``, in row order. A row that sums to zero yields
        ``float('nan')``.
    """
    cm = confusion_matrix(true_labels, predicted_labels)
    per_class = []
    for class_idx, row in enumerate(cm):
        support = int(row.sum())
        # An all-zero row would otherwise divide 0 by 0 and emit a NumPy
        # RuntimeWarning; report NaN explicitly instead.
        per_class.append(float(row[class_idx]) / support if support else float('nan'))
    return per_class

cwa = class_wise_accuracy(a1, p1)
# Positions are printed 1-based: "Class 1" is the first confusion-matrix row.
for i, acc in enumerate(cwa):
    print(f"Class {i+1} accuracy: {acc}")

# ========== EXPLANATION PART ==========

from datasets import load_metric
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# NLTK tokenizer resources (nltk.word_tokenize is used for BLEU below).
for nltk_package in ('punkt', 'punkt_tab'):
    nltk.download(nltk_package)

# Metric objects loaded once and reused for every example.
# NOTE(review): `bleu` is loaded here but not referenced in the visible code;
# BLEU is computed with nltk's sentence_bleu instead.
bertscore, meteor, bleu, rouge = (
    load_metric(metric_name, trust_remote_code=True)
    for metric_name in ("bertscore", "meteor", "bleu", "rouge")
)

def calculate_bleu_score(candidate, references):
    """Sentence-level BLEU of ``candidate`` against the given reference strings.

    Args:
        candidate: the generated text.
        references: an iterable of reference texts.

    Returns:
        The value of ``sentence_bleu`` computed on ``nltk.word_tokenize``
        tokens with ``SmoothingFunction().method4`` smoothing.
    """
    hypothesis_tokens = nltk.word_tokenize(candidate)
    tokenized_references = []
    for reference in references:
        tokenized_references.append(nltk.word_tokenize(reference))
    return sentence_bleu(
        tokenized_references,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

def metrics(actual, pred):
  """Compute ROUGE, BERTScore, METEOR and BLEU for one reference/prediction pair.

  Args:
      actual: the reference text.
      pred: the generated text.

  Returns:
      A dict with keys ``"rouge"`` (a one-element list holding the mid
      F-measures for rouge1, rouge2 and rougeL), ``"bert"`` (BERTScore F1
      using ``bert-base-uncased``), ``"meteor"`` and ``"bleu"``.
  """
  references = [actual]
  predictions = [pred]

  rouge_result = rouge.compute(predictions=predictions, references=references)
  rouge_fmeasures = {
      variant: rouge_result[variant].mid.fmeasure
      for variant in ("rouge1", "rouge2", "rougeL")
  }

  bert_result = bertscore.compute(
      predictions=predictions,
      references=references,
      model_type="bert-base-uncased",
  )
  meteor_result = meteor.compute(predictions=predictions, references=references)

  return {
      # Kept as a single-element list: consumers index it with ['rouge'][0].
      "rouge": [rouge_fmeasures],
      "bert": bert_result["f1"][0],
      "meteor": meteor_result["meteor"],
      "bleu": calculate_bleu_score(pred, references),
  }

from tqdm import tqdm
import json
all_metrics = []
# `actual` and `pred` are rebound to per-row strings inside this loop.
for i, row in tqdm(df.iterrows()):
    # Rows whose 'Output' cell is not a string are skipped.
    if not isinstance(row['Output'], str):
        continue
    actual, pred = row['Output'], row['llama2_pred']
    metric = metrics(actual, pred)
    all_metrics.append(metric)

# Persist the per-example scores as JSON.
with open("/content/all_metrics.json", "w") as outfile:
    outfile.write(json.dumps(all_metrics))

def avg(l):
  """Arithmetic mean of the numbers in ``l``.

  Args:
      l: a sized collection of numbers (anything supporting ``len`` and ``sum``).

  Returns:
      ``sum(l) / len(l)``, or ``float('nan')`` when ``l`` is empty.
  """
  # An empty collection (e.g. no row had a usable reference) used to raise
  # ZeroDivisionError and abort the report; NaN marks "no data" instead.
  if len(l) == 0:
    return float('nan')
  return sum(l)/len(l)

# ROUGE: one list of per-example F-measures for each variant.
r1, r2, r3 = (
    [entry['rouge'][0][variant] for entry in all_metrics]
    for variant in ('rouge1', 'rouge2', 'rougeL')
)
# Note: the "R3" label is attached to the rougeL values.
for caption, values in (("Average R1: ", r1), ("Average R2: ", r2), ("Average R3: ", r3)):
    print(caption, avg(values))

# BLEU
blue = [entry['bleu'] for entry in all_metrics]
print("Average BLEU: ", avg(blue))

# METEOR
meteor_scores = [entry['meteor'] for entry in all_metrics]
print("Average METEOR: ", avg(meteor_scores))

# BERTScore F1
bert_scores = [entry['bert'] for entry in all_metrics]
print("Average BERT: ", avg(bert_scores))

# ========== BLANC ==========
from blanc import BlancHelp
import nltk
nltk.download('punkt')

# One BlancHelp scorer, created once on the GPU and reused by cal_BLANC.
bl = BlancHelp(device='cuda', inference_batch_size=128)

def cal_BLANC(actual, pred):
  """Return ``bl.eval_once(actual, pred)`` for a single pair of texts.

  NOTE(review): ``actual`` is passed as the first argument and ``pred`` as the
  second; confirm this matches the document/summary order eval_once expects.
  """
  blanc_score = bl.eval_once(actual, pred)
  return blanc_score

all_blanc = []
for i, row in tqdm(df.iterrows()):
  # Rows whose 'Output' cell is not a string are skipped.
  if not isinstance(row['Output'], str):
    continue
  actual, pred = row['Output'], row['llama2_pred']
  metric = cal_BLANC(actual, pred)
  all_blanc.append(metric)

print("Average BLANC: ", avg(all_blanc))


Collecting blanc
  Using cached blanc-0.3.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy<2.0,>=1.0 (from blanc)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
INFO: pip is looking at multiple versions of blanc to determine which version is compatible with other requirements. This could take a while.
Collecting blanc
  Using cached blanc-0.3.3-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.3.2-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.3.1-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.3.0-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.2.8-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.2.7-py3-none-any.whl.metadata (14 kB)
  Using cached blanc-0.2.6-py3-none-any.whl.metadata (14 kB)
INFO: pip is still looking at multiple versions of blanc to determine which version is compatible with other requirements. This could take a while.
  Using cached blanc-0.2.5-py3-none-any.whl.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
3045it [3:46:04,  4.45s/it]

Average R1:  0.33548035868412995
Average R2:  0.15489115363851957
Average R3:  0.2287480426845999
Average BLEU:  0.0898416442352854
Average METEOR:  0.23260928623376995
Average BERT:  0.5834046841548581





ModuleNotFoundError: No module named 'blanc'