In [None]:
# Refresh the packaging toolchain before installing the pinned evaluation packages.
!pip install --upgrade pip setuptools wheel
# --no-deps installs blanc by itself, without letting pip resolve blanc's own requirements.
# NOTE(review): blanc is not imported in the visible cells — confirm it is still needed.
!pip install --no-deps blanc==0.3.0
# Exact version pins for the Hugging Face stack and the text-similarity metric packages.
!pip install transformers==4.39.3 datasets==2.19.1 evaluate==0.4.1 rouge-score==0.1.2 bert-score==0.3.13


Collecting datasets==2.19.1
  Using cached datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.1
  Using cached evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Using cached datasets-2.19.1-py3-none-any.whl (542 kB)
Using cached evaluate-0.4.1-py3-none-any.whl (84 kB)
[0mInstalling collected packages: datasets, evaluate
[2K  Attempting uninstall: datasets
[2K    Found existing installation: datasets 4.0.0
[2K    Uninstalling datasets-4.0.0:
[2K      Successfully uninstalled datasets-4.0.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [evaluate]
[0mSuccessfully installed datasets-2.19.1 evaluate-0.4.1


In [5]:
# ✅ INSTALL REQUIRED PACKAGES
!pip install --upgrade pip setuptools wheel
# NOTE(review): tqdm, scikit-learn and pandas carry no version pin on the next line,
# so a rerun may resolve different releases than the one that produced the saved output.
!pip install bert_score==0.3.13 rouge-score==0.1.2 nltk==3.9.1 tqdm scikit-learn pandas

# ✅ IMPORTS
import pandas as pd
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from tqdm import tqdm
import nltk
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score_fn

# ✅ DOWNLOAD REQUIRED NLTK MODELS
# Tokenizer models (punkt, punkt_tab) and WordNet data (wordnet, omw-1.4),
# fetched one by one with the downloader's progress output suppressed.
NLTK_RESOURCES = ("punkt", "punkt_tab", "wordnet", "omw-1.4")
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name, quiet=True)

# ✅ LOAD DATA
# NOTE(review): '%2B' is passed to pandas literally (it is the URL-encoded form of '+');
# confirm it matches the name of the file actually present under /content.
csv_path = '/content/sft_pred%2Bexpl_result.csv'  # <-- change path if needed
df = pd.read_csv(csv_path)

# ---------------------------------------------------
# 🧩 1. CLASSIFICATION METRICS
# ---------------------------------------------------
def extract_first_int(s):
    """Return the first decimal digit in ``str(s)`` as an int, or None if there is none.

    Only a single digit is captured, so ``"10"`` yields ``1``. The argument is
    passed through ``str()`` first, so non-string values are accepted.
    """
    first_digit = re.search(r'\d', str(s))
    if first_digit is None:
        return None
    return int(first_digit.group(0))

# Parse a label out of every raw generation: its first digit, or None when it has no digit.
df['llama2_pred_extracted'] = df['llama2_pred'].apply(extract_first_int)

# Raw tallies over all rows (rows without a parsed label match neither 0 nor 1).
num_zeros = (df['llama2_pred_extracted'] == 0).sum()
num_ones = (df['llama2_pred_extracted'] == 1).sum()
matches = (df['llama2_pred_extracted'] == df['Label']).sum()
correct_zeros = ((df['llama2_pred_extracted'] == 0) & (df['Label'] == 0)).sum()
correct_ones = ((df['llama2_pred_extracted'] == 1) & (df['Label'] == 1)).sum()

print(f"Number of 0's: {num_zeros}")
print(f"Number of 1's: {num_ones}")
print(f"Matches: {matches}")
print(f"Correct 0's: {correct_zeros}")
print(f"Correct 1's: {correct_ones}")

# A generation with no digit leaves a missing value in the extracted column, and
# scikit-learn refuses to score missing targets. Score only the rows that produced
# a label and say how many were left out, so the figures below are never silently partial.
has_prediction = df['llama2_pred_extracted'].notna()
num_unparsed = int((~has_prediction).sum())
if num_unparsed:
    print(f"⚠️ {num_unparsed} rows had no parsable prediction and are excluded from the metrics below")
y_true = df.loc[has_prediction, 'Label']
# Cast back to int: the column is no longer integer-typed once it holds missing values.
y_pred = df.loc[has_prediction, 'llama2_pred_extracted'].astype(int)

accuracy = accuracy_score(y_true, y_pred)
# zero_division=0 keeps the value scikit-learn already falls back to, without the warning.
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("\n=== CLASSIFICATION METRICS ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class figure = diagonal entry / row total, i.e. the share of each true class
# that was predicted correctly.
cm = confusion_matrix(y_true, y_pred)
for i in range(len(cm)):
    row_total = cm[i].sum()
    # A class that only appears among the predictions has an empty row; report nan
    # rather than dividing by zero.
    acc = cm[i][i] / row_total if row_total else float('nan')
    print(f"Class {i} accuracy: {acc:.4f}")

# ---------------------------------------------------
# 🧩 2. EXPLANATION METRICS (robust local version)
# ---------------------------------------------------

def safe_tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize``, falling back to ``str.split``.

    Any exception raised by the NLTK tokenizer triggers the whitespace fallback,
    so the caller always gets a token list for string input.
    """
    try:
        tokens = nltk.word_tokenize(text)
    except Exception:
        # Best-effort fallback: plain whitespace splitting.
        tokens = text.split()
    return tokens

def calculate_bleu(candidate, references):
    """Sentence-level BLEU of ``candidate`` against ``references`` with method4 smoothing.

    Args:
        candidate: hypothesis text.
        references: sequence of reference texts.

    Returns:
        0.0 when ``references`` is empty, or when the candidate or the first
        reference tokenizes to nothing; otherwise the ``sentence_bleu`` score.
    """
    # With no reference at all, indexing reference_tokens[0] below would raise
    # IndexError; there is nothing to compare against, so the score is 0.
    if not references:
        return 0.0
    candidate_tokens = safe_tokenize(candidate)
    reference_tokens = [safe_tokenize(ref) for ref in references]
    if len(candidate_tokens) == 0 or len(reference_tokens[0]) == 0:
        return 0.0
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothie)

def calculate_rouge(candidate, reference):
    """Score ``candidate`` against ``reference`` with stemmed ROUGE-1, ROUGE-2 and ROUGE-L.

    Returns:
        dict mapping each ROUGE type name to its F-measure.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    # The scorer takes the reference first and the candidate second.
    per_type = scorer.score(reference, candidate)
    fmeasures = {}
    for rouge_name, rouge_result in per_type.items():
        fmeasures[rouge_name] = rouge_result.fmeasure
    return fmeasures

_BERT_SCORER = None  # shared BERTScorer, built on first use so the model is loaded only once


def _get_bert_scorer():
    """Return the shared BERTScorer, creating it on the first call."""
    global _BERT_SCORER
    if _BERT_SCORER is None:
        # Function-scope import: the top of the file only imports bert_score.score.
        from bert_score import BERTScorer
        _BERT_SCORER = BERTScorer(lang="en", rescale_with_baseline=True)
    return _BERT_SCORER


def calculate_metrics(actual, pred):
    """Compute ROUGE, BLEU, METEOR and BERTScore for one reference/prediction pair.

    Args:
        actual: reference text.
        pred: generated text to evaluate.

    Returns:
        dict with keys ``"rouge"`` (the dict returned by ``calculate_rouge``),
        ``"bleu"``, ``"meteor"`` and ``"bert"`` (BERTScore F1 as a float).
    """
    metrics = {}

    # ROUGE
    metrics["rouge"] = calculate_rouge(pred, actual)

    # BLEU
    metrics["bleu"] = calculate_bleu(pred, [actual])

    # METEOR (tokenized inputs); empty sides score 0 instead of being passed on.
    ref_tokens = safe_tokenize(actual)
    hyp_tokens = safe_tokenize(pred)
    if len(ref_tokens) == 0 or len(hyp_tokens) == 0:
        meteor_val = 0.0
    else:
        meteor_val = meteor_score([ref_tokens], hyp_tokens)
    metrics["meteor"] = meteor_val

    # BERTScore: the module-level score() helper builds the model on every call,
    # which reloaded the checkpoint once per row. Reuse a single scorer configured
    # with the same language and baseline rescaling instead.
    P, R, F1 = _get_bert_scorer().score([pred], [actual], verbose=False)
    metrics["bert"] = float(F1.mean())

    return metrics

# ---------------------------------------------------
# 🧩 3. COMPUTE METRICS
# ---------------------------------------------------
all_metrics = []
# Walk the reference and prediction columns in lockstep, keeping the index label
# so a failing row can be reported.
for i, actual_text, pred_text in tqdm(zip(df.index, df['Output'], df['llama2_pred']), total=len(df)):
    # Rows where either side is not a string are not scored.
    if not isinstance(actual_text, str) or not isinstance(pred_text, str):
        continue
    try:
        all_metrics.append(calculate_metrics(actual_text, pred_text))
    except Exception as e:
        # Best effort: report the row and move on to the next one.
        print(f"⚠️ Error at row {i}: {e}")

# save to file
with open("/content/all_metrics.json", "w") as outfile:
    json.dump(all_metrics, outfile)

# ---------------------------------------------------
# 🧩 4. AVERAGE SCORES
# ---------------------------------------------------
def avg(lst):
    """Arithmetic mean of ``lst``; an empty ``lst`` gives 0."""
    if not lst:
        return 0
    return sum(lst) / len(lst)

# Collect every per-row score into one list per metric in a single pass.
r1, r2, rL, b, mt, bs = [], [], [], [], [], []
for entry in all_metrics:
    rouge_scores = entry['rouge']
    r1.append(rouge_scores['rouge1'])
    r2.append(rouge_scores['rouge2'])
    rL.append(rouge_scores['rougeL'])
    b.append(entry['bleu'])
    mt.append(entry['meteor'])
    bs.append(entry['bert'])

# Averages cover only the rows that were actually scored.
print("\n=== EXPLANATION METRICS (AVERAGE) ===")
print(f"Average ROUGE-1: {avg(r1):.4f}")
print(f"Average ROUGE-2: {avg(r2):.4f}")
print(f"Average ROUGE-L: {avg(rL):.4f}")
print(f"Average BLEU:    {avg(b):.4f}")
print(f"Average METEOR:  {avg(mt):.4f}")
print(f"Average BERT:    {avg(bs):.4f}")


[0mNumber of 0's: 1397
Number of 1's: 1647
Matches: 2315
Correct 0's: 1095
Correct 1's: 1220

=== CLASSIFICATION METRICS ===
Accuracy: 0.7605
Precision: 0.7407
Recall: 0.8016
F1 Score: 0.7700
Class 0 accuracy: 0.7194
Class 1 accuracy: 0.8016


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the 


=== EXPLANATION METRICS (AVERAGE) ===
Average ROUGE-1: 0.5117
Average ROUGE-2: 0.4355
Average ROUGE-L: 0.4388
Average BLEU:    0.2555
Average METEOR:  0.3643
Average BERT:    0.1657



