In [None]:
# Mount Google Drive at /content/drive so that files stored there
# (read by later cells via /content/drive/MyDrive/...) are accessible.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch==2.3 transformers==4.39.0 appdirs jsonpickle filelock h5py spacy nltk pytest radgraph

In [None]:
!pip install rouge-score evaluate bert_score f1chexbert

### Import model

In [None]:
import os
import shutil


# Source archive on the mounted Drive and the cache location it is copied into.
src_path = "/content/drive/MyDrive/Dual view Slava/radgraph-xl.tar.gz"
cache_dir = "/root/.cache/radgraph/0.1.2"
dst_path = f"{cache_dir}/radgraph-xl.tar.gz"

# Make sure the destination directory exists (no error if it already does),
# then copy the archive into it.
os.makedirs(os.path.dirname(dst_path), exist_ok=True)
shutil.copy(src=src_path, dst=dst_path)

print("Model copied to RadGraph cache location.")


Model copied to RadGraph cache location.


### RadGraph Evaluation

In [None]:
import json
from radgraph import F1RadGraph
from tqdm import tqdm

def load_data(filepath):
    """Read reference/prediction pairs from a JSON file.

    The file must contain an iterable of records, each with a "reference"
    and a "prediction" key. When both values of a record are lists, their
    items are spliced into the output lists; otherwise the two values are
    appended as-is.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A ``(references, predictions)`` tuple of two equally long lists.

    Raises:
        AssertionError: If the two resulting lists differ in length.
    """
    with open(filepath, "r") as handle:
        records = json.load(handle)

    references, predictions = [], []

    for record in records:
        reference, prediction = record["reference"], record["prediction"]
        both_are_lists = isinstance(reference, list) and isinstance(prediction, list)

        if not both_are_lists:
            references.append(reference)
            predictions.append(prediction)
            continue

        # Both sides are lists: flatten them into the running outputs.
        references += reference
        predictions += prediction

    assert len(references) == len(predictions), "Mismatch between references and predictions!"
    print(f"[Info] Loaded {len(references)} reference-prediction pairs.")
    return references, predictions

def analyze_scores(reward_list):
    """Summarize a list of per-pair RadGraph scores.

    Three input shapes are handled:
      * a list of lists, which is flattened one level first;
      * a list of dicts, from which ``["entity"]["f1"]`` and
        ``["relation"]["f1"]`` are read (missing keys count as 0);
      * a list of plain numbers, treated as a single F1 score per pair.

    Args:
        reward_list: The per-pair scores in one of the shapes above.

    Returns:
        A tuple ``(score_type, avg_entity, avg_relation, entity_scores)``
        where ``score_type`` is ``"detailed"`` for dict scores or
        ``"simple"`` for numeric scores. For numeric scores the relation
        average is 0. When there are no scores at all (including a nested
        list whose sublists are all empty) every element is ``None``.
    """
    if not reward_list:
        return None, None, None, None

    if isinstance(reward_list[0], list):
        reward_list = [item for sublist in reward_list for item in sublist]
        # Flattening can leave nothing behind (e.g. [[]]); indexing element 0
        # below would then raise IndexError, so treat it like empty input.
        if not reward_list:
            return None, None, None, None

    if isinstance(reward_list[0], dict):
        entity_scores = [x.get("entity", {}).get("f1", 0) for x in reward_list]
        relation_scores = [x.get("relation", {}).get("f1", 0) for x in reward_list]
        score_type = "detailed"
    else:
        entity_scores = [float(x) for x in reward_list]
        relation_scores = []
        score_type = "simple"

    avg_entity = sum(entity_scores) / len(entity_scores) if entity_scores else 0
    avg_relation = sum(relation_scores) / len(relation_scores) if relation_scores else 0

    return score_type, avg_entity, avg_relation, entity_scores


# Load the reference/prediction pairs to be scored.
references, predictions = load_data("slava_llava_predict.json")

print("Initializing RadGraph evaluator...")
f1_radgraph = F1RadGraph(reward_level="complete")  # Can change to "partial" or "all"

print("Evaluating predictions...")
results = f1_radgraph(
    hyps=predictions,
    refs=references
)

# The evaluator's return shape is handled defensively: a 4-tuple is unpacked
# in full; anything else is treated as "reward list only" and the overall
# mean falls back to 0.0.
if isinstance(results, tuple):
    if len(results) == 4:
        mean_reward, reward_list, hyps_ann, refs_ann = results
    else:
        reward_list = results[0] if len(results) > 0 else []
        mean_reward = 0.0
else:
    reward_list = results
    mean_reward = 0.0

print("\nDebug Info:")
print(f"Results type: {type(results)}")
if isinstance(results, tuple):
    print(f"Tuple length: {len(results)}")
print(f"Reward list type: {type(reward_list)}")
if reward_list:
    print(f"First element type: {type(reward_list[0])}")
    # Only peek inside a nested list when it actually has an element.
    if isinstance(reward_list[0], list) and reward_list[0]:
        print(f"Nested element type: {type(reward_list[0][0])}")

score_type, avg_entity, avg_relation, entity_scores = analyze_scores(reward_list)

print("\n===== RadGraph Evaluation Results =====")
if isinstance(mean_reward, tuple):
    print("Multiple scores returned:")
    for i, score in enumerate(mean_reward):
        print(f"Score {i+1}: {score:.4f}")
else:
    print(f"Average Overall Score: {mean_reward:.4f}")

if score_type is None:
    # analyze_scores returns None values when there are no per-pair scores;
    # formatting those with ':.4f' would raise TypeError.
    print("\nNo per-pair scores were returned; nothing to summarize.")
elif score_type == "detailed":
    print(f"Average Entity F1: {avg_entity:.4f}")
    print(f"Average Relation F1: {avg_relation:.4f}")

    print("\n===== Detailed Pair Scores =====")
    for i, scores in enumerate(reward_list[:5]):
        print(f"\nPair {i+1}:")
        # Report the score exactly as computed (no additive offset).
        print(f"Entity F1: {scores.get('entity', {}).get('f1', 0):.4f}")
        print("Reference:", references[i])
        print("Prediction:", predictions[i])
else:
    print("\nNote: This RadGraph version provides simple F1 scores only")
    print(f"Average F1 Score: {avg_entity:.4f}")

    print("\n===== Sample Scores =====")
    for i, score in enumerate(entity_scores[:5]):
        print(f"Pair {i+1} F1: {score:.4f}")

if entity_scores:
    # Bucket boundaries live in one place so the printed label and the
    # comparison used for counting cannot drift apart.
    LOW_F1_THRESHOLD = 0.5
    HIGH_F1_THRESHOLD = 0.8
    n_scores = len(entity_scores)
    n_low = sum(1 for s in entity_scores if s <= LOW_F1_THRESHOLD)
    n_high = sum(1 for s in entity_scores if s > HIGH_F1_THRESHOLD)

    print("\n===== Score Distribution =====")
    print(f"F1 Range: {min(entity_scores):.4f} - {max(entity_scores):.4f}")
    print(f"Scores ≤ {LOW_F1_THRESHOLD}: {n_low}/{n_scores}")
    print(f"Scores > {HIGH_F1_THRESHOLD}: {n_high}/{n_scores}")

#### General Language Metrics Evaluation

In [None]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from statistics import mean
from tqdm import tqdm
import nltk
# meteor_score uses WordNet for synonym matching, so the corpus must be available.
nltk.download('wordnet')


# BERTScore is optional in this cell: if the `evaluate` package is not
# installed, `bertscore` is set to None instead of failing.
try:
    import evaluate
    bertscore = evaluate.load("bertscore")
except ImportError:
    bertscore = None


JSON_PATH = "slava_llava_predict.json"


with open(JSON_PATH, "r") as f:
    data = json.load(f)

# Per-sample scores, averaged at the end of the cell.
bleu1_list = []
bleu2_list = []
bleu3_list = []
bleu4_list = []
meteor_list = []
rouge_l_list = []
# Normalized (stripped, lower-cased) texts, kept for later cells.
references = []
predictions = []

# Named differently from the imported `rouge_scorer` module so the module
# name is not rebound to a scorer instance.
rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smooth = SmoothingFunction().method1

# n-gram weights for BLEU-1..4; each tuple sums to exactly 1
# (exact thirds for BLEU-3 rather than 0.33, which sums to 0.99).
BLEU_WEIGHTS = {
    1: (1, 0, 0, 0),
    2: (0.5, 0.5, 0, 0),
    3: (1 / 3, 1 / 3, 1 / 3, 0),
    4: (0.25, 0.25, 0.25, 0.25),
}

print("Calculating metrics...")
for sample in tqdm(data):
    ref = sample["reference"].strip().lower()
    pred = sample["prediction"].strip().lower()

    # Whitespace tokenization is used for BLEU and METEOR.
    ref_tokens = ref.split()
    pred_tokens = pred.split()

    bleu1_list.append(sentence_bleu([ref_tokens], pred_tokens, weights=BLEU_WEIGHTS[1], smoothing_function=smooth))
    bleu2_list.append(sentence_bleu([ref_tokens], pred_tokens, weights=BLEU_WEIGHTS[2], smoothing_function=smooth))
    bleu3_list.append(sentence_bleu([ref_tokens], pred_tokens, weights=BLEU_WEIGHTS[3], smoothing_function=smooth))
    bleu4_list.append(sentence_bleu([ref_tokens], pred_tokens, weights=BLEU_WEIGHTS[4], smoothing_function=smooth))

    meteor_list.append(meteor_score([ref_tokens], pred_tokens))

    # ROUGE-L is computed on the untokenized strings.
    scores = rouge_l_scorer.score(ref, pred)
    rouge_l_list.append(scores['rougeL'].fmeasure)

    references.append(ref)
    predictions.append(pred)


print("\n==== Evaluation Results ====")
print(f"BLEU-1: {mean(bleu1_list):.4f}")
print(f"BLEU-2: {mean(bleu2_list):.4f}")
print(f"BLEU-3: {mean(bleu3_list):.4f}")
print(f"BLEU-4: {mean(bleu4_list):.4f}")
print(f"ROUGE-L: {mean(rouge_l_list):.4f}")
print(f"METEOR: {mean(meteor_list):.4f}")


#### BERTScore Evaluation

In [None]:
import evaluate
# Load the BERTScore metric and score the `predictions` against the
# `references`; both lists, and `mean`, come from the language-metrics cell.
bertscore = evaluate.load("bertscore")
print("\nCalculating BERTScore (this may take time)...")
bert_result = bertscore.compute(
    references=references,
    predictions=predictions,
    lang="en",
)
# Report the mean of the per-pair F1 values.
bert_f1_mean = mean(bert_result['f1'])
print(f"BERTScore (F1): {bert_f1_mean:.4f}")

##### CheXbert Evaluation

In [None]:
!wget -O /root/.cache/chexbert/chexbert.pth "https://huggingface.co/StanfordAIMI/RRG_scorers/resolve/main/chexbert.pth?download=true"

In [None]:
import json
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

from f1chexbert import F1CheXbert

def load_data(filepath):
    """Read a JSON file of records and split it into two parallel lists.

    Each record must provide a "reference" and a "prediction" key; the
    values are collected unchanged, in file order.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A ``(references, predictions)`` tuple of two lists.
    """
    with open(filepath, "r") as handle:
        records = json.load(handle)

    # Read both fields of a record together so a malformed record fails at
    # the same point as a per-record loop would.
    pairs = [(record["reference"], record["prediction"]) for record in records]
    references = [reference for reference, _ in pairs]
    predictions = [prediction for _, prediction in pairs]

    print(f"[Info] Loaded {len(references)} pairs")
    return references, predictions

# Read the reference/prediction pairs to be scored.
references, predictions = load_data("slava_llava_predict.json")

# Build the CheXbert scorer and run it over all pairs.
f1chexbert = F1CheXbert()
accuracy, per_sample_accuracy, full_report, report_5 = f1chexbert(hyps=predictions, refs=references)

# ====== Summary Outputs ======
print(f"\nOverall CheXbert Accuracy: {accuracy:.2%}")
print(f"Per-sample Accuracy Array (binary): {per_sample_accuracy}")


def _ranked_report(report):
    """Turn a report dict into a DataFrame of rows with support > 0, best F1 first."""
    frame = pd.DataFrame.from_dict(report, orient='index')
    has_support = frame['support'] > 0
    return frame[has_support].sort_values('f1-score', ascending=False)


# Columns shown for both report tables.
REPORT_COLUMNS = ['precision', 'recall', 'f1-score', 'support']

# ==== 1. Full 14-label report ====
df_all = _ranked_report(full_report)

print("\n==== Full CheXbert Classification Report ====")
print(df_all[REPORT_COLUMNS])

# ==== 2. 5-class report ====
df_5 = _ranked_report(report_5)

print("\n==== 5-Class Report: ['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis', 'Pleural Effusion'] ====")
print(df_5[REPORT_COLUMNS])
