In [None]:
# Install the Hugging Face Hub client; %pip installs into the environment of the running kernel.
%pip install huggingface_hub
from huggingface_hub import snapshot_download

# Fetch the "PPPPPeter/arta" dataset repository into the local "ARTA_LEGO" directory.
local_folder = snapshot_download(
    repo_id="PPPPPeter/arta",
    repo_type="dataset",
    local_dir="ARTA_LEGO"
)

# Report the value returned by snapshot_download (printed here as the download location).
print("All files are in:", local_folder)


In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from collections import Counter
import re

# Tokenizer data needed by word_tokenize. Newer NLTK releases look up the
# 'punkt_tab' tables instead of the pickled 'punkt' models, so fetch both to
# work with either. SmoothingFunction (BLEU) needs no NLTK data download.
nltk.download('punkt')
nltk.download('punkt_tab')

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# JSONL file read by the scoring loop; each record supplies "predict" and "label".
file_path = 'lego_dataset_cot_grnd.jsonl' 

# Per-example scores, appended to by the scoring loop and averaged at the end.
f1_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bleu_scores = []

def extract_answer_content(text):
    """
    Pull out the text enclosed by the first <answer>...</answer> pair.

    Non-string input yields "". If no tag pair is present, the whole input
    is returned instead. The result is whitespace-stripped in every case.
    """
    if isinstance(text, str):
        found = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
        if found is None:
            return text.strip()
        return found.group(1).strip()
    return ""

def compute_word_level_f1(pred_words, label_words):
    """
    Presence-based F1 between the predicted and reference vocabularies.

    Each distinct word found in either sequence becomes one binary sample:
    1 if the word occurs in that sequence, 0 otherwise. Repeated words count
    once. The two indicator vectors are scored with f1_score, which returns
    0 when the score is undefined (zero_division=0).
    """
    pred_vocab = set(pred_words)
    label_vocab = set(label_words)

    y_pred, y_true = [], []
    for word in pred_vocab | label_vocab:
        y_pred.append(int(word in pred_vocab))
        y_true.append(int(word in label_vocab))

    return f1_score(y_true, y_pred, zero_division=0)

# The BLEU smoothing function does not depend on the example, so build it once.
smoothie = SmoothingFunction().method1

# Score every record of the JSONL file: BLEU, vocabulary F1 and ROUGE-1/2/L.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of letting json.loads raise on "".
            continue
        data = json.loads(line)

        # extract_answer_content strips whitespace and maps non-string values
        # (e.g. JSON null) to "", so the raw fields are passed through as-is.
        prediction_full = data.get("predict", "")
        label_full = data.get("label", "")

        # Extract content from <answer> tags
        prediction = extract_answer_content(prediction_full)
        label = extract_answer_content(label_full)

        # Tokenize predictions and labels (lower-cased); empty text -> no tokens
        pred_words = word_tokenize(prediction.lower()) if prediction else []
        label_words = word_tokenize(label.lower()) if label else []

        # Compute BLEU score (using smoothing to avoid zero scores for short sequences)
        if len(label_words) > 0:
            bleu = sentence_bleu([label_words], pred_words, smoothing_function=smoothie)
        else:
            bleu = 0.0  # No reference words
        bleu_scores.append(bleu)

        # Compute word-level F1
        f1 = compute_word_level_f1(pred_words, label_words)
        f1_scores.append(f1)

        # Compute ROUGE scores (reference first, prediction second)
        rouge_scores = scorer.score(label, prediction)
        rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
        rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

# Report the mean of each metric; a bare "N/A" is printed for a metric with no scores.
print(f"Average Word-Level F1: {sum(f1_scores)/len(f1_scores):.4f}" if f1_scores else "N/A")
print(f"Average ROUGE-1 F1: {sum(rouge1_scores)/len(rouge1_scores):.4f}" if rouge1_scores else "N/A")
print(f"Average ROUGE-2 F1: {sum(rouge2_scores)/len(rouge2_scores):.4f}" if rouge2_scores else "N/A")
print(f"Average ROUGE-L F1: {sum(rougeL_scores)/len(rougeL_scores):.4f}" if rougeL_scores else "N/A")
print(f"Average BLEU Score: {sum(bleu_scores)/len(bleu_scores):.4f}" if bleu_scores else "N/A")

In [None]:
import json
import re
from sklearn.metrics import f1_score, confusion_matrix

# JSONL file read by the loop below; each record supplies "predict" and "label".
file_path = 'lego_dataset_cot_state.jsonl' 

# Parallel lists of parsed "Yes"/"No" answers (reference / prediction),
# appended to only for lines where both sides could be parsed.
y_true = []
y_pred = []

def extract_answer_content(text):
    """
    Return the stripped body of the first <answer>...</answer> block.

    Falls back to the stripped input when no such block exists, and to the
    empty string when the input is not a string at all.
    """
    if not isinstance(text, str):
        return ""

    tagged = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    payload = tagged.group(1) if tagged else text
    return payload.strip()

def parse_response(text):
    """
    Normalise a free-text answer to "Yes" or "No".

    The first standalone word "yes" or "no" (matched case-insensitively by
    lower-casing the input) decides the result. Returns None for empty or
    non-string input, or when neither word occurs.
    """
    if not (text and isinstance(text, str)):
        return None

    # Word boundaries keep substrings such as "yesterday" or "know" from matching.
    hit = re.search(r'\b(yes|no)\b', text.strip().lower())
    return hit.group(1).capitalize() if hit else None

# Collect parsed Yes/No answers for every record that has a usable prediction and label.
with open(file_path, 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            # Skip blank lines instead of letting json.loads raise on "".
            continue
        data = json.loads(line)
        predict_full = data.get("predict", "")
        label_full = data.get("label", "")

        # Extract content from <answer> tags
        predict_text = extract_answer_content(predict_full)
        label_text = extract_answer_content(label_full)

        pred_parsed = parse_response(predict_text)
        label_parsed = parse_response(label_text)

        if pred_parsed is None or label_parsed is None:
            print(f"Skipping line {line_num} due to invalid format (pred={pred_parsed}, label={label_parsed})")
            continue

        y_pred.append(pred_parsed)
        y_true.append(label_parsed)

# Convert to binary: Yes=1, No=0 for sklearn metrics
y_true_bin = [1 if y == "Yes" else 0 for y in y_true]
y_pred_bin = [1 if y == "Yes" else 0 for y in y_pred]

print(f"Total Examples: {len(y_true)}")

if y_true_bin:
    # zero_division=0: report 0 instead of warning when "Yes" never occurs.
    f1_state = f1_score(y_true_bin, y_pred_bin, pos_label=1, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # otherwise the matrix collapses to 1x1 and the 4-way unpacking raises.
    tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    print(f"F1-State (F1 on 'Yes'): {f1_state:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
else:
    # Nothing could be parsed; the sklearn metrics are undefined on empty input.
    print("No valid examples to score.")

In [None]:
import json
import re
import numpy as np
from scipy.optimize import linear_sum_assignment
from nltk.tokenize import word_tokenize
import nltk

# word_tokenize needs Punkt tokenizer data. Newer NLTK releases look up
# 'punkt_tab' rather than 'punkt', so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')


def extract_answer_content(text):
    """
    Get the content of the first <answer>...</answer> pair in ``text``.

    Returns the stripped tag content when a pair is found, the stripped
    input when it is not, and "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    hit = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if hit:
        return hit.group(1).strip()
    return text.strip()


def compute_word_level_f1(gt_labels, pred_labels):
    """
    Compute token-level F1 between all labels (case-insensitive).

    All ground-truth labels are joined into one text and tokenized; the same
    is done for the predicted labels. The overlap is the multiset intersection
    of the two token lists, so a token repeated k times in both counts k times.

    Args:
        gt_labels: list of ground-truth label strings.
        pred_labels: list of predicted label strings.

    Returns:
        F1 in [0, 1]; 0.0 when either list is empty or no token is shared.
    """
    # Local import: this cell does not import Counter itself.
    from collections import Counter

    if not gt_labels or not pred_labels:
        return 0.0

    gt_text = " ".join(gt_labels).lower()
    pred_text = " ".join(pred_labels).lower()

    gt_tokens = word_tokenize(gt_text)
    pred_tokens = word_tokenize(pred_text)

    # Count shared tokens with multiplicity. Counting only unique shared tokens
    # while dividing by full token counts would score an exact match containing
    # a repeated token below 1.0.
    overlap = sum((Counter(gt_tokens) & Counter(pred_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gt_tokens)
    return (2 * precision * recall) / (precision + recall)


def parse_json_boxes_and_text(json_str):
    """
    Parse a string that should be a list of dicts:
    [{'bbox_2d': [x1,y1,x2,y2], 'label': '...'}]

    Handles single quotes, malformed JSON, missing keys and non-string labels.

    The text is first normalised: a space is inserted between a digit run and
    an adjacent letter (and between a single letter and a following digit
    run), and missing outer brackets are added. Note that this normalisation
    applies to the whole string, label text included. The result is parsed as
    JSON as-is; only if that fails are single quotes replaced by double
    quotes and parsing retried, so valid JSON containing an apostrophe is not
    corrupted.

    Args:
        json_str: text holding the list; any non-string or empty value is
            treated as "no boxes".

    Returns:
        (labels, boxes): parallel lists. ``boxes`` holds [x1, y1, x2, y2] as
        floats; ``labels`` holds the stripped label text ("" when missing or
        null). Items without a 4-element list under "bbox_2d", or with
        non-numeric coordinates, are skipped. ([], []) on parse failure.
    """
    if not isinstance(json_str, str) or not json_str:
        return [], []

    # Clean up
    cleaned = json_str.strip()
    cleaned = re.sub(r'\b(\d+)\s*([a-zA-Z])', r'\1 \2', cleaned)
    cleaned = re.sub(r'\b([a-zA-Z])\s*(\d+)\b', r'\1 \2', cleaned)

    if not cleaned.startswith('['):
        cleaned = '[' + cleaned
    if not cleaned.endswith(']'):
        cleaned = cleaned + ']'

    try:
        # Try strict JSON first so apostrophes inside valid strings survive.
        boxes_data = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback for Python-style single-quoted dicts.
        requoted = cleaned.replace("'", '"')
        try:
            boxes_data = json.loads(requoted)
        except json.JSONDecodeError as e:
            print(f"Failed to decode JSON: {requoted[:100]}... | Error: {e}")
            return [], []

    if not isinstance(boxes_data, list):
        return [], []

    # Extract labels and bboxes
    labels = []
    boxes = []
    for item in boxes_data:
        if not isinstance(item, dict):
            continue

        raw_label = item.get("label", "")
        if isinstance(raw_label, str):
            label = raw_label.strip()
        elif raw_label is None:
            # JSON null: treat like a missing label rather than crashing on .strip().
            label = ""
        else:
            label = str(raw_label).strip()

        bbox = item.get("bbox_2d", [])
        if isinstance(bbox, list) and len(bbox) == 4:
            try:
                bbox = [float(x) for x in bbox]
            except (TypeError, ValueError):
                continue
            boxes.append(bbox)
            labels.append(label)
    return labels, boxes


def iou(boxA, boxB):
    """
    Intersection-over-union of two boxes given as [x1, y1, x2, y2].

    Returns 0.0 when the union area is not positive.
    """
    # Corners of the overlap rectangle.
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp to zero so non-overlapping boxes contribute no area.
    overlap = max(0, right - left) * max(0, bottom - top)

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - overlap

    if union > 0:
        return overlap / union
    return 0.0


def total_matched_iou_hungarian(gt_boxes, pred_boxes):
    """
    Sum of IoUs under the one-to-one box matching that maximises total IoU.

    Builds the pairwise IoU matrix (rows: ground truth, columns: predictions)
    and solves the assignment problem on its negation. Returns 0.0 when either
    list is empty. The result is a sum over matched pairs, not an average.
    """
    if not gt_boxes or not pred_boxes:
        return 0.0

    iou_matrix = np.array(
        [[iou(gt_box, pred_box) for pred_box in pred_boxes] for gt_box in gt_boxes],
        dtype=float,
    )

    # linear_sum_assignment minimises cost, so negate to maximise IoU.
    gt_idx, pred_idx = linear_sum_assignment(-iou_matrix)
    return float(iou_matrix[gt_idx, pred_idx].sum())


# One dict per scored record: label F1, matched-IoU sum, and the parsed labels/boxes.
results = []
# JSONL file to evaluate; each record supplies "predict" and "label".
file_path = "lego_dataset_obj_cot_all.jsonl"  

with open(file_path, "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue

        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the bad line and keep going rather than aborting the run.
            print(f"Line {line_num}: JSON decode error: {e}")
            continue

        predict_str_full = data.get("predict", "")
        label_str_full = data.get("label", "")

        # Keep only the content of the <answer> tag (or the whole text if there is none).
        predict_str = extract_answer_content(predict_str_full)
        label_str = extract_answer_content(label_str_full)

        # Parse both sides into parallel (labels, boxes) lists.
        pr_labels, pr_boxes = parse_json_boxes_and_text(predict_str)
        gt_labels, gt_boxes = parse_json_boxes_and_text(label_str)

        word_f1 = compute_word_level_f1(gt_labels, pr_labels)
        # NOTE: this is the SUM of IoUs over matched box pairs for the record,
        # not a per-box mean, so it can exceed 1 when several boxes match.
        miou = total_matched_iou_hungarian(gt_boxes, pr_boxes)

        results.append({
            "word_f1": word_f1,
            "iou": miou,
            "gt_labels": gt_labels,
            "pr_labels": pr_labels,
            "gt_boxes": gt_boxes,
            "pr_boxes": pr_boxes
        })



# Summarise only when at least one record was scored; nothing is printed otherwise.
if results:
    avg_f1 = np.mean([r["word_f1"] for r in results])
    # Mean over records of the per-record matched-IoU sums.
    avg_iou = np.mean([r["iou"] for r in results])

    print(f"Average Word-level F1: {avg_f1:.4f}")
    print(f"Average Matched IoU:    {avg_iou:.4f}")
    print(f"Total Examples:         {len(results)}")

