In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
from pathlib import Path
from collections import Counter, defaultdict
from PIL import Image

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings from all
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
try:
    import google.colab
    from google.colab import drive

    !uv pip install anomalib
    !uv pip install open-clip-torch
    !uv pip install qwen-vl-utils
    !uv pip install transformers==4.52.4
    !uv pip install langchain-chroma langchain-huggingface
    !uv pip install langchain_community pypdf beautifulsoup4
    !uv pip install -U bitsandbytes
    !uv pip install -U torchao
    !uv pip install rank_bm25

    drive.mount('/content/drive', force_remount=True)

    # Colab Root
    PROJECT_ROOT = Path('/content/drive/Othercomputers/Mac/multiModal_anomaly_report') # 본인 경로 수정: Mac/Window
    DATA_ROOT = PROJECT_ROOT / "dataset" / "MMAD"

except ImportError:

    # Local Root
    PROJECT_ROOT = Path.cwd().parents[1]
    DATA_ROOT = PROJECT_ROOT / "datasets" / "MMAD"

os.chdir(PROJECT_ROOT) # 현재 경로 수정
print(f"Current working directory: {os.getcwd()}")

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m98 packages[0m [2min 1.35s[0m[0m
[2K[2mPrepared [1m12 packages[0m [2min 488ms[0m[0m
[2K[2mInstalled [1m12 packages[0m [2min 17ms[0m[0m
 [32m+[39m [1manomalib[0m[2m==2.2.0[0m
 [32m+[39m [1mfreia[0m[2m==0.2[0m
 [32m+[39m [1mimagecodecs[0m[2m==2026.1.14[0m
 [32m+[39m [1mjsonargparse[0m[2m==4.46.0[0m
 [32m+[39m [1mkornia[0m[2m==0.8.2[0m
 [32m+[39m [1mkornia-rs[0m[2m==0.1.10[0m
 [32m+[39m [1mlightning[0m[2m==2.6.1[0m
 [32m+[39m [1mlightning-utilities[0m[2m==0.15.3[0m
 [32m+[39m [1mpytorch-lightning[0m[2m==2.6.1[0m
 [32m+[39m [1mrich-argparse[0m[2m==1.7.2[0m
 [32m+[39m [1mtorchmetrics[0m[2m==1.8.2[0m
 [32m+[39m [1mtypeshed-client[0m[2m==2.8.2[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m57 packages[0m [2min 203ms[0m[0m
[2K[2mPrepared [1m2 packages[0m [2min 97ms[0m[0m
[2K[2mInstalled [1m2 pac

In [3]:
# RAG dataset paths: each one is a single file directly under DATA_ROOT.
# Despite its "_ROOT" suffix, DOMAIN_KNOWLEDGE_ROOT is a JSON file path, not a directory.
DOMAIN_KNOWLEDGE_ROOT = DATA_ROOT / "domain_knowledge.json"
PDF_PATH = DATA_ROOT / "packaging_guide.pdf"
CFIA_JSON_PATH = DATA_ROOT / "cfia_knowledge.json"

In [4]:
from src.rag import Indexer, Retrievers, PDFKnowledgeLoader, JSONKnowledgeLoader

# PDF knowledge source, loaded with chunk_size=1000 and chunk_overlap=100
# (the units are whatever PDFKnowledgeLoader defines).
pdf_loader = PDFKnowledgeLoader(
    pdf_path=PDF_PATH,
    chunk_size=1000,
    chunk_overlap=100,
)
pdf_docs = pdf_loader.load()
print(f"PDF Total: {len(pdf_docs)} chunks")

# CFIA knowledge source, loaded from its JSON file.
cfia_docs = JSONKnowledgeLoader(CFIA_JSON_PATH).load()
print(f"CFIA Total: {len(cfia_docs)} docs")

PDF Total: 20 chunks
CFIA Total: 26 docs


In [8]:
from src.rag import RAGEvaluator, TEST_QUERIES_MMAD

# Indexer over the domain-knowledge JSON; get_or_create() supplies the
# vector-store handle and load_documents() the documents given to the retrievers.
indexers = Indexer(
    json_path=DOMAIN_KNOWLEDGE_ROOT,
    persist_dir="vectorstore/domain_knowledge",
)
vs_config = indexers.get_or_create()
domain_docs = indexers.load_documents()


def _evaluate_mode(mode):
    """Evaluate one retrieval mode on TEST_QUERIES_MMAD with k=3 and return its metrics."""
    retriever = Retrievers(vs_config, documents=domain_docs, mode=mode)
    return RAGEvaluator(retriever).evaluate(TEST_QUERIES_MMAD, k=3)


def _metrics_row(label, hit_rate, mrr, spec=">10.3f"):
    """Format one table row: a 12-wide label followed by two 10-wide numbers."""
    return f"{label:12} {hit_rate:{spec}} {mrr:{spec}}"


result_dense = _evaluate_mode("dense")
result_hybrid = _evaluate_mode("hybrid")

# Hybrid minus dense, printed with an explicit sign.
hit_rate_delta = result_hybrid['hit_rate'] - result_dense['hit_rate']
mrr_delta = result_hybrid['mrr'] - result_dense['mrr']

print(f"{'Mode':<12} {'Hit Rate':>10} {'MRR':>10}")
print(_metrics_row('Dense', result_dense['hit_rate'], result_dense['mrr']))
print(_metrics_row('Hybrid', result_hybrid['hit_rate'], result_hybrid['mrr']))
print(_metrics_row('Delta', hit_rate_delta, mrr_delta, spec=">+10.3f"))

Mode           Hit Rate        MRR
Dense             0.600      0.600
Hybrid            0.600      0.550
Delta            +0.000     -0.050


In [6]:
from src.utils.loaders import load_json

# Experiment settings.
# LLM identifiers listed as options:
#   int4: gemma3-4b-int4, gemma3-12b-int4, gemma3-27b-int4
#   int8: gemma3-4b-int8, gemma3-12b-int8, gemma3-27b-int8
LLM = "gemma3-27b-int4"
MMAD_CLASS_JSON = DATA_ROOT / "mmad_10classes.json"

# Output layout: output/<LLM>/rag/<variant>. OUTPUT_DIR stays a plain string.
OUTPUT_DIR = "output/" + LLM
OUTPUT_RAG_ROOT = Path(OUTPUT_DIR, "rag")

# One sub-directory per experiment variant.
OUTPUT_AD, OUTPUT_ORIGIN, OUTPUT_PDF, OUTPUT_CFIA, OUTPUT_PDF_CFIA = (
    OUTPUT_RAG_ROOT / variant
    for variant in ("AD", "original", "pdf", "cfia", "pdf_cfia")
)

SAMPLE_PER_FOLDER = 3  # quick test: 3 images per folder

In [None]:
class_10_json = load_json(MMAD_CLASS_JSON)

# AD + LLM
!python scripts/run_experiment.py \
    --llm {LLM} \
    --ad-model "patchcore" \
    --sample-per-folder {SAMPLE_PER_FOLDER} \
    --data-root {DATA_ROOT} \
    --output-dir {OUTPUT_AD} \
    --mmad-json {MMAD_CLASS_JSON} \
    --batch-mode true

Stratified sampling: 3장/폴더, 33폴더
  Total: 4224 -> Sampled: 99 (normal=30, anomaly=69)
MMAD Experiment Runner
Experiment:  patchcore_gemma3-27b-int4_1shot
LLM:         gemma3-27b-int4
AD model:    patchcore
RAG:         disabled
Few-shot:    1
Template:    Random_template
Image size:  (384, 384)
Images:      99 / 4224
Data root:   /content/drive/Othercomputers/Mac/multiModal_anomaly_report/dataset/MMAD
Output:      output/gemma3-27b-int4/rag/AD/answers_1_shot_gemma3-27b-int4_Random_template_with_patchcore_v0_99img.json

Filtered MMAD json: 99 images -> output/gemma3-27b-int4/rag/AD/_sampled_mmad.json

Running AD Model Inference
Script:       /content/drive/Othercomputers/Mac/multiModal_anomaly_report/scripts/run_ad_inference.py
Config:       configs/anomaly.yaml
Checkpoint:   /content/drive/Othercomputers/Mac/multiModal_anomaly_report/dataset/MMAD/checkpoints/patchcore_384
Version:      v0
Data root:    /content/drive/Othercomputers/Mac/multiModal_anomaly_report/dataset/MMAD
MMAD JSON: 

In [7]:
# With RAG: Only Dense
!python scripts/run_experiment.py \
    --llm {LLM} \
    --ad-model "patchcore" \
    --rag --rag-mode dense \
    --sample-per-folder {SAMPLE_PER_FOLDER} \
    --data-root {DATA_ROOT} \
    --output-dir {OUTPUT_ORIGIN} \
    --mmad-json {MMAD_CLASS_JSON} \
    --batch-mode true

Stratified sampling: 3장/폴더, 33폴더
  Total: 4224 -> Sampled: 99 (normal=30, anomaly=69)
2026-02-24 11:10:47.146702: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771931447.169457    2734 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771931447.177045    2734 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771931447.196122    2734 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771931447.196146    2734 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W

In [None]:
# With RAG: Hybrid
!python scripts/run_experiment.py \
    --llm {LLM} \
    --ad-model "patchcore" \
    --rag --rag-mode hybrid \
    --sample-per-folder {SAMPLE_PER_FOLDER} \
    --data-root {DATA_ROOT} \
    --output-dir {OUTPUT_ORIGIN} \
    --mmad-json {MMAD_CLASS_JSON} \
    --batch-mode true

In [6]:
# # With RAG
# LLM = "gemini-2.5-flash-lite"
# !python scripts/run_experiment.py \
#     --llm {LLM} \
#     --ad-model "patchcore" \
#     --rag \
#     --data-root {DATA_ROOT} \
#     --output-dir {OUTPUT_ORIGIN} \
#     --mmad-json {MMAD_CLASS_JSON} \
#     --batch-mode true

In [None]:
# # 결과 비교
# configs = {
#     # "Baseline": OUTPUT_DIR,
#     # "AD only": OUTPUT_AD,
#     "AD+RAG": OUTPUT_ORIGIN,
#     "AD+RAG (+PDF)": OUTPUT_PDF,
# }

# def load_latest_meta(output_dir):
#     files = sorted(Path(output_dir).glob("*.meta.json"))
#     if not files:
#         return None
#     return json.load(open(files[-1]))

# rows = {}
# for label, out_dir in configs.items():
#     meta = load_latest_meta(out_dir)
#     rows[label] = meta

# # 기준: Baseline accuracy
# # bl_acc = rows["Baseline"]["accuracy"] if rows["Baseline"] else 0

# print(f"{'Config':<12} {'Accuracy':>10} {'Correct':>9} {'Total':>7} {'Diff':>8}")
# print("=" * 52)
# for label, meta in rows.items():
#     if meta is None:
#         print(f"{label:<12} {'결과없음':>10}")
#         continue
#     acc  = meta.get("accuracy", 0)
#     cor  = meta.get("total_correct", 0)
#     tot  = meta.get("total_questions", 0)
#     # diff = acc - bl_acc
#     # sign = "+" if diff > 0 else ""
#     # diff_str = f"{sign}{diff:.2f}" if label != "Baseline" else "-"
#     print(f"{label:<12} {acc:>9.2f}% {cor:>9} {tot:>7}") # {diff_str:>8}

# def load_latest_answers(output_dir):
#     files = [f for f in sorted(Path(output_dir).glob("answers_*.json")) if ".meta." not in f.name]
#     return json.load(open(files[-1])) if files else []

# answers = {label: load_latest_answers(d) for label, d in configs.items()}

# # good vs anomaly 정확도
# print(f"{'Config':<25} {'good':>8} {'anomaly':>10}")
# print("-" * 46)
# for label, ans in answers.items():
#     good    = [a for a in ans if "/good/" in a["image"]]
#     anomaly = [a for a in ans if "/good/" not in a["image"]]
#     g_acc = sum(a["gpt_answer"]==a["correct_answer"] for a in good) / len(good) * 100 if good else 0
#     a_acc = sum(a["gpt_answer"]==a["correct_answer"] for a in anomaly) / len(anomaly) * 100 if anomaly else 0
#     print(f"{label:<25} {g_acc:>7.1f}% {a_acc:>9.1f}%")