### Import & Configuration

In [7]:
import os
# --- FIX FOR OMP: Error #15 ---
# Variabel lingkungan ini adalah solusi untuk konflik antara runtime OpenMP.
# Ini memberitahu sistem untuk mengizinkan beberapa pustaka OpenMP untuk dimuat.
# Letakkan ini SEBELUM mengimpor torch, pandas, atau numpy.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import easyocr
import pandas as pd
from tqdm.notebook import tqdm # Gunakan tqdm.notebook untuk tampilan yang lebih baik di Jupyter
import re
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# --- CONFIGURATION ---
# The root directory containing all your experiment folders (exp1, exp2, etc.)
ROOT_DIR = "dataset_eval"
OUTPUT_CSV = "ocr_accuracy_results.csv"
OUTPUT_DIR_PLOTS = "output_plots" # Folder untuk menyimpan visualisasi

print("✅ Pustaka dan konfigurasi berhasil dimuat.")

✅ Pustaka dan konfigurasi berhasil dimuat.


### Def Library Cell

In [8]:
def get_ground_truth(filename):
    """Mengekstrak jenis onomatope dan kata kunci dasar dari nama file."""
    name_part = filename.split('_')[0].lower()
    
    if "byur" in name_part:
        return "byur", "byur"
    if "dutt" in name_part:
        return "dutt", "dutt"
    if "gluduk" in name_part:
        return "gluduk", "gluduk"
    if "gong" in name_part:
        return "gongg", "gongg"
    if "ting" in name_part:
        return "ting", "ting"
    if "toktok" in name_part:
        return "toktok", "tok"
    return None, None

def normalize_text(text):
    """Membersihkan dan menormalkan teks untuk perbandingan."""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def check_match_flexible(recognized_text, onomatopoeia_type, base_keyword):
    """Memeriksa kecocokan teks dengan aturan yang lebih fleksibel."""
    normalized_recognized = normalize_text(recognized_text)

    if onomatopoeia_type == "dutt":
        # Untuk 'dutt', cari pola 'du...t...' (minimal satu 'u' dan satu 't')
        return 1 if re.search(r'du+t+', normalized_recognized) else 0
    else:
        # Untuk yang lain, periksa apakah kata kunci dasar ada di dalam teks yang dikenali
        return 1 if base_keyword in normalized_recognized else 0

print("✅ Fungsi bantuan (get_ground_truth, normalize_text, check_match_flexible) telah didefinisikan.")

✅ Fungsi bantuan (get_ground_truth, normalize_text, check_match_flexible) telah didefinisikan.


### Model Initiation and preparation

In [9]:
print("Initializing EasyOCR... This might take a moment.")
# Inisialisasi pembaca OCR untuk bahasa Inggris.
# Gunakan gpu=False jika Anda mengalami crash karena memori GPU tidak cukup.
reader = easyocr.Reader(['en']) 

# Siapkan list kosong untuk menampung semua hasil dari semua folder
all_results = []

print("✅ EasyOCR ready to be used.")

Initializing EasyOCR... This might take a moment.
✅ EasyOCR ready to be used.


#### Folder by Folder Processing

In [10]:
# Cari semua folder eksperimen di dalam direktori root
experiment_folders = [d for d in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, d))]

print(f"Ditemukan {len(experiment_folders)} folder eksperimen untuk diproses: {experiment_folders}")

# Iterasi dan proses setiap folder
for experiment_name in experiment_folders:
    dirpath = os.path.join(ROOT_DIR, experiment_name)
    image_files = [f for f in os.listdir(dirpath) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    if not image_files:
        continue

    print(f"\n▶️ Memulai pemrosesan untuk eksperimen: {experiment_name}...")
    
    for filename in tqdm(image_files, desc=f"Reading text from {experiment_name}"):
        image_path = os.path.join(dirpath, filename)
        onomatopoeia_type, base_keyword = get_ground_truth(filename)
        
        if onomatopoeia_type is None:
            continue

        try:
            ocr_result = reader.readtext(image_path, detail=0)
            recognized_text = " ".join(ocr_result)
            is_correct = check_match_flexible(recognized_text, onomatopoeia_type, base_keyword)
            
            all_results.append({
                "experiment": experiment_name, "filename": filename,
                "onomatopoeia_type": onomatopoeia_type,
                "ground_truth_keyword": base_keyword,
                "recognized_text": recognized_text,
                "is_correct": is_correct
            })
        except Exception as e:
            all_results.append({
                "experiment": experiment_name, "filename": filename,
                "onomatopoeia_type": onomatopoeia_type,
                "ground_truth_keyword": base_keyword,
                "recognized_text": f"ERROR: {e}",
                "is_correct": 0
            })
        finally:
            # FIX PENTING: Bersihkan cache memori GPU setelah setiap gambar
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    print(f"✅ Selesai memproses {experiment_name}.")

print("\n🎉 Semua folder telah berhasil diproses!")

Ditemukan 5 folder eksperimen untuk diproses: ['exp1_sd15_2050', 'exp2_sd15_3050', 'exp3_sd15_3050_v2', 'exp4_sd15_3050_v3', 'exp5_sdxl_a6000']

▶️ Memulai pemrosesan untuk eksperimen: exp1_sd15_2050...


Reading text from exp1_sd15_2050:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Selesai memproses exp1_sd15_2050.

▶️ Memulai pemrosesan untuk eksperimen: exp2_sd15_3050...


Reading text from exp2_sd15_3050:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Selesai memproses exp2_sd15_3050.

▶️ Memulai pemrosesan untuk eksperimen: exp3_sd15_3050_v2...


Reading text from exp3_sd15_3050_v2:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Selesai memproses exp3_sd15_3050_v2.

▶️ Memulai pemrosesan untuk eksperimen: exp4_sd15_3050_v3...


Reading text from exp4_sd15_3050_v3:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Selesai memproses exp4_sd15_3050_v3.

▶️ Memulai pemrosesan untuk eksperimen: exp5_sdxl_a6000...


Reading text from exp5_sdxl_a6000:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Selesai memproses exp5_sdxl_a6000.

🎉 Semua folder telah berhasil diproses!


### Final Analysis and Storing

In [11]:
if not all_results:
    print("Tidak ada gambar yang diproses. Harap jalankan Sel 4 terlebih dahulu.")
else:
    # Buat dan simpan DataFrame
    df = pd.DataFrame(all_results)
    df.to_csv(OUTPUT_CSV, index=False)
    
    # --- SECTION VISUALIZATION ---
    print("\n--- Generating Visualizations ---")
    os.makedirs(OUTPUT_DIR_PLOTS, exist_ok=True)

    # 1. Visualisasi Akurasi per Eksperimen (Bar Chart)
    accuracy_by_exp = df.groupby('experiment')['is_correct'].mean().reset_index()
    accuracy_by_exp['is_correct'] *= 100
    accuracy_by_exp.rename(columns={'is_correct': 'accuracy_%'}, inplace=True)
    accuracy_by_exp.sort_values('accuracy_%', ascending=False, inplace=True)

    plt.figure(figsize=(12, 7))
    sns.barplot(data=accuracy_by_exp, x='experiment', y='accuracy_%', palette='viridis')
    plt.title('Performa OCR Terbaik per Eksperimen', fontsize=16)
    plt.xlabel('Eksperimen', fontsize=12)
    plt.ylabel('Akurasi (%)', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plot_path_bar = os.path.join(OUTPUT_DIR_PLOTS, 'best_experiment_ocr_performance.png')
    plt.savefig(plot_path_bar)
    plt.close()
    print(f"Bar chart performa terbaik disimpan di: {plot_path_bar}")
    
    # 2. Visualisasi Keberhasilan OCR per Kategori (Heatmap)
    pivot_df = df.pivot_table(index='experiment', columns='onomatopoeia_type', values='is_correct', aggfunc='sum')
    
    plt.figure(figsize=(14, 8))
    sns.heatmap(pivot_df, annot=True, fmt='g', cmap='YlGnBu', linewidths=.5)
    plt.title('Heatmap Keberhasilan OCR (Jumlah Gambar yang Lolos)', fontsize=16)
    plt.xlabel('Jenis Onomatope', fontsize=12)
    plt.ylabel('Eksperimen', fontsize=12)
    plt.tight_layout()
    plot_path_heatmap = os.path.join(OUTPUT_DIR_PLOTS, 'ocr_success_heatmap.png')
    plt.savefig(plot_path_heatmap)
    plt.close()
    print(f"Heatmap keberhasilan disimpan di: {plot_path_heatmap}")

    # --- SECTION SUMMARY ---
    overall_accuracy = df['is_correct'].mean() * 100
    print("\n--- OCR Evaluation Complete ---")
    print(f"Hasil detail disimpan ke {OUTPUT_CSV}")
    print(f"Akurasi OCR Keseluruhan: {overall_accuracy:.2f}%")
    print("\nAkurasi per eksperimen:")
    print(accuracy_by_exp.to_string(index=False))


--- Generating Visualizations ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=accuracy_by_exp, x='experiment', y='accuracy_%', palette='viridis')


Bar chart performa terbaik disimpan di: output_plots\best_experiment_ocr_performance.png
Heatmap keberhasilan disimpan di: output_plots\ocr_success_heatmap.png

--- OCR Evaluation Complete ---
Hasil detail disimpan ke ocr_accuracy_results.csv
Akurasi OCR Keseluruhan: 14.00%

Akurasi per eksperimen:
       experiment  accuracy_%
   exp1_sd15_2050   25.000000
  exp5_sdxl_a6000   16.666667
exp3_sd15_3050_v2   13.333333
exp4_sd15_3050_v3   10.000000
   exp2_sd15_3050    5.000000
