akhir code

In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# LLM Judge (GPT-4o / Model Lain)
evaluator_llm = evaluator

print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")
evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-4B",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True,
        'model_kwargs': {'quantization_config': bnb_config, 'device_map': 'auto'}
    },
    encode_kwargs={'normalize_embeddings': True, 'batch_size': 1},
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Inisialisasi Evaluator dengan model: openai/gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.



üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.7333, 'faithfulness': 0.7667, 'answer_relevancy': 0.4396, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final.xlsx


In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# LLM Judge (GPT-4o / Model Lain)
evaluator_llm = evaluator

print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")
evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-4B",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True,
        'model_kwargs': {'quantization_config': bnb_config, 'device_map': 'auto'}
    },
    encode_kwargs={'normalize_embeddings': True, 'batch_size': 1},
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Inisialisasi Evaluator dengan model: openai/gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Exception raised in Job[25]: TimeoutError()



üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.7333, 'faithfulness': 0.8519, 'answer_relevancy': 0.4668, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final.xlsx


In [3]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...
üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Exception raised in Job[33]: TimeoutError()



üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.6833, 'faithfulness': 0.7407, 'answer_relevancy': 0.6886, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx


In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Inisialisasi Evaluator dengan model: openai/gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.
Requested n=3 with low temperature (None). Overriding temperature to 0.5 to ensure diversity.



üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.7333, 'faithfulness': 0.7667, 'answer_relevancy': 0.4692, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx


qwen4b dengan dataset baik

In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# LLM Judge (GPT-4o / Model Lain)
evaluator_llm = evaluator

print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")
evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-4B",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True,
        'model_kwargs': {'quantization_config': bnb_config, 'device_map': 'auto'}
    },
    encode_kwargs={'normalize_embeddings': True, 'batch_size': 1},
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Inisialisasi Evaluator dengan model: openai/gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]


üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.8000, 'faithfulness': 0.8000, 'answer_relevancy': 0.7681, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final.xlsx


e5 dengan dataset baik

In [2]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

üßπ VRAM Cleaned. Usage: 0.01 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...
üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]


üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.8500, 'faithfulness': 0.8000, 'answer_relevancy': 0.9155, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx


qwen 0.6b dataset baik


In [3]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

üßπ VRAM Cleaned. Usage: 0.01 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...
üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]


üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.8500, 'faithfulness': 0.8000, 'answer_relevancy': 0.7532, 'context_recall': 0.8000}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx


In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Inisialisasi Evaluator dengan model: openai/gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...
üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]


üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.8889, 'faithfulness': 0.8889, 'answer_relevancy': 0.9067, 'context_recall': 0.8889}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx


uji dengan 130 QA

In [1]:
import os
import gc
import pandas as pd
import torch
from datasets import Dataset
from ast import literal_eval  # PENTING: Untuk mengubah string "['...']" kembali jadi list
from resources import evaluator
evaluator_llm = evaluator

# Library Ragas & Evaluasi
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
)
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig

# Import module custom Anda (hanya untuk evaluator judge)
from resources import evaluator

# =====================================================
# STEP 0: BERSIHKAN MEMORI (VRAM)
# =====================================================
try:
    del evaluator_embeddings
    del results
    del rag_dataset
except:
    pass

gc.collect()
torch.cuda.empty_cache()
print(f"üßπ VRAM Cleaned. Usage: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# =====================================================
# STEP 1: KONFIGURASI MODEL (JUDGE & EMBEDDING)
# =====================================================
print("‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...")

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={
        'device': 'cuda',   # Memaksa model berjalan di GPU
        # 'trust_remote_code': True # Biasanya tidak wajib untuk e5-base, tapi boleh dibiarkan jika error
    },
    encode_kwargs={
        'normalize_embeddings': True, # Wajib True untuk model E5 agar cosine similarity akurat
        'batch_size': 32 # Saya naikkan dari 1. Karena tanpa kuantisasi, model ini ringan & cepat.
    },
    multi_process=False 
)

# =====================================================
# STEP 2: LOAD DATASET DARI EXCEL (LANGSUNG EVALUASI)
# =====================================================
# Ganti nama file sesuai file excel Anda yang sudah lengkap/rapi
file_path = "dataset_ragas_lengkap.xlsx" 
print(f"üìÇ Membaca file dataset: {file_path}")

df = pd.read_excel(file_path)

# --- FUNGSI PREPROCESSING KRUSIAL ---
# Excel menyimpan list sebagai string, contoh: "['dokumen A', 'dokumen B']"
# Kita harus mengubahnya kembali menjadi list Python sungguhan: ['dokumen A', 'dokumen B']
def parse_contexts(x):
    if isinstance(x, list):
        return x  # Jika sudah list, biarkan
    try:
        # Mencoba mengubah string representasi list menjadi list asli
        return literal_eval(x)
    except:
        # Jika gagal (misal sel kosong atau format rusak), kembalikan list kosong
        return []

print("‚öôÔ∏è Melakukan konversi format data (String -> List)...")
# Terapkan fungsi ke kolom contexts
if 'contexts' in df.columns:
    df['contexts'] = df['contexts'].apply(parse_contexts)
else:
    raise ValueError("‚ùå Kolom 'contexts' tidak ditemukan di Excel!")

# Pastikan kolom ground_truth diperlakukan sebagai string (jika ada)
if 'ground_truth' in df.columns:
    df['ground_truth'] = df['ground_truth'].astype(str)

# Pastikan kolom answer diperlakukan sebagai string
df['answer'] = df['answer'].astype(str)

print("‚úÖ Data berhasil dimuat dan diproses.")

# =====================================================
# STEP 3: JALANKAN EVALUASI RAGAS
# =====================================================
# Konversi DataFrame Pandas ke Dataset RAGAs
rag_dataset = Dataset.from_pandas(df)

metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

print("\nüöÄ Memulai Evaluasi Ragas...")
# Jalankan Evaluasi
results = evaluate(
    dataset=rag_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
    raise_exceptions=False
)

# --- PRINT HASIL AKHIR ---
print("\n" + "="*30)
print("üìä HASIL AKHIR SKOR RAGAS")
print("="*30)
print(results)
print("="*30)

# =====================================================
# STEP 4: SAVE HASIL KE EXCEL BARU
# =====================================================
metrics_df = results.to_pandas()

# Reset index untuk penggabungan yang aman
df_reset = df.reset_index(drop=True)
metrics_reset = metrics_df.reset_index(drop=True)

# Gabungkan data asli dengan skor hasil evaluasi
final_df = pd.concat([df_reset, metrics_reset], axis=1)

# Hapus kolom duplikat jika ada
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Tentukan urutan kolom yang rapi untuk output
cols_target = ['question', 'answer', 'contexts', 'ground_truth', 
               'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']
final_cols = [c for c in cols_target if c in final_df.columns]
final_df = final_df[final_cols]

output_filename = "hasil_evaluasi_final_e5_base.xlsx"
final_df.to_excel(output_filename, index=False)
print(f"\nüíæ File hasil evaluasi disimpan: {output_filename}")

Koneksi ke Firebase Firestore berhasil.
Inisialisasi LLM dengan model: meta-llama/llama-4-scout
Menggunakan model evaluator UPATIK
Inisialisasi Evaluator dengan model: gpt-4o-mini
üßπ VRAM Cleaned. Usage: 0.00 GB
‚öôÔ∏è Loading Embedding Model (untuk metrik Ragas)...
üìÇ Membaca file dataset: dataset_ragas_lengkap.xlsx
‚öôÔ∏è Melakukan konversi format data (String -> List)...
‚úÖ Data berhasil dimuat dan diproses.

üöÄ Memulai Evaluasi Ragas...


Evaluating:   0%|          | 0/520 [00:00<?, ?it/s]


üìä HASIL AKHIR SKOR RAGAS
{'context_precision': 0.8833, 'faithfulness': 0.7517, 'answer_relevancy': 0.8989, 'context_recall': 0.9231}

üíæ File hasil evaluasi disimpan: hasil_evaluasi_final_e5_base.xlsx
