# Evaluasi Text Summarization dengan Gemma2 9B

## 📋 Deskripsi
Notebook ini berisi implementasi lengkap untuk evaluasi model text summarization menggunakan:
- **Dataset**: Berita bahasa Indonesia
- **Model**: Gemma 2 9B versi instruction-tuned (`google/gemma-2-9b-it`) dari Google
- **Metrik Evaluasi**: ROUGE, BLEU, dan BERTScore

## 🎯 Tujuan
Mengevaluasi performa model Gemma2 9B dalam melakukan text summarization pada dataset berita bahasa Indonesia menggunakan metrik evaluasi standar.

## 📚 Daftar Isi
1. [Setup dan Import Library](#1-setup-dan-import-library)
2. [Data Loader Module](#2-data-loader-module)
3. [Summarizer Module](#3-summarizer-module)
4. [Evaluator Module](#4-evaluator-module)
5. [Visualizer Module](#5-visualizer-module)
6. [Main Pipeline](#6-main-pipeline)
7. [Hasil dan Analisis](#7-hasil-dan-analisis)

## 1. Setup dan Import Library

In [None]:
# Install required packages (one shell escape per line so each command actually runs)
!pip install -q torch transformers accelerate bitsandbytes
!pip install -q rouge-score sacrebleu bert-score
!pip install -q pandas numpy matplotlib seaborn tqdm
!pip install -q datasets huggingface_hub

In [None]:
# Import necessary libraries
import json
import os
import re
import warnings
from typing import List, Dict, Any, Tuple

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sacrebleu import BLEU
from bert_score import score as bert_score_func

# Suppress warnings
warnings.filterwarnings('ignore')

# Set plot style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Check GPU availability and report the device that will be used
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

## 2. Data Loader Module

In [None]:
import json
import os
from typing import List, Dict, Any, Tuple

import pandas as pd
from tqdm import tqdm


class NewsDatasetLoader:
    """Load, preprocess and persist the news dataset stored as JSONL files."""

    def __init__(self, data_dir: str = "data"):
        """Create a loader.

        Args:
            data_dir: Directory that contains the dataset files.
        """
        self.data_dir = data_dir

    def load_jsonl_file(self, file_path: str) -> List[Dict[str, Any]]:
        """Read a JSONL file into a list of dictionaries.

        Blank (whitespace-only) lines are skipped.

        Args:
            file_path: Path to the JSONL file.

        Returns:
            One dictionary per non-blank line, in file order.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    @staticmethod
    def _train_file_sort_key(file_name: str) -> Tuple[int, int, str]:
        """Return a sort key that orders ``train.<N>.jsonl`` files by numeric N.

        Files whose middle part is not a plain number sort after the numbered
        ones, alphabetically by name.
        """
        shard = file_name[len('train.'):-len('.jsonl')]
        if shard.isdecimal():
            return (0, int(shard), file_name)
        return (1, 0, file_name)

    def load_all_train_files(self) -> List[Dict[str, Any]]:
        """Load every ``train.XX.jsonl`` file found in ``data_dir``.

        Returns:
            All records from all training files, concatenated in shard order.
        """
        # Sort numerically on the shard number: a plain string sort would put
        # "train.10.jsonl" before "train.2.jsonl" when names are not zero-padded.
        train_files = sorted(
            (
                name
                for name in os.listdir(self.data_dir)
                if name.startswith('train.') and name.endswith('.jsonl')
            ),
            key=self._train_file_sort_key,
        )

        print(f"Menemukan {len(train_files)} file training:")
        for file in train_files:
            print(f"  - {file}")

        all_data = []
        for file in tqdm(train_files, desc="Loading training files"):
            all_data.extend(self.load_jsonl_file(os.path.join(self.data_dir, file)))

        print(f"Total {len(all_data)} artikel berita dimuat")
        return all_data

    def preprocess_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flatten raw records into an easier-to-use format.

        Each output record keeps the metadata fields, adds ``text`` and
        ``summary`` as plain strings, and retains the original tokenised
        structures under ``paragraphs`` and ``summary_paragraphs``.

        Args:
            data: Raw records as loaded from JSONL.

        Returns:
            The preprocessed records, in input order.

        Raises:
            KeyError: If a record lacks one of the expected fields.
        """
        processed_data = []

        for item in tqdm(data, desc="Preprocessing data"):
            processed_data.append({
                'id': item['id'],
                'category': item['category'],
                'source': item['source'],
                'source_url': item['source_url'],
                # Article body and reference summary flattened to plain text
                'text': self._combine_paragraphs(item['paragraphs']),
                'summary': self._combine_paragraphs(item['summary']),
                'gold_labels': item['gold_labels'],
                'paragraphs': item['paragraphs'],
                'summary_paragraphs': item['summary'],
            })

        return processed_data

    def _combine_paragraphs(self, paragraphs: List[List[List[str]]]) -> str:
        """Join tokenised paragraphs into a single text.

        Tokens are joined with a space, sentences with a space, and paragraphs
        with a blank line; surrounding whitespace is stripped.

        Args:
            paragraphs: List of paragraphs, each a list of sentences, each a
                list of tokens.

        Returns:
            The combined text.
        """
        paragraph_texts = (
            " ".join(" ".join(sentence) for sentence in paragraph).strip()
            for paragraph in paragraphs
        )
        return "\n\n".join(paragraph_texts).strip()

    def get_dataframe(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        """Convert a list of record dictionaries into a pandas DataFrame.

        Args:
            data: List of dictionaries.

        Returns:
            A DataFrame built from ``data``.
        """
        return pd.DataFrame(data)

    def save_processed_data(self, data: List[Dict[str, Any]], output_path: str):
        """Write preprocessed records to a JSONL file (one JSON object per line).

        Args:
            data: Preprocessed records.
            output_path: Destination file path.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            for item in data:
                # ensure_ascii=False keeps non-ASCII characters readable on disk
                json.dump(item, f, ensure_ascii=False)
                f.write('\n')
        print(f"Data tersimpan di: {output_path}")

    def load_processed_data(self, file_path: str) -> List[Dict[str, Any]]:
        """Load records previously written by ``save_processed_data``.

        Args:
            file_path: Path to the preprocessed JSONL file.

        Returns:
            List of dictionaries.
        """
        return self.load_jsonl_file(file_path)

## 3. Summarizer Module

In [None]:
import re
from typing import List, Dict, Any, Optional

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


class GemmaSummarizer:
    """Summarize text with a causal language model (default: Gemma 2 9B instruct)."""

    # Maximum number of prompt tokens fed to the model.
    MAX_INPUT_TOKENS = 2048

    # Slack kept free when trimming the article, because tokenising
    # header + article + footer together can yield a few more tokens than the
    # sum of the parts.
    _TRUNCATION_MARGIN = 8

    # Indonesian summarization prompt, split around the article text.
    # NOTE(review): the line breaks around "Artikel:" / "Ringkasan:" were lost in
    # the flattened source and are reconstructed here - confirm against the
    # original notebook.
    _PROMPT_HEADER = (
        "Berikut adalah artikel berita dalam bahasa Indonesia. \n"
        "Buatlah ringkasan yang singkat dan informatif dalam bahasa Indonesia."
        "\n\nArtikel:\n"
    )
    _PROMPT_FOOTER = "\n\nRingkasan:"

    def __init__(self, model_name: str = "google/gemma-2-9b-it", device: Optional[str] = None):
        """Load the tokenizer and the model.

        Args:
            model_name: Name of the model to load.
            device: Inference device ("cuda"/"cpu"); auto-detected when None.
        """
        self.model_name = model_name

        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Menggunakan device: {self.device}")

        print("Memuat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("Memuat model...")
        on_gpu = self.device == "cuda"
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            device_map="auto" if on_gpu else None,
        )

        # Fall back to the EOS token when the tokenizer defines no padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Model berhasil dimuat!")

    def _build_prompt(self, text: str) -> str:
        """Wrap ``text`` in the prompt template, trimming the article if needed.

        Only the article is shortened so the trailing "Ringkasan:" cue always
        survives; truncating the finished prompt would cut the cue off for
        long articles.
        """
        overhead = len(self.tokenizer(self._PROMPT_HEADER + self._PROMPT_FOOTER)["input_ids"])
        budget = max(self.MAX_INPUT_TOKENS - overhead - self._TRUNCATION_MARGIN, 1)

        article_ids = self.tokenizer(text, add_special_tokens=False)["input_ids"]
        if len(article_ids) > budget:
            text = self.tokenizer.decode(article_ids[:budget], skip_special_tokens=True)

        return f"{self._PROMPT_HEADER}{text}{self._PROMPT_FOOTER}"

    def generate_summary(self, text: str, max_length: int = 512, temperature: float = 0.7) -> str:
        """Generate a summary for one input text.

        Args:
            text: Text to summarize.
            max_length: Maximum number of newly generated tokens.
            temperature: Sampling temperature; a value <= 0 switches to greedy
                decoding instead of sampling.

        Returns:
            The generated summary, stripped of surrounding whitespace.
        """
        prompt = self._build_prompt(text)

        # truncation stays on as a safety net in case re-tokenisation drifts
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.MAX_INPUT_TOKENS,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        prompt_token_count = inputs["input_ids"].shape[1]

        generation_kwargs = {
            "max_new_tokens": max_length,
            "pad_token_id": self.tokenizer.eos_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
        }
        if temperature is not None and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature)
        else:
            # Sampling is undefined for a non-positive temperature
            generation_kwargs["do_sample"] = False

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generation_kwargs)

        # The output sequence starts with the prompt tokens; drop them by token
        # count. Slicing the decoded string by len(prompt) is unreliable because
        # the decoded prompt need not match the original prompt string
        # character for character (truncation, tokenizer normalisation).
        new_tokens = outputs[0][prompt_token_count:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def batch_summarize(self, texts: List[str], max_length: int = 512, temperature: float = 0.7) -> List[str]:
        """Generate summaries for a list of texts, one at a time.

        A text whose generation raises is reported on stdout and yields an
        empty string, so the result always has one entry per input.

        Args:
            texts: Texts to summarize.
            max_length: Maximum number of newly generated tokens per summary.
            temperature: Sampling temperature.

        Returns:
            List of summaries aligned with ``texts``.
        """
        summaries = []

        for text in tqdm(texts, desc="Generating summaries"):
            try:
                summary = self.generate_summary(text, max_length, temperature)
            except Exception as e:  # best-effort: keep the batch going
                print(f"Error saat generate summary: {e}")
                summary = ""
            summaries.append(summary)

        return summaries

    def summarize_dataset(self, dataset: List[Dict[str, Any]], max_length: int = 512, temperature: float = 0.7) -> List[Dict[str, Any]]:
        """Generate a summary for every record of a dataset.

        Each record's ``'text'`` field is summarized. Input records are not
        modified; a shallow copy with an added ``'generated_summary'`` field is
        returned for each. A record that fails is reported on stdout and gets
        an empty summary.

        Args:
            dataset: Records containing the news text under ``'text'``.
            max_length: Maximum number of newly generated tokens per summary.
            temperature: Sampling temperature.

        Returns:
            The records with ``'generated_summary'`` added.
        """
        results = []

        for item in tqdm(dataset, desc="Processing dataset"):
            try:
                generated_summary = self.generate_summary(
                    item['text'],
                    max_length=max_length,
                    temperature=temperature,
                )
            except Exception as e:  # best-effort: keep processing the dataset
                print(f"Error processing item {item.get('id', 'unknown')}: {e}")
                generated_summary = ""

            result_item = item.copy()
            result_item['generated_summary'] = generated_summary
            results.append(result_item)

        return results

    def clean_summary(self, summary: str) -> str:
        """Remove unwanted characters from a summary.

        Keeps word characters, whitespace and the punctuation ``. , ! ? : ; - ( )``,
        collapses whitespace runs to a single space and strips the ends.

        Args:
            summary: Summary to clean.

        Returns:
            The cleaned summary.
        """
        summary = re.sub(r'[^\w\s\.\,\!\?\:\;\-\(\)]', '', summary)
        summary = re.sub(r'\s+', ' ', summary)
        return summary.strip()

## 4. Evaluator Module

In [None]:
import numpy as np
from typing import List, Dict, Any, Tuple
from rouge_score import rouge_scorer
from sacrebleu import BLEU
from bert_score import score as bert_score_func
import pandas as pd
from tqdm import tqdm


class SummarizationEvaluator:
    """Evaluate generated summaries with ROUGE, BLEU and BERTScore."""

    _ROUGE_METRICS = ('rouge1', 'rouge2', 'rougeL')

    _BERTSCORE_KEYS = (
        'bertscore_precision',
        'bertscore_recall',
        'bertscore_f1',
        'bertscore_precision_std',
        'bertscore_recall_std',
        'bertscore_f1_std',
    )

    def __init__(self, lang: str = "id"):
        """Create the evaluator.

        Args:
            lang: Language code passed to BERTScore (default "id", Indonesian).
        """
        self.lang = lang

        # NOTE(review): use_stemmer enables rouge_score's built-in stemmer -
        # confirm it is appropriate for Indonesian text.
        self.rouge_scorer = rouge_scorer.RougeScorer(list(self._ROUGE_METRICS), use_stemmer=True)

        print("Evaluator berhasil diinisialisasi!")

    @staticmethod
    def _mean_std(values: List[float]) -> Tuple[float, float]:
        """Return (mean, population std) as plain floats; (0.0, 0.0) if empty."""
        if not values:
            return 0.0, 0.0
        return float(np.mean(values)), float(np.std(values))

    @staticmethod
    def _non_empty_pairs(references: List[str], predictions: List[str]) -> List[Tuple[str, str]]:
        """Return (reference, prediction) pairs whose prediction is not blank."""
        return [(ref, pred) for ref, pred in zip(references, predictions) if pred.strip()]

    @classmethod
    def _zero_bertscore(cls) -> Dict[str, float]:
        """Return a fresh all-zero BERTScore result with every expected key."""
        return {key: 0.0 for key in cls._BERTSCORE_KEYS}

    def _rouge_f1(self, ref: str, pred: str) -> Dict[str, float]:
        """Return ROUGE F-measures for one pair; all zeros for a blank prediction."""
        if not pred.strip():
            return {metric: 0.0 for metric in self._ROUGE_METRICS}
        scores = self.rouge_scorer.score(ref, pred)
        return {metric: scores[metric].fmeasure for metric in self._ROUGE_METRICS}

    def calculate_rouge_scores(self, references: List[str], predictions: List[str]) -> Dict[str, float]:
        """Compute corpus-average ROUGE-1/2/L F-measures.

        Blank predictions count as 0.0 for every metric. Empty input yields
        0.0 for every value instead of NaN.

        Args:
            references: Reference (ground-truth) summaries.
            predictions: Predicted summaries.

        Returns:
            Mean under ``rouge1``/``rouge2``/``rougeL`` and population standard
            deviation under the matching ``*_std`` keys.
        """
        per_item = [
            self._rouge_f1(ref, pred)
            for ref, pred in tqdm(
                zip(references, predictions),
                desc="Calculating ROUGE scores",
                total=len(references),
            )
        ]

        means: Dict[str, float] = {}
        stds: Dict[str, float] = {}
        for metric in self._ROUGE_METRICS:
            means[metric], stds[f'{metric}_std'] = self._mean_std(
                [scores[metric] for scores in per_item]
            )

        # Means first, then standard deviations (same key order as before)
        return {**means, **stds}

    def calculate_bleu_score(self, references: List[str], predictions: List[str]) -> Dict[str, Any]:
        """Compute corpus-level BLEU with sacreBLEU.

        Pairs with a blank prediction are excluded. The score is the value
        sacreBLEU reports (``result.score``).

        Args:
            references: Reference summaries.
            predictions: Predicted summaries.

        Returns:
            ``{'bleu': score, 'bleu_details': {...}}``, or ``{'bleu': 0.0}``
            when no non-blank prediction exists.
        """
        valid_pairs = self._non_empty_pairs(references, predictions)

        if not valid_pairs:
            return {'bleu': 0.0}

        refs, preds = zip(*valid_pairs)

        # sacreBLEU expects a list of reference *streams*, each aligned with the
        # hypotheses; with one reference per hypothesis that is a single stream.
        result = BLEU().corpus_score(list(preds), [list(refs)])

        return {
            'bleu': result.score,
            'bleu_details': {
                'precisions': result.precisions,
                'bp': result.bp,
                'sys_len': result.sys_len,
                'ref_len': result.ref_len,
            },
        }

    def calculate_bertscore(self, references: List[str], predictions: List[str]) -> Dict[str, float]:
        """Compute mean BERTScore precision, recall and F1.

        Pairs with a blank prediction are excluded. If nothing can be scored,
        or BERTScore raises, an all-zero result with the full key set is
        returned so downstream lookups never fail.

        Args:
            references: Reference summaries.
            predictions: Predicted summaries.

        Returns:
            Means under ``bertscore_precision``/``bertscore_recall``/
            ``bertscore_f1`` and population standard deviations under the
            matching ``*_std`` keys.
        """
        valid_pairs = self._non_empty_pairs(references, predictions)

        if not valid_pairs:
            return self._zero_bertscore()

        refs, preds = zip(*valid_pairs)

        try:
            P, R, F1 = bert_score_func(
                list(preds),
                list(refs),
                lang=self.lang,
                verbose=True,
                batch_size=16,
            )

            # unbiased=False: population std, matching np.std used for ROUGE and
            # avoiding NaN when only one pair was scored.
            return {
                'bertscore_precision': P.mean().item(),
                'bertscore_recall': R.mean().item(),
                'bertscore_f1': F1.mean().item(),
                'bertscore_precision_std': P.std(unbiased=False).item(),
                'bertscore_recall_std': R.std(unbiased=False).item(),
                'bertscore_f1_std': F1.std(unbiased=False).item(),
            }
        except Exception as e:  # best-effort: report and fall back to zeros
            print(f"Error calculating BERTScore: {e}")
            return self._zero_bertscore()

    def evaluate_summaries(self, references: List[str], predictions: List[str]) -> Dict[str, Any]:
        """Run every metric and collect the results.

        Args:
            references: Reference summaries.
            predictions: Predicted summaries.

        Returns:
            Dictionary with the per-metric results under ``rouge``, ``bleu``
            and ``bertscore``, plus the headline values under ``summary``.
        """
        print("Memulai evaluasi summarization...")

        print("Menghitung skor ROUGE...")
        rouge_scores = self.calculate_rouge_scores(references, predictions)

        print("Menghitung skor BLEU...")
        bleu_scores = self.calculate_bleu_score(references, predictions)

        print("Menghitung skor BERTScore...")
        bert_scores = self.calculate_bertscore(references, predictions)

        return {
            'rouge': rouge_scores,
            'bleu': bleu_scores,
            'bertscore': bert_scores,
            'summary': {
                'rouge1': rouge_scores['rouge1'],
                'rouge2': rouge_scores['rouge2'],
                'rougeL': rouge_scores['rougeL'],
                'bleu': bleu_scores['bleu'],
                'bertscore_f1': bert_scores['bertscore_f1'],
            },
        }

    def evaluate_dataset(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Evaluate a dataset that already contains generated summaries.

        Args:
            dataset: Records with ``'summary'`` and ``'generated_summary'`` fields.

        Returns:
            The result of ``evaluate_summaries`` for those fields.
        """
        references = [item['summary'] for item in dataset]
        predictions = [item['generated_summary'] for item in dataset]

        return self.evaluate_summaries(references, predictions)

    def print_results(self, results: Dict[str, Any]):
        """Pretty-print evaluation results.

        Args:
            results: Result dictionary produced by ``evaluate_summaries``.
        """
        rouge = results['rouge']
        bert = results['bertscore']

        print("\n" + "="*50)
        print("HASIL EVALUASI SUMMARIZATION")
        print("="*50)

        print("\nROUGE Scores:")
        print(f"  ROUGE-1: {rouge['rouge1']:.4f} ± {rouge['rouge1_std']:.4f}")
        print(f"  ROUGE-2: {rouge['rouge2']:.4f} ± {rouge['rouge2_std']:.4f}")
        print(f"  ROUGE-L: {rouge['rougeL']:.4f} ± {rouge['rougeL_std']:.4f}")

        print("\nBLEU Score:")
        print(f"  BLEU: {results['bleu']['bleu']:.4f}")

        print("\nBERTScore:")
        print(f"  Precision: {bert['bertscore_precision']:.4f} ± {bert['bertscore_precision_std']:.4f}")
        print(f"  Recall: {bert['bertscore_recall']:.4f} ± {bert['bertscore_recall_std']:.4f}")
        print(f"  F1: {bert['bertscore_f1']:.4f} ± {bert['bertscore_f1_std']:.4f}")

        print("\n" + "="*50)

    def save_results(self, results: Dict[str, Any], output_path: str):
        """Write evaluation results to a JSON file.

        Args:
            results: Evaluation results.
            output_path: Destination file path.
        """
        import json

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f"Hasil evaluasi tersimpan di: {output_path}")

    def create_evaluation_dataframe(self, dataset: List[Dict[str, Any]]) -> pd.DataFrame:
        """Build a per-item DataFrame for detailed analysis.

        Args:
            dataset: Records with generated summaries.

        Returns:
            DataFrame with id, category, source, whitespace-token lengths of
            reference and prediction, and per-item ROUGE F-measures.
        """
        evaluation_data = []

        for item in dataset:
            ref = item['summary']
            pred = item['generated_summary']
            rouge_scores = self._rouge_f1(ref, pred)

            evaluation_data.append({
                'id': item['id'],
                'category': item['category'],
                'source': item['source'],
                'reference_length': len(ref.split()),
                'prediction_length': len(pred.split()),
                'rouge1': rouge_scores['rouge1'],
                'rouge2': rouge_scores['rouge2'],
                'rougeL': rouge_scores['rougeL'],
            })

        return pd.DataFrame(evaluation_data)

## 5. Visualizer Module

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import warnings

warnings.filterwarnings('ignore')

# Set the plotting style used by every figure in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


class SummarizationVisualizer:
    """Draw matplotlib/seaborn figures for summarization evaluation results.

    Every public ``plot_*`` / ``create_*`` method builds one figure, optionally
    saves it to ``save_path`` and then shows it. All of them return ``None``.
    """

    def __init__(self, figsize: tuple = (12, 8)):
        """Store the default figure size and set global font sizes.

        Args:
            figsize: Default figure size. It is stored on the instance; the
                plotting methods below currently use their own fixed sizes.
        """
        self.figsize = figsize
        plt.rcParams['font.size'] = 10
        plt.rcParams['axes.titlesize'] = 12
        plt.rcParams['axes.labelsize'] = 10

    @staticmethod
    def _finish_figure(save_path: Optional[str], saved_label: str = 'Plot') -> None:
        """Apply tight layout, optionally save the current figure, then show it."""
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"{saved_label} tersimpan di: {save_path}")

        plt.show()

    @staticmethod
    def _add_trend_line(ax, x, y) -> None:
        """Overlay a dashed least-squares line of ``y`` against ``x`` on ``ax``.

        Non-finite pairs are ignored. Nothing is drawn when fewer than two
        usable points remain or when all ``x`` values are equal, because a
        degree-1 fit is undefined (or raises) in those cases.
        """
        xs = np.asarray(x, dtype=float)
        ys = np.asarray(y, dtype=float)
        usable = np.isfinite(xs) & np.isfinite(ys)
        xs, ys = xs[usable], ys[usable]

        if xs.size < 2 or xs.min() == xs.max():
            return

        trend = np.poly1d(np.polyfit(xs, ys, 1))
        # A straight line only needs its two end points.
        span = np.array([xs.min(), xs.max()])
        ax.plot(span, trend(span), "r--", alpha=0.8)

    def plot_metrics_comparison(self, results: Dict[str, Any], save_path: Optional[str] = None):
        """Plot ROUGE, BLEU, BERTScore and a combined summary in a 2x2 grid.

        Args:
            results: Evaluation results; the keys read here are ``rouge``,
                ``bleu``, ``bertscore`` and ``summary``.
            save_path: Where to save the figure; skipped when falsy.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Perbandingan Metrik Evaluasi Summarization', fontsize=16, fontweight='bold')

        # ROUGE scores with their standard deviations as error bars
        rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
        rouge_values = [results['rouge'][metric] for metric in rouge_metrics]
        rouge_stds = [results['rouge'][f'{metric}_std'] for metric in rouge_metrics]

        axes[0, 0].bar(rouge_metrics, rouge_values, yerr=rouge_stds, capsize=5, alpha=0.7)
        axes[0, 0].set_title('ROUGE Scores')
        axes[0, 0].set_ylabel('Score')
        axes[0, 0].set_ylim(0, 1)
        for i, v in enumerate(rouge_values):
            axes[0, 0].text(i, v + rouge_stds[i] + 0.01, f'{v:.3f}', ha='center', va='bottom')

        # BLEU score (axis scaled to the value itself, with a small floor)
        bleu_value = results['bleu']['bleu']
        axes[0, 1].bar(['BLEU'], [bleu_value], alpha=0.7, color='orange')
        axes[0, 1].set_title('BLEU Score')
        axes[0, 1].set_ylabel('Score')
        axes[0, 1].set_ylim(0, max(bleu_value * 1.2, 0.1))
        axes[0, 1].text(0, bleu_value + 0.001, f'{bleu_value:.3f}', ha='center', va='bottom')

        # BERTScore precision / recall / F1
        bert_metrics = ['bertscore_precision', 'bertscore_recall', 'bertscore_f1']
        bert_values = [results['bertscore'][metric] for metric in bert_metrics]
        bert_stds = [results['bertscore'][f'{metric}_std'] for metric in bert_metrics]

        axes[1, 0].bar(bert_metrics, bert_values, yerr=bert_stds, capsize=5, alpha=0.7, color='green')
        axes[1, 0].set_title('BERTScore')
        axes[1, 0].set_ylabel('Score')
        axes[1, 0].set_ylim(0, 1)
        for i, v in enumerate(bert_values):
            axes[1, 0].text(i, v + bert_stds[i] + 0.01, f'{v:.3f}', ha='center', va='bottom')

        # Headline metrics side by side.
        # NOTE(review): this axis is fixed to [0, 1]; confirm the evaluator
        # reports BLEU on that scale, otherwise the BLEU bar is clipped.
        summary_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU', 'BERTScore-F1']
        summary_values = [
            results['summary']['rouge1'],
            results['summary']['rouge2'],
            results['summary']['rougeL'],
            results['summary']['bleu'],
            results['summary']['bertscore_f1'],
        ]

        axes[1, 1].bar(summary_metrics, summary_values, alpha=0.7, color='purple')
        axes[1, 1].set_title('Ringkasan Semua Metrik')
        axes[1, 1].set_ylabel('Score')
        axes[1, 1].set_ylim(0, 1)
        axes[1, 1].tick_params(axis='x', rotation=45)
        for i, v in enumerate(summary_values):
            axes[1, 1].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

        self._finish_figure(save_path)

    def plot_category_analysis(self, evaluation_df: pd.DataFrame, save_path: Optional[str] = None):
        """Plot per-category ROUGE means, lengths and ROUGE-1 distributions.

        Args:
            evaluation_df: Per-item scores; columns read here are ``category``,
                ``rouge1``, ``rouge2``, ``rougeL``, ``reference_length`` and
                ``prediction_length``.
            save_path: Where to save the figure; skipped when falsy.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Analisis Hasil Berdasarkan Kategori Berita', fontsize=16, fontweight='bold')

        # Mean ROUGE scores per category
        category_rouge = evaluation_df.groupby('category')[['rouge1', 'rouge2', 'rougeL']].mean()

        category_rouge.plot(kind='bar', ax=axes[0, 0], alpha=0.7)
        axes[0, 0].set_title('ROUGE Scores per Kategori')
        axes[0, 0].set_ylabel('Score')
        axes[0, 0].legend()
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Mean reference vs. prediction length per category
        length_comparison = evaluation_df.groupby('category')[['reference_length', 'prediction_length']].mean()

        length_comparison.plot(kind='bar', ax=axes[0, 1], alpha=0.7)
        axes[0, 1].set_title('Panjang Summary per Kategori')
        axes[0, 1].set_ylabel('Jumlah Kata')
        axes[0, 1].legend(['Reference', 'Prediction'])
        axes[0, 1].tick_params(axis='x', rotation=45)

        # Distribution of ROUGE-1 over all items, with the mean marked
        rouge1_mean = evaluation_df['rouge1'].mean()
        axes[1, 0].hist(evaluation_df['rouge1'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        axes[1, 0].set_title('Distribusi ROUGE-1 Scores')
        axes[1, 0].set_xlabel('ROUGE-1 Score')
        axes[1, 0].set_ylabel('Frekuensi')
        axes[1, 0].axvline(rouge1_mean, color='red', linestyle='--',
                           label=f'Mean: {rouge1_mean:.3f}')
        axes[1, 0].legend()

        # Box plot of ROUGE-1 per category, drawn straight from the frame
        sns.boxplot(data=evaluation_df, x='category', y='rouge1', ax=axes[1, 1])
        axes[1, 1].set_title('Box Plot ROUGE-1 per Kategori')
        axes[1, 1].set_ylabel('ROUGE-1 Score')
        axes[1, 1].tick_params(axis='x', rotation=45)

        self._finish_figure(save_path)

    def plot_source_analysis(self, evaluation_df: pd.DataFrame, save_path: Optional[str] = None):
        """Plot per-source ROUGE means, article share, lengths and correlations.

        Args:
            evaluation_df: Per-item scores; columns read here are ``source``,
                ``rouge1``, ``rouge2``, ``rougeL``, ``reference_length`` and
                ``prediction_length``.
            save_path: Where to save the figure; skipped when falsy.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Analisis Hasil Berdasarkan Sumber Berita', fontsize=16, fontweight='bold')

        # Mean ROUGE scores per source
        source_rouge = evaluation_df.groupby('source')[['rouge1', 'rouge2', 'rougeL']].mean()

        source_rouge.plot(kind='bar', ax=axes[0, 0], alpha=0.7)
        axes[0, 0].set_title('ROUGE Scores per Sumber')
        axes[0, 0].set_ylabel('Score')
        axes[0, 0].legend()
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Share of articles per source
        source_counts = evaluation_df['source'].value_counts()
        axes[0, 1].pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%', startangle=90)
        axes[0, 1].set_title('Distribusi Artikel per Sumber')

        # Mean reference vs. prediction length per source
        source_length = evaluation_df.groupby('source')[['reference_length', 'prediction_length']].mean()

        source_length.plot(kind='bar', ax=axes[1, 0], alpha=0.7)
        axes[1, 0].set_title('Panjang Summary per Sumber')
        axes[1, 0].set_ylabel('Jumlah Kata')
        axes[1, 0].legend(['Reference', 'Prediction'])
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Correlation heatmap between ROUGE scores and lengths
        metrics_corr = evaluation_df[
            ['rouge1', 'rouge2', 'rougeL', 'reference_length', 'prediction_length']
        ].corr()
        sns.heatmap(metrics_corr, annot=True, cmap='coolwarm', center=0, ax=axes[1, 1])
        axes[1, 1].set_title('Korelasi Antar Metrik')

        self._finish_figure(save_path)

    def plot_length_analysis(self, evaluation_df: pd.DataFrame, save_path: Optional[str] = None):
        """Plot how ROUGE-1 relates to reference/prediction length and their ratio.

        Items whose reference length is 0 have no defined length ratio; they
        are left out of the two ratio panels instead of breaking the fit and
        the histogram with inf/NaN values.

        Args:
            evaluation_df: Per-item scores; columns read here are ``rouge1``,
                ``reference_length`` and ``prediction_length``.
            save_path: Where to save the figure; skipped when falsy.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Analisis Hubungan Panjang Teks dengan Performa', fontsize=16, fontweight='bold')

        rouge1 = evaluation_df['rouge1']

        # ROUGE-1 vs reference length
        axes[0, 0].scatter(evaluation_df['reference_length'], rouge1, alpha=0.6)
        axes[0, 0].set_xlabel('Panjang Reference (kata)')
        axes[0, 0].set_ylabel('ROUGE-1 Score')
        axes[0, 0].set_title('ROUGE-1 vs Panjang Reference')
        self._add_trend_line(axes[0, 0], evaluation_df['reference_length'], rouge1)

        # ROUGE-1 vs prediction length
        axes[0, 1].scatter(evaluation_df['prediction_length'], rouge1, alpha=0.6, color='orange')
        axes[0, 1].set_xlabel('Panjang Prediction (kata)')
        axes[0, 1].set_ylabel('ROUGE-1 Score')
        axes[0, 1].set_title('ROUGE-1 vs Panjang Prediction')
        self._add_trend_line(axes[0, 1], evaluation_df['prediction_length'], rouge1)

        # Length ratio; division by a zero reference length yields inf or NaN,
        # so keep only the rows with a finite ratio.
        raw_ratio = evaluation_df['prediction_length'] / evaluation_df['reference_length']
        raw_ratio = raw_ratio.replace([np.inf, -np.inf], np.nan)
        has_ratio = raw_ratio.notna()
        length_ratio = raw_ratio[has_ratio]
        ratio_scores = rouge1[has_ratio]

        axes[1, 0].scatter(length_ratio, ratio_scores, alpha=0.6, color='green')
        axes[1, 0].set_xlabel('Rasio Panjang (Prediction/Reference)')
        axes[1, 0].set_ylabel('ROUGE-1 Score')
        axes[1, 0].set_title('ROUGE-1 vs Rasio Panjang')
        self._add_trend_line(axes[1, 0], length_ratio, ratio_scores)

        # Distribution of the length ratios
        axes[1, 1].hist(length_ratio, bins=20, alpha=0.7, color='purple', edgecolor='black')
        axes[1, 1].set_xlabel('Rasio Panjang (Prediction/Reference)')
        axes[1, 1].set_ylabel('Frekuensi')
        axes[1, 1].set_title('Distribusi Rasio Panjang')
        if not length_ratio.empty:
            axes[1, 1].axvline(length_ratio.mean(), color='red', linestyle='--',
                               label=f'Mean: {length_ratio.mean():.2f}')
            axes[1, 1].legend()

        self._finish_figure(save_path)

    def create_summary_report(self, results: Dict[str, Any], evaluation_df: pd.DataFrame,
                              save_path: Optional[str] = None):
        """Build a 3x3 overview figure combining metrics and dataset statistics.

        Args:
            results: Evaluation results; only ``results['summary']`` is read.
            evaluation_df: Per-item scores; columns read here are ``category``,
                ``source``, ``rouge1``, ``rouge2``, ``rougeL``,
                ``reference_length`` and ``prediction_length``.
            save_path: Where to save the figure; skipped when falsy.
        """
        fig, axes = plt.subplots(3, 3, figsize=(20, 16))
        fig.suptitle('Laporan Lengkap Evaluasi Summarization', fontsize=18, fontweight='bold')

        # 1. Overall metrics
        metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU', 'BERTScore-F1']
        values = [
            results['summary']['rouge1'],
            results['summary']['rouge2'],
            results['summary']['rougeL'],
            results['summary']['bleu'],
            results['summary']['bertscore_f1'],
        ]

        bars = axes[0, 0].bar(metrics, values, alpha=0.7, color=['blue', 'green', 'red', 'orange', 'purple'])
        axes[0, 0].set_title('Skor Metrik Keseluruhan')
        axes[0, 0].set_ylabel('Score')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Value label centred above each bar
        for bar, value in zip(bars, values):
            axes[0, 0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                            f'{value:.3f}', ha='center', va='bottom')

        # 2. Category performance (best first)
        category_perf = evaluation_df.groupby('category')['rouge1'].mean().sort_values(ascending=False)
        category_perf.plot(kind='bar', ax=axes[0, 1], alpha=0.7)
        axes[0, 1].set_title('ROUGE-1 per Kategori')
        axes[0, 1].set_ylabel('ROUGE-1 Score')
        axes[0, 1].tick_params(axis='x', rotation=45)

        # 3. Source performance (best first)
        source_perf = evaluation_df.groupby('source')['rouge1'].mean().sort_values(ascending=False)
        source_perf.plot(kind='bar', ax=axes[0, 2], alpha=0.7)
        axes[0, 2].set_title('ROUGE-1 per Sumber')
        axes[0, 2].set_ylabel('ROUGE-1 Score')
        axes[0, 2].tick_params(axis='x', rotation=45)

        # 4. Length distribution
        axes[1, 0].hist(evaluation_df['reference_length'], bins=20, alpha=0.7, label='Reference', color='blue')
        axes[1, 0].hist(evaluation_df['prediction_length'], bins=20, alpha=0.7, label='Prediction', color='orange')
        axes[1, 0].set_title('Distribusi Panjang Summary')
        axes[1, 0].set_xlabel('Jumlah Kata')
        axes[1, 0].set_ylabel('Frekuensi')
        axes[1, 0].legend()

        # 5. ROUGE score distributions
        axes[1, 1].hist(evaluation_df['rouge1'], bins=20, alpha=0.7, label='ROUGE-1', color='blue')
        axes[1, 1].hist(evaluation_df['rouge2'], bins=20, alpha=0.7, label='ROUGE-2', color='green')
        axes[1, 1].hist(evaluation_df['rougeL'], bins=20, alpha=0.7, label='ROUGE-L', color='red')
        axes[1, 1].set_title('Distribusi ROUGE Scores')
        axes[1, 1].set_xlabel('Score')
        axes[1, 1].set_ylabel('Frekuensi')
        axes[1, 1].legend()

        # 6. Length vs performance scatter
        axes[1, 2].scatter(evaluation_df['reference_length'], evaluation_df['rouge1'], alpha=0.6)
        axes[1, 2].set_xlabel('Panjang Reference')
        axes[1, 2].set_ylabel('ROUGE-1 Score')
        axes[1, 2].set_title('Panjang vs Performa')

        # 7. Category counts
        category_counts = evaluation_df['category'].value_counts()
        axes[2, 0].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%', startangle=90)
        axes[2, 0].set_title('Distribusi Kategori')

        # 8. Source counts
        source_counts = evaluation_df['source'].value_counts()
        axes[2, 1].pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%', startangle=90)
        axes[2, 1].set_title('Distribusi Sumber')

        # 9. Statistics rendered as monospace text in an otherwise empty axes
        axes[2, 2].axis('off')
        stats_text = f"""
        STATISTIK DATASET:

        Total Artikel: {len(evaluation_df)}
        Kategori: {evaluation_df['category'].nunique()}
        Sumber: {evaluation_df['source'].nunique()}

        RATA-RATA:
        ROUGE-1: {evaluation_df['rouge1'].mean():.3f}
        ROUGE-2: {evaluation_df['rouge2'].mean():.3f}
        ROUGE-L: {evaluation_df['rougeL'].mean():.3f}

        Panjang Reference: {evaluation_df['reference_length'].mean():.1f} kata
        Panjang Prediction: {evaluation_df['prediction_length'].mean():.1f} kata
        """
        axes[2, 2].text(0.1, 0.9, stats_text, transform=axes[2, 2].transAxes,
                        fontsize=10, verticalalignment='top', fontfamily='monospace')

        self._finish_figure(save_path, saved_label='Laporan')

## 6. Main Pipeline

### 6.1 Configuration

In [None]:
# Pipeline settings shared by the cells below
CONFIG = dict(
    data_dir='data',  # data directory
    model_name='google/gemma-2-2b-it',  # model to use (can be swapped for gemma-2-9b-it)
    max_samples=100,  # maximum number of samples to evaluate (None means all)
    output_dir='outputs',  # output directory
    batch_size=16,  # batch size for inference
    max_summary_length=256,  # maximum summary length
    temperature=0.7,  # generation temperature
    save_intermediate=True,  # whether to save intermediate results
)

# Make sure the output directory exists before anything is written to it
os.makedirs(CONFIG['output_dir'], exist_ok=True)

### 6.2 Load and Prepare Data

In [None]:
# Build the loader and decide whether a preprocessed copy can be reused
data_loader = NewsDatasetLoader(CONFIG['data_dir'])
processed_data_path = os.path.join(CONFIG['output_dir'], 'processed_data.jsonl')

if not os.path.exists(processed_data_path):
    # No cached copy: read the raw training files and preprocess them
    print("Loading raw data...")
    raw_data = data_loader.load_all_train_files()

    print("Preprocessing data...")
    dataset = data_loader.preprocess_data(raw_data)

    # Keep the preprocessed data for the next run when requested
    if CONFIG['save_intermediate']:
        data_loader.save_processed_data(dataset, processed_data_path)
else:
    print("Loading processed data...")
    dataset = data_loader.load_processed_data(processed_data_path)

# Truncate to the configured sample cap, if any
if CONFIG['max_samples'] is not None:
    dataset = dataset[:CONFIG['max_samples']]
    print(f"Using {len(dataset)} samples for evaluation")

# Summarise the dataset that will be evaluated
df = data_loader.get_dataframe(dataset)
print("\nDataset Statistics:")
print(f"Total articles: {len(df)}")
print(f"Categories: {df['category'].nunique()}")
print(f"Sources: {df['source'].nunique()}")
for heading, column in (("Category", 'category'), ("Source", 'source')):
    print(f"\n{heading} distribution:")
    print(df[column].value_counts())

### 6.3 Generate Summaries

In [None]:
# Reuse previously generated summaries when they are already on disk
summaries_path = os.path.join(CONFIG['output_dir'], 'dataset_with_summaries.jsonl')

if not os.path.exists(summaries_path):
    print("Generating summaries...")

    # Load the model and summarise every item in the dataset
    summarizer = GemmaSummarizer(model_name=CONFIG['model_name'])
    dataset_with_summaries = summarizer.summarize_dataset(
        dataset,
        max_length=CONFIG['max_summary_length'],
        temperature=CONFIG['temperature'],
    )

    # Persist the generated summaries when requested
    if CONFIG['save_intermediate']:
        data_loader.save_processed_data(dataset_with_summaries, summaries_path)
else:
    print("Loading existing summaries...")
    dataset_with_summaries = data_loader.load_processed_data(summaries_path)

# Show up to three samples for a quick sanity check
banner = "=" * 50
print("\n" + banner)
print("SAMPLE RESULTS")
print(banner)

preview_count = min(3, len(dataset_with_summaries))
for position in range(preview_count):
    item = dataset_with_summaries[position]
    print(f"\n--- Sample {position + 1} ---")
    print(f"ID: {item['id']}")
    print(f"Category: {item['category']}")
    print(f"Source: {item['source']}")
    print("\nOriginal Text (first 200 chars):")
    print(item['text'][:200] + "...")
    print("\nReference Summary:")
    print(item['summary'])
    print("\nGenerated Summary:")
    print(item['generated_summary'])

### 6.4 Evaluate Results

In [None]:
# Score the generated summaries against the references
evaluator = SummarizationEvaluator(lang="id")

print("\nEvaluating summaries...")
evaluation_results = evaluator.evaluate_dataset(dataset_with_summaries)
evaluator.print_results(evaluation_results)

# Persist the aggregate results
results_json_path = os.path.join(CONFIG['output_dir'], 'evaluation_results.json')
evaluator.save_results(evaluation_results, results_json_path)

# Per-item scores, written out as CSV
evaluation_df = evaluator.create_evaluation_dataframe(dataset_with_summaries)
details_csv_path = os.path.join(CONFIG['output_dir'], 'evaluation_details.csv')
evaluation_df.to_csv(details_csv_path, index=False)
print(f"\nEvaluation details saved to {CONFIG['output_dir']}/evaluation_details.csv")

### 6.5 Visualize Results

In [None]:
# Create the visualizer used by the plotting cells
visualizer = SummarizationVisualizer()

# Metric comparison figure
print("\nGenerating visualizations...")
metrics_plot_path = os.path.join(CONFIG['output_dir'], 'metrics_comparison.png')
visualizer.plot_metrics_comparison(evaluation_results, save_path=metrics_plot_path)

In [None]:
# Per-category analysis figure
category_plot_path = os.path.join(CONFIG['output_dir'], 'category_analysis.png')
visualizer.plot_category_analysis(evaluation_df, save_path=category_plot_path)

In [None]:
# Per-source analysis figure
source_plot_path = os.path.join(CONFIG['output_dir'], 'source_analysis.png')
visualizer.plot_source_analysis(evaluation_df, save_path=source_plot_path)

In [None]:
# Length-versus-performance figure
length_plot_path = os.path.join(CONFIG['output_dir'], 'length_analysis.png')
visualizer.plot_length_analysis(evaluation_df, save_path=length_plot_path)

In [None]:
# Combined summary report figure
report_path = os.path.join(CONFIG['output_dir'], 'summary_report.png')
visualizer.create_summary_report(evaluation_results, evaluation_df, save_path=report_path)

## 7. Hasil dan Analisis

### 7.1 Summary Statistics

In [None]:
# Print a detailed, sectioned statistics report
title_rule = "=" * 60
section_rule = "-" * 30
rouge_columns = ['rouge1', 'rouge2', 'rougeL']
ranking_columns = ['id', 'category', 'source', 'rouge1']

print("\n" + title_rule)
print("DETAILED EVALUATION STATISTICS")
print(title_rule)

# Headline metrics
print("\n1. OVERALL PERFORMANCE:")
print(section_rule)
for metric, value in evaluation_results['summary'].items():
    print(f"{metric:15s}: {value:.4f}")

# Mean and standard deviation of ROUGE per category
print("\n2. PERFORMANCE BY CATEGORY:")
print(section_rule)
category_stats = evaluation_df.groupby('category')[rouge_columns].agg(['mean', 'std'])
print(category_stats)

# Mean and standard deviation of ROUGE per source
print("\n3. PERFORMANCE BY SOURCE:")
print(section_rule)
source_stats = evaluation_df.groupby('source')[rouge_columns].agg(['mean', 'std'])
print(source_stats)

# Summary lengths in words, and prediction/reference ratio
print("\n4. LENGTH STATISTICS:")
print(section_rule)
mean_reference_length = evaluation_df['reference_length'].mean()
mean_prediction_length = evaluation_df['prediction_length'].mean()
mean_length_ratio = (evaluation_df['prediction_length'] / evaluation_df['reference_length']).mean()
print(f"Average reference length: {mean_reference_length:.1f} words")
print(f"Average prediction length: {mean_prediction_length:.1f} words")
print(f"Average compression ratio: {mean_length_ratio:.2f}")

# Five highest- and lowest-scoring articles by ROUGE-1
print("\n5. BEST PERFORMING ARTICLES (by ROUGE-1):")
print(section_rule)
best_articles = evaluation_df.nlargest(5, 'rouge1')[ranking_columns]
print(best_articles)

print("\n6. WORST PERFORMING ARTICLES (by ROUGE-1):")
print(section_rule)
worst_articles = evaluation_df.nsmallest(5, 'rouge1')[ranking_columns]
print(worst_articles)

### 7.2 Export Results

In [None]:
def _stats_to_jsonable(stats):
    """Convert a grouped ``agg(['mean', 'std'])`` frame into JSON-safe dicts.

    The aggregated frame has two-level columns such as ``('rouge1', 'mean')``.
    ``DataFrame.to_dict()`` would turn those into tuple keys, which
    ``json.dump`` rejects with ``TypeError``. The column levels are therefore
    joined into plain strings (``'rouge1_mean'``). Missing values (e.g. the
    std of a group with a single item) become ``None`` so the file holds
    ``null`` instead of the non-standard ``NaN`` token.

    Returns:
        ``{group: {"<metric>_<stat>": float | None}}``
    """
    flat = stats.copy()
    flat.columns = [
        '_'.join(str(level) for level in column) if isinstance(column, tuple) else str(column)
        for column in flat.columns
    ]
    return {
        str(group): {
            name: (None if pd.isna(value) else float(value))
            for name, value in row.items()
        }
        for group, row in flat.to_dict(orient='index').items()
    }


# Collect everything worth exporting into one JSON-serialisable structure
export_data = {
    'config': CONFIG,
    'evaluation_results': evaluation_results,
    'dataset_statistics': {
        'total_articles': len(evaluation_df),
        'categories': int(evaluation_df['category'].nunique()),
        'sources': int(evaluation_df['source'].nunique()),
        'avg_reference_length': float(evaluation_df['reference_length'].mean()),
        'avg_prediction_length': float(evaluation_df['prediction_length'].mean()),
        'avg_compression_ratio': float(
            (evaluation_df['prediction_length'] / evaluation_df['reference_length']).mean()
        ),
    },
    'category_performance': _stats_to_jsonable(category_stats),
    'source_performance': _stats_to_jsonable(source_stats),
}

# Save the combined results
with open(os.path.join(CONFIG['output_dir'], 'comprehensive_results.json'), 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2, ensure_ascii=False)

print(f"\nAll results have been saved to the '{CONFIG['output_dir']}' directory.")
print("\nFiles created:")
# os.listdir order is arbitrary; sort for a stable listing
for file in sorted(os.listdir(CONFIG['output_dir'])):
    print(f"  - {file}")

### 7.3 Conclusions and Next Steps

In [None]:
# Print the closing summary of the evaluation
title_rule = "=" * 60
print("\n" + title_rule)
print("CONCLUSIONS")
print(title_rule)

# Map the overall ROUGE-1 score onto a coarse performance label
rouge1_score = evaluation_results['summary']['rouge1']
performance_level = "NEEDS IMPROVEMENT"
for threshold, label in ((0.4, "EXCELLENT"), (0.3, "GOOD"), (0.2, "MODERATE")):
    if rouge1_score > threshold:
        performance_level = label
        break

print(f"\n1. Overall Performance: {performance_level}")
print(f"   - ROUGE-1 Score: {rouge1_score:.4f}")
print(f"   - This indicates that the model captures {rouge1_score*100:.1f}% of unigrams from reference summaries")

print("\n2. Key Findings:")

# Category with the highest mean ROUGE-1
category_means = evaluation_df.groupby('category')['rouge1'].mean()
best_category = category_means.idxmax()
best_category_score = category_means.max()
print(f"   - Best performing category: {best_category} (ROUGE-1: {best_category_score:.4f})")

# Source with the highest mean ROUGE-1
source_means = evaluation_df.groupby('source')['rouge1'].mean()
best_source = source_means.idxmax()
best_source_score = source_means.max()
print(f"   - Best performing source: {best_source} (ROUGE-1: {best_source_score:.4f})")

# Mean prediction/reference length ratio and what it suggests
avg_compression = (evaluation_df['prediction_length'] / evaluation_df['reference_length']).mean()
print(f"   - Average compression ratio: {avg_compression:.2f}")
if avg_compression < 0.8:
    length_note = "     → Model tends to generate shorter summaries than references"
elif avg_compression > 1.2:
    length_note = "     → Model tends to generate longer summaries than references"
else:
    length_note = "     → Model generates summaries with similar length to references"
print(length_note)

print("\n3. Recommendations for Improvement:")
for recommendation in (
    "   - Fine-tune the model on Indonesian news data for better performance",
    "   - Adjust temperature and max_length parameters based on category/source",
    "   - Consider using beam search for more consistent results",
    "   - Implement post-processing to improve summary quality",
):
    print(recommendation)

print("\n" + title_rule)