# Installation

In [None]:
# Install every third-party dependency before the import cell runs.
# - `rouge` is imported in the Library section, so it has to be installed here
#   for the notebook to work on a fresh kernel.
# - `datasets<3` keeps `load_metric` importable (it was removed in datasets 3.0).
# `%pip` installs into the environment of the running kernel.
%pip install -q "transformers[sentencepiece]" "datasets<3" sacrebleu rouge_score py7zr rouge==1.0.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.2/411.2 kB[0m [31m27.2

# Library

In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
import rouge

from rouge import Rouge
from transformers import BertTokenizer, EncoderDecoderModel
from datasets import load_dataset, load_metric
from transformers import pipeline, set_seed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# Load Dataset

In [None]:
# Project layout, resolved against the current working directory.
dataset_folder = 'dataset'
model_folder = 'models'
project_path = 'drive/MyDrive/IndonesiaAI/TextSummarization/'

path = os.getcwd()
full_project_path = os.path.join(path, project_path)

# Both sub-folders live directly under the project root.
dataset_path, model_path = (
    os.path.join(full_project_path, folder)
    for folder in (dataset_folder, model_folder)
)

In [None]:
# Load the preprocessed test split; the first CSV column becomes the index.
# NOTE: the path still points at a local location.
test_csv_path = os.path.join(dataset_path, 'df_test_prep.csv')
dataset = pd.read_csv(test_csv_path, index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,id,url,clean_article,clean_summary,extractive_summary,clean_article_prep,clean_summary_prep
1,13020,https://www.liputan6.com/news/read/13020/bi-di...,"Liputan6.com, Jakarta: Bank Indonesia dinilai ...",Kendati Bank Sentral AS menurunkan suku bungan...,"Liputan6.com, Jakarta: Bank Indonesia dinilai ...",bank indonesia dinilai masih akan menghadapi s...,kendati bank sentral as menurunkan suku bungan...
3,13024,https://www.liputan6.com/news/read/13024/perub...,"Liputan6.com, Jakarta: Penghapusan beberapa pa...","Revisi Kepmennaker Nomor 78 Tahun 2001, dinila...","Liputan6.com, Jakarta: Penghapusan beberapa pa...",penghapusan beberapa pasal menyangkut hak buru...,revisi kepmennaker nomor 78 tahun 2001 dinilai...
4,13025,https://www.liputan6.com/news/read/13025/puluh...,"Liputan6.com, Jakarta: Operasi Sadar Jaya yang...",Polisi menangkap 32 pengunjung Diskotik Mileni...,"Dari operasi tersebut, polisi menangkap 32 pen...",operasi sadar jaya yang dilancarkan selasa mal...,polisi menangkap 32 pengunjung diskotik mileni...
6,13027,https://www.liputan6.com/news/read/13027/ruu-p...,"Liputan6.com, Jakarta: Sejumlah pasal dalam Ra...",Praktisi penyiaran menyoroti sejumlah pasal da...,RUU tersebut dipandang tak berpihak pada kepen...,sejumlah pasal dalam rancangan undang undang p...,praktisi penyiaran menyoroti sejumlah pasal da...
8,13030,https://www.liputan6.com/news/read/13030/pusat...,"Liputan6.com, Bantul: Pusat desa kerajinan ata...","Karena ditinggalkan para perajin, kondisi Desa...",Sejumlah benda antik yang dianggap menjadi cik...,bantul pusat desa kerajinan atau pasar kriya d...,karena ditinggalkan para perajin kondisi desa ...


In [None]:
# Drop the columns that are not needed for evaluation.
# Re-binding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for columns that are
# already gone. `axis=1` is omitted because `columns=` already implies it.
dataset = dataset.drop(
    columns=['url', 'extractive_summary', 'clean_article', 'clean_summary'],
    errors='ignore',
)
dataset.head()

Unnamed: 0,id,clean_article_prep,clean_summary_prep
1,13020,bank indonesia dinilai masih akan menghadapi s...,kendati bank sentral as menurunkan suku bungan...
3,13024,penghapusan beberapa pasal menyangkut hak buru...,revisi kepmennaker nomor 78 tahun 2001 dinilai...
4,13025,operasi sadar jaya yang dilancarkan selasa mal...,polisi menangkap 32 pengunjung diskotik mileni...
6,13027,sejumlah pasal dalam rancangan undang undang p...,praktisi penyiaran menyoroti sejumlah pasal da...
8,13030,bantul pusat desa kerajinan atau pasar kriya d...,karena ditinggalkan para perajin kondisi desa ...


In [None]:
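# (disabled) !pip install Dataset -q  -- not needed: `Dataset` is imported from the `datasets` package installed at the top of the notebook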

In [None]:
# Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`.
# Note: this rebinds `dataset`, so from here on it is a Dataset, not a DataFrame.
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)

In [None]:
# Display the Dataset summary (feature names and row count).
dataset

Dataset({
    features: ['id', 'clean_article_prep', 'clean_summary_prep', '__index_level_0__'],
    num_rows: 3862
})

In [None]:
# Sanity check: print the first preprocessed article in full.
first_record = dataset[0]
sample_text = first_record['clean_article_prep']
print(sample_text)


bank indonesia dinilai masih akan menghadapi situasi sulit kendati bank sentral amerika serikat terus menurunkan tingkat suku bunga yang dimiliki penilaian itu dikemukakan pengamat ekonomi didiek rachbini di jakarta baru baru ini menurut perhitungan didiek dalam tahun ini the fed telah lima kali menurunkan nilai suku bunga yang mereka miliki bahkan didiek memperkirakan tingkat suku bunga the fed akan diturunkan hingga menjadi empat persen dengan keadaan itu tambah didiek di atas kertas dapat dimanfaatkan bi untuk meningkatkan suku bunga bi sebagai upaya mempertahankan nilai tukar rupiah namun demikian didiek pesimistis hal itu akan tercapai mengingat kondisi bangsa masih carut marut jika keadaan terus seperti ini tak tertutup kemungkinan bi akan tetap memberlakukan nilai suku bunga tinggi ujar didiek sementara itu the fed terpaksa menurunkan tingkat suku bunga karena pertembuhan ekonomi di negeri paman sam terus melemah padahal selama ini as menjadi pasar ekspor penting untuk indonesia

# Model

The following models are compared:


*   Alfahluzi/bert2bert-dropout-0.3-lr-5e-05-ds-canonical
*   cahya/bert2bert-indonesian-summarization
*   Alfahluzi/bert2bert-extreme
*   model_batch_4_lr_1e-5
*   model_batch_4_lr_5e-5
*   model_batch_2_lr_1e-5
*   model_batch_2_lr_5e-5




## Helper Function to Calculate ROUGE

In [None]:
# NOTE: `rouge` is already imported in the Library section near the top of the
# notebook, so on a fresh kernel this install has to run before that import cell.
# Pinned to the version this notebook was run with.
%pip install -q rouge==1.0.1

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
def calculate_mean_rouge(dataframe, model_name, sample_size=None, seed=42):
    """
    Generate a summary for every article with a BERT encoder-decoder model and
    report the mean ROUGE F1 scores against the reference summaries.

    Args:
    - dataframe (DataFrame or datasets.Dataset): Table with the columns
      'clean_article_prep' (model input) and 'clean_summary_prep' (reference
      summary). An object that is not a pandas DataFrame but exposes
      `to_pandas()` is converted first.
    - model_name (str): Identifier or local directory handed to
      `BertTokenizer.from_pretrained` / `EncoderDecoderModel.from_pretrained`.
    - sample_size (int, optional): Number of rows to evaluate, drawn with
      `random_state=42`. None evaluates every row.
    - seed (int, optional): Passed to `torch.manual_seed` before generation so
      that the sampled summaries are repeatable between runs. This reseeds
      PyTorch's global RNG; pass None to leave the RNG untouched.

    Returns:
    - DataFrame: A single row with the columns 'Model Name', 'Mean ROUGE1',
      'Mean ROUGE2', 'Mean ROUGEL' and 'Mean ROUGELsum'.
      NOTE: despite its name, 'Mean ROUGELsum' is the mean of the per-row
      average of the ROUGE-1, ROUGE-2 and ROUGE-L F1 scores, not the
      summary-level ROUGE-Lsum metric. The column name is kept so existing
      result tables stay comparable.

    Raises:
    - ValueError: If there are no rows to evaluate.
    """
    # The sampling and row iteration below use the pandas API, so convert
    # table-like objects such as a Hugging Face Dataset up front.
    if not isinstance(dataframe, pd.DataFrame) and hasattr(dataframe, 'to_pandas'):
        dataframe = dataframe.to_pandas()

    if sample_size is not None:
        dataframe = dataframe.sample(sample_size, random_state=42)

    # Fail early, before the model is loaded, instead of dividing by zero later.
    if len(dataframe) == 0:
        raise ValueError("dataframe has no rows to evaluate")

    # Generation below samples (do_sample=True); seed it for repeatable scores.
    if seed is not None:
        torch.manual_seed(seed)

    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize ROUGE calculator, tokenizer and model
    scorer = Rouge()
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = EncoderDecoderModel.from_pretrained(model_name).to(run_device)

    # Per-row F1 scores
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    rougeLsum_scores = []

    for _, row in dataframe.iterrows():
        x_text = row['clean_article_prep']
        y_text = row['clean_summary_prep']

        # The tokenizer already returns PyTorch tensors (return_tensors="pt"),
        # so they are only moved to the model's device, not re-wrapped.
        inputs = tokenizer(x_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        input_ids = inputs.input_ids.to(run_device)
        attention_mask = inputs.attention_mask.to(run_device)

        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            min_length=20,
            max_length=128,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95)

        # Decode the generated summary
        output_str = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if output_str.strip() and y_text.strip():
            scores = scorer.get_scores(output_str, y_text)[0]
            rouge1 = scores['rouge-1']['f']
            rouge2 = scores['rouge-2']['f']
            rougeL = scores['rouge-l']['f']
        else:
            # A blank hypothesis or reference cannot be scored; count it as a
            # complete miss rather than aborting the whole evaluation.
            rouge1 = rouge2 = rougeL = 0.0

        # Average of the three F1 scores (see the docstring note on naming).
        rougeLsum = (rouge1 + rouge2 + rougeL) / 3

        rouge1_scores.append(rouge1)
        rouge2_scores.append(rouge2)
        rougeL_scores.append(rougeL)
        rougeLsum_scores.append(rougeLsum)

    n_rows = len(rouge1_scores)

    # One-row frame with the aggregated scores
    return pd.DataFrame({
        'Model Name': [model_name],
        'Mean ROUGE1': [sum(rouge1_scores) / n_rows],
        'Mean ROUGE2': [sum(rouge2_scores) / n_rows],
        'Mean ROUGEL': [sum(rougeL_scores) / n_rows],
        'Mean ROUGELsum': [sum(rougeLsum_scores) / n_rows]
    })

# Mean ROUGE

## cahya/bert2bert-indonesian-summarization

In [None]:
# Score the cahya/bert2bert-indonesian-summarization checkpoint on a 50-row sample.
rouge_1 = calculate_mean_rouge(
    dataframe=dataset,
    model_name="cahya/bert2bert-indonesian-summarization",
    sample_size=50,
)
rouge_1

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,cahya/bert2bert-indonesian-summarization,0.3121267801,0.1118384747,0.2527776395,0.2255809648


## Alfahluzi/bert2bert-extreme

In [None]:
# Score the Alfahluzi/bert2bert-extreme checkpoint on a 50-row sample.
rouge_2 = calculate_mean_rouge(
    dataframe=dataset,
    model_name="Alfahluzi/bert2bert-extreme",
    sample_size=50,
)
rouge_2

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/737k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,Alfahluzi/bert2bert-extreme,0.0656776834,0.0,0.0498435744,0.0385070859


## Alfahluzi/bert2bert-dropout-0.3-lr-5e-05-ds-canonical

In [None]:
# Score the Alfahluzi dropout-0.3 / lr 5e-05 checkpoint on a 50-row sample.
rouge_3 = calculate_mean_rouge(
    dataframe=dataset,
    model_name="Alfahluzi/bert2bert-dropout-0.3-lr-5e-05-ds-canonical",
    sample_size=50,
)
rouge_3

  input_ids = torch.tensor(input_ids)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


## model_batch_4_lr_1e-5

In [None]:
# Score the model stored under model_path as 'model_batch_4_lr_1e-5' on a 50-row sample.
rouge_4 = calculate_mean_rouge(
    dataframe=dataset,
    model_name=os.path.join(model_path, 'model_batch_4_lr_1e-5'),
    sample_size=50,
)
rouge_4

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,/content/drive/MyDrive/IndonesiaAI/TextSummari...,0.320739,0.117325,0.245667,0.22791


## model_batch_4_lr_5e-5

In [None]:
# Score the model stored under model_path as 'model_batch_4_lr_5e-5' on a 50-row sample.
rouge_5 = calculate_mean_rouge(
    dataframe=dataset,
    model_name=os.path.join(model_path, 'model_batch_4_lr_5e-5'),
    sample_size=50,
)
rouge_5

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,/content/drive/MyDrive/IndonesiaAI/TextSummari...,0.333851,0.120635,0.251822,0.235436


## model_batch_2_lr_1e-5

In [None]:
# Score the model stored under model_path as 'model_batch_2_lr_1e-5' on a 50-row sample.
rouge_6 = calculate_mean_rouge(
    dataframe=dataset,
    model_name=os.path.join(model_path, 'model_batch_2_lr_1e-5'),
    sample_size=50,
)
rouge_6

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,/content/drive/MyDrive/IndonesiaAI/TextSummari...,0.342705,0.126596,0.262253,0.243851


## model_batch_2_lr_5e-5

In [None]:
# Score the model stored under model_path as 'model_batch_2_lr_5e-5' on a 50-row sample.
rouge_7 = calculate_mean_rouge(
    dataframe=dataset,
    model_name=os.path.join(model_path, 'model_batch_2_lr_5e-5'),
    sample_size=50,
)
rouge_7

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Unnamed: 0,Model Name,Mean ROUGE1,Mean ROUGE2,Mean ROUGEL,Mean ROUGELsum
0,/content/drive/MyDrive/IndonesiaAI/TextSummari...,0.328523,0.117366,0.25087,0.232253
