In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlft-dataset/train.01.csv
/kaggle/input/mlft-dataset/dev.01.csv


In [85]:
!pip install transformers



In [86]:
from transformers import AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch import optim
from tqdm import tqdm
import torch

In [96]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('malay-huggingface/t5-super-super-tiny-bahasa-cased')
model = T5ForConditionalGeneration.from_pretrained('malay-huggingface/t5-super-super-tiny-bahasa-cased')


In [89]:
import pandas as pd
from torch.utils.data import Dataset

class T5Dataset(Dataset):
    def __init__(self, path, tokenizer):
        self.df = pd.read_csv(path, on_bad_lines='skip')
        self.summary = self.df["summary"]
        self.paragraphs = self.df["paragraphs"]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.paragraphs)

    def __getitem__(self, index):
        paragraphs = str(self.paragraphs[index])
        summary = str(self.summary[index])
        pad = self.tokenizer.pad_token
        eos = self.tokenizer.eos_token
        
        # Encode input (paragraf) dengan prefix "ringkasan: "
        encoding_paragraphs = self.tokenizer.encode_plus(
            "ringkasan: " + paragraphs,
            return_token_type_ids=False,
            return_attention_mask=True,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors='pt'
        )

        # Encode output (ringkasan)
        encoding_summary = self.tokenizer.encode(
            pad + summary,
            add_special_tokens=False,
            return_token_type_ids=False,
            max_length=150,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'sentence_text': paragraphs,
            'summary_text': summary,
            'input_ids': encoding_paragraphs['input_ids'].flatten(),
            'attention_mask': encoding_paragraphs['attention_mask'].flatten(),
            'lm_labels': encoding_summary.flatten(),
        }


In [90]:
train_set = T5Dataset("/kaggle/input/sumdatav2/train_Indosum (1).csv",tokenizer)
train_loader = DataLoader(train_set, batch_size = 8,shuffle = True)
val_set = T5Dataset("/kaggle/input/sumdatav2/test_Indosum (1).csv",tokenizer)
val_loader = DataLoader(val_set, batch_size = 8,shuffle = True)

In [91]:
#test token
trytoken =tokenizer('kucing itu mengendarai kuda supaya baik jalannya hey hey hey aku blutuh hah tidak mungkin')

In [92]:
trytoken

{'input_ids': [8835, 37, 1882, 10070, 5345, 6001, 2401, 187, 489, 38, 57, 128, 57, 128, 57, 128, 10294, 13, 380, 9721, 2262, 13, 261, 668, 30, 167, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [93]:
trytoken = trytoken['input_ids']

In [94]:
[tokenizer.decode(x)for x in trytoken]

['kucing',
 'itu',
 'meng',
 'enda',
 'rai',
 'kuda',
 'supaya',
 'baik',
 'jalan',
 'nya',
 'he',
 'y',
 'he',
 'y',
 'he',
 'y',
 'aku',
 '',
 'b',
 'lut',
 'uh',
 '',
 'h',
 'ah',
 'tidak',
 'mungkin',
 '</s>']

In [109]:
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
model = model.to("cuda")

In [110]:
import os
import time

optimizer = optim.AdamW(model.parameters(), lr=1e-5)
model = model.to("cuda")

best_val_loss = float("inf")
early_stop = 0
epochs = 10 #atur epoch disini (5,10,15,20) lama runningnya

save_path = "/kaggle/working/model_summarization/"
os.makedirs(save_path, exist_ok=True)


for epoch in range(epochs):
    model.train()
    train_loss = 0
    start_time = time.time()

    for idx, data in tqdm(enumerate(train_loader), total=len(train_loader)):
        try:
            sentence_text, summary_text = data["sentence_text"], data["summary_text"]
            input_ids, attention_mask, labels = data["input_ids"], data["attention_mask"], data["lm_labels"]
            input_ids, attention_mask, labels = input_ids.to("cuda"), attention_mask.to("cuda"), labels.to("cuda")
            # Ubah padding token pada labels menjadi -100
            labels[labels == tokenizer.pad_token_id] = -100

            optimizer.zero_grad()

            # Forward pass
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = output.loss  # Akses loss langsung
            train_loss += loss.item()

            # Backward pass dan optimisasi
            loss.backward()
            optimizer.step()

            # Logging setiap 1000 iterasi
            if (idx % 1000) == 0:
                elapsed_time = time.time() - start_time
                print(f"Epoch {epoch + 1}, Step {idx}, Loss: {loss.item():.4f}, Train Loss: {train_loss / (idx + 1):.4f}, Time/Step: {elapsed_time / (idx + 1):.2f}s")
        except Exception as e:
            print(f"Error at batch {idx}: {e}")
            continue

    # Eval setiap selesai jalanin satu epoch 
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for idx, data in tqdm(enumerate(val_loader), total=len(val_loader)):
            sentence_text, summary_text = data["sentence_text"], data["summary_text"]
            input_ids, attention_mask, labels = data["input_ids"], data["attention_mask"], data["lm_labels"]
            input_ids, attention_mask, labels = input_ids.to("cuda"), attention_mask.to("cuda"), labels.to("cuda")
            # Ubah padding token pada labels menjadi -100
            labels[labels == tokenizer.pad_token_id] = -100

            # Forward pass
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = output.loss
            val_loss += loss.item()

    # Simpan model terbaik
    avg_val_loss = val_loss / len(val_loader)
    if avg_val_loss < best_val_loss:
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        best_val_loss = avg_val_loss
        early_stop = 0  # Reset early stopping
    else:
        early_stop += 1

    # Logging
    print(f"Epoch {epoch + 1} completed. Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping diatur kalau setelah 3 epoch beruntun, val_loss tidak menurun
    if early_stop == 5:
        print("Early stopping triggered.")
        break


  0%|          | 3/1056 [00:00<01:24, 12.45it/s]

Epoch 1, Step 0, Loss: 9.7085, Train Loss: 9.7085, Time/Step: 0.07s


 95%|█████████▍| 1003/1056 [00:59<00:03, 16.42it/s]

Epoch 1, Step 1000, Loss: 4.4652, Train Loss: 5.7019, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:02<00:00, 16.99it/s]
100%|██████████| 118/118 [00:04<00:00, 24.82it/s]


Epoch 1 completed. Train Loss: 5.6486, Val Loss: 1.1313


  0%|          | 2/1056 [00:00<01:10, 14.87it/s]

Epoch 2, Step 0, Loss: 4.5572, Train Loss: 4.5572, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.66it/s]

Epoch 2, Step 1000, Loss: 4.0383, Train Loss: 4.2650, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.25it/s]
100%|██████████| 118/118 [00:04<00:00, 24.83it/s]


Epoch 2 completed. Train Loss: 4.2500, Val Loss: 1.1262


  0%|          | 2/1056 [00:00<01:16, 13.69it/s]

Epoch 3, Step 0, Loss: 3.9013, Train Loss: 3.9013, Time/Step: 0.07s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.71it/s]

Epoch 3, Step 1000, Loss: 3.3362, Train Loss: 3.7330, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.10it/s]
100%|██████████| 118/118 [00:04<00:00, 25.43it/s]


Epoch 3 completed. Train Loss: 3.7246, Val Loss: 1.1194


  0%|          | 2/1056 [00:00<01:06, 15.86it/s]

Epoch 4, Step 0, Loss: 3.2315, Train Loss: 3.2315, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.63it/s]

Epoch 4, Step 1000, Loss: 3.8059, Train Loss: 3.4102, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.17it/s]
100%|██████████| 118/118 [00:04<00:00, 24.81it/s]


Epoch 4 completed. Train Loss: 3.4037, Val Loss: 1.1238


  0%|          | 2/1056 [00:00<01:08, 15.48it/s]

Epoch 5, Step 0, Loss: 3.1833, Train Loss: 3.1833, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.72it/s]

Epoch 5, Step 1000, Loss: 3.3396, Train Loss: 3.1812, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.13it/s]
100%|██████████| 118/118 [00:04<00:00, 25.36it/s]


Epoch 5 completed. Train Loss: 3.1747, Val Loss: 1.1331


  0%|          | 2/1056 [00:00<01:06, 15.88it/s]

Epoch 6, Step 0, Loss: 3.6014, Train Loss: 3.6014, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.20it/s]

Epoch 6, Step 1000, Loss: 3.1091, Train Loss: 3.0021, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.10it/s]
100%|██████████| 118/118 [00:04<00:00, 24.33it/s]


Epoch 6 completed. Train Loss: 2.9987, Val Loss: 1.1322


  0%|          | 2/1056 [00:00<01:07, 15.70it/s]

Epoch 7, Step 0, Loss: 2.8752, Train Loss: 2.8752, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:02, 17.39it/s]

Epoch 7, Step 1000, Loss: 2.8564, Train Loss: 2.8553, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.30it/s]
100%|██████████| 118/118 [00:04<00:00, 24.95it/s]


Epoch 7 completed. Train Loss: 2.8492, Val Loss: 1.1333


  0%|          | 2/1056 [00:00<01:09, 15.11it/s]

Epoch 8, Step 0, Loss: 2.7884, Train Loss: 2.7884, Time/Step: 0.06s


 95%|█████████▌| 1004/1056 [00:58<00:03, 16.58it/s]

Epoch 8, Step 1000, Loss: 3.3621, Train Loss: 2.7460, Time/Step: 0.06s


100%|██████████| 1056/1056 [01:01<00:00, 17.13it/s]
100%|██████████| 118/118 [00:04<00:00, 25.04it/s]

Epoch 8 completed. Train Loss: 2.7431, Val Loss: 1.1387
Early stopping triggered.





In [111]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

save_path = "/kaggle/working/model_summarization/"

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(save_path)

print("Model and tokenizer successfully loaded!")

Model and tokenizer successfully loaded!


In [119]:

with torch.no_grad():
    data = next(iter(val_loader))
    sentence_text, summary_text, input_ids, attention_mask, lm_labels = (
        data["sentence_text"],
        data["summary_text"],
        data["input_ids"],
        data["attention_mask"],
        data["lm_labels"],
    )

    # Generate summary
    generated = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=150,
    min_length=40,
    length_penalty=1.0,
    num_beams=5,  # Beam search
    early_stopping=True,
    )

    # Decode generated output
    decoded_summary = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,  # Hilangkan token spesial seperti <pad>, <sos>, dll.
        clean_up_tokenization_spaces=True,  # Hapus spasi tambahan
    )

    # Print results
    print("Full text:")
    print(sentence_text[0])  # Pastikan sentence_text berupa string
    print("\nReference summary:")
    print(summary_text[0])  # Pastikan summary_text berupa string
    print("\nGenerated summary:")
    print(decoded_summary)  # Hasil yang telah dibersihkan


Full text:
DEPOK ( Pos Kota )  Guna mematangkan kesiapan personil menjelang pengamanan Pilgub Jawa Barat ( Jabar ) 2018  ratusan personil Polresta Depok menggelar latihan simulasi Pra Operasi Pilkada di lapangan utama Mapolresta Depok  Rabu ( 3 / 1 ) .]] Pelatihan simulasi dibawah tanggung jawab pengawasan Kabag Ops Polresta Depok Kompol Hari Agung Julianto  serta Kasat Sabhara Kompol Subandi  dengan menurunkan sebanyak sekitar 300 personil dari semua satuan fungsi melakukan simulasi jalannya pengamanan pilkada .]] Menurut Kasubag Humas Polresta Depok AKP Sutrisno mengatakan semua personil terdari dari satuan fungsi yang ada termasuk seluruh anggota binmas diterjunkan dalam melakukan bagaimana cara pengamanan Pilkada .]] Â Tujuan simulasi yang rencana akan diadakan selama dua hari ini dalam rangka persiapan Pam Pilkada Jabar 2018 nanti  tahapan mulai dari rangkaian kegiatan Pilkada mulai dari pengamanan kota suara  aksi huru hara  serta bagaiaman penanganan aksi terorÂujarnya kepada 

In [147]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

save_path = "/kaggle/working/model_summarization/"

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(save_path)

print("Model and tokenizer successfully loaded!")


Model and tokenizer successfully loaded!


In [83]:

print("Full text:")
print(sentence_text[0])  # Pastikan sentence_text berupa string
print("\nReference summary:")
print(summary_text[0])  # Pastikan summary_text berupa string
print("\nGenerated summary:")
input_ids = tokenizer.encode('ringkasan: ',sentence_text[0], return_tensors = 'pt')
outputs = model.generate(input_ids,max_length=150,
        min_length=80)
print(tokenizer.decode(outputs[0]))

Full text:
Rimanews - Calon Gubernur DKI Jakarta Anies Baswedan memberikan klarifikasi tentang ketidakhadiran dia maupun calon wakilnya  Sandiaga Uno  dalam debat cagub dan cawagub Kompas TV  tadi malam . Anies menjelaskan  acara tersebut semula diatur mempertemukan antar calon gubernur dan di acara berikutnya antara dua calon wakil gubernur .  Minggu lalu debat cagub  dan pekan ini adalah debat calon wakil gubernur . Bukan antar pasangan   kata Anies kepada wartawan di kampus Universitas Al Azhar  Jakarta Selatan  hari ini . Pasangan Anies - Sandi memutuskan absen dalam acara debat pasangan cagub dan cawagub DKI  tadi malam  tidak adanya kesepakatan antara pihak Kompas TV dengan tim pemenangan Anies - Sandi . Konsultan politik pasangan calon nomor urut tiga  Eep Saefullah Fatah  mengatakan permintaan dari pihak media center Anies - Sandi tidak ditanggapi oleh Kompas TV . Menurut Eep  pihaknya meminta agar format acara bukan debat antar cawagub tetapi talkshow dengan mengadirkan cawagu

In [84]:
import shutil

# Path direktori yang ingin didownload
source_path = "/kaggle/working/model_summarization"
output_zip = "/kaggle/working/model_summarization.zip"

# Kompres direktori menjadi zip
shutil.make_archive(source_path, 'zip', source_path)
print(f"Direktori dikompres ke {output_zip}")


Direktori dikompres ke /kaggle/working/model_summarization.zip


In [61]:
print(input_ids.device, attention_mask.device, lm_labels.device)
print(next(model.parameters()).device)  # Untuk memeriksa perangkat model


cuda:0 cuda:0 cuda:0
cuda:0


In [122]:
import os
import pandas as pd

In [134]:
datapath = os.path.join('/kaggle/input/sumdatav2/test_Indosum (1).csv')

data_test = pd.read_csv(datapath, encoding='latin-1')
data_test.head()

Unnamed: 0,paragraphs,summary
0,Jakarta CNN Indonesia - - Bayern Munich menja...,Bayern Munich menjadi klub raksasa Eropa teran...
1,Merdeka.com - Catatan tak pernah kalah yang di...,Catatan tak pernah kalah yang dirangkum oleh P...
2,Instagram Stories adalah candu baru buat merek...,Kita mungkom telah banyak melihat ada teman ya...
3,Pemerintah bersama dengan Kepolisian RI TNI ...,Pemerintah bersama dengan Kepolisian RI TNI ...
4,Mantan Bupati Buton Samsu Umar Abdul Samiun d...,Mantan Bupati Buton Samsu Umar Abdul Samiun d...


In [150]:
save_path = "/kaggle/working/model_summarization/"

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(save_path)

In [151]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [154]:
from tqdm import tqdm
# Fungsi untuk membuat ringkasan
def summarize_text(paragraphs, model, tokenizer, max_length=150):
    # Tokenisasi input
    inputs = tokenizer.encode("ringkasan: " + paragraphs, return_tensors="pt", truncation=True).to(device)
    # Generate summary
    outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    # Decode hasilnya
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Generate summaries untuk setiap paragraf
model_summaries = []
for paragraphs in tqdm(data_test['paragraphs'], desc="Generating"):
    summary = summarize_text(paragraphs, model, tokenizer)
    model_summaries.append(summary)

# Tambahkan kolom baru ke DataFrame
data_test['model_summary'] = model_summaries

data_test.head()


Generating: 100%|██████████| 939/939 [13:05<00:00,  1.20it/s]


Unnamed: 0,paragraphs,summary,model_summary
0,Jakarta CNN Indonesia - - Bayern Munich menja...,Bayern Munich menjadi klub raksasa Eropa teran...,Bayern Munich menjadi klub raksasa Eropa teran...
1,Merdeka.com - Catatan tak pernah kalah yang di...,Catatan tak pernah kalah yang dirangkum oleh P...,Catatan tak pernah kalah yang dirangkum oleh P...
2,Instagram Stories adalah candu baru buat merek...,Kita mungkom telah banyak melihat ada teman ya...,Media yang tepat bagi orang dalam membagikan k...
3,Pemerintah bersama dengan Kepolisian RI TNI ...,Pemerintah bersama dengan Kepolisian RI TNI ...,Pemerintah bersama dengan Kepolisian RI TNI Ja...
4,Mantan Bupati Buton Samsu Umar Abdul Samiun d...,Mantan Bupati Buton Samsu Umar Abdul Samiun d...,Mantan Bupati Buton Samsu Umar Abdul Samsu Uma...


In [156]:
output_file_path = '/kaggle/working/model_summarization/file_test_with_summary.csv'
data_test.to_csv(output_file_path, index=False)

In [157]:
pip install rouge-score

  pid, fd = os.forkpty()


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=df3c4213c4fcda44f459af2a11f46cc381043958bc76e16e60dec4cab2a0b355
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [158]:
import pandas as pd
from rouge_score import rouge_scorer

# Load file dengan summary dan model_summary
file_path = '/kaggle/working/model_summarization/file_test_with_summary.csv'  # Ganti dengan file CSV-mu
data = pd.read_csv(file_path)

# Pastikan kolom sesuai
references = data['summary'].tolist()  # Kolom referensi
generated_summaries = data['model_summary'].tolist()  # Kolom ringkasan model

# Inisialisasi ROUGE Scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluasi setiap pasangan summary
rouge_results = {
    'rouge1_precision': [],
    'rouge1_recall': [],
    'rouge1_f1': [],
    'rouge2_precision': [],
    'rouge2_recall': [],
    'rouge2_f1': [],
    'rougeL_precision': [],
    'rougeL_recall': [],
    'rougeL_f1': [],
}

for ref, gen in zip(references, generated_summaries):
    scores = scorer.score(ref, gen)
    rouge_results['rouge1_precision'].append(scores['rouge1'].precision)
    rouge_results['rouge1_recall'].append(scores['rouge1'].recall)
    rouge_results['rouge1_f1'].append(scores['rouge1'].fmeasure)
    rouge_results['rouge2_precision'].append(scores['rouge2'].precision)
    rouge_results['rouge2_recall'].append(scores['rouge2'].recall)
    rouge_results['rouge2_f1'].append(scores['rouge2'].fmeasure)
    rouge_results['rougeL_precision'].append(scores['rougeL'].precision)
    rouge_results['rougeL_recall'].append(scores['rougeL'].recall)
    rouge_results['rougeL_f1'].append(scores['rougeL'].fmeasure)

# Hitung rata-rata metrik
average_results = {metric: sum(values) / len(values) for metric, values in rouge_results.items()}

# Tampilkan hasil rata-rata
print("ROUGE Evaluation Results:")
for metric, value in average_results.items():
    print(f"{metric}: {value:.4f}")


ROUGE Evaluation Results:
rouge1_precision: 0.3942
rouge1_recall: 0.6481
rouge1_f1: 0.4870
rouge2_precision: 0.3303
rouge2_recall: 0.5454
rouge2_f1: 0.4088
rougeL_precision: 0.3716
rougeL_recall: 0.6103
rougeL_f1: 0.4590
