In [None]:
# 1. Import Required Libraries
# ----------------------------
import torch
from bert_score import score
from datasets import Dataset
from transformers import (
    BertTokenizer,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 2. Prepare the Example Dataset
# ------------------------------
# Sample lecturer records as (lecturer, journal title, journal scope) triples
_records = [
    ("Dr. Ahmad", "Optimizing Neural Networks", "deep learning"),
    ("Prof. Budi", "IoT Security Protocols", "IoT"),
    ("Dr. Chandra", "Software Testing Techniques", "rekayasa perangkat lunak"),
    ("Dr. Dian", "Natural Language Processing Applications", "machine learning"),
    ("Prof. Endah", "Edge Computing in IoT", "IoT"),
    ("Dr. Fajar", "Convolutional Neural Networks", "deep learning"),
    ("Dr. Gita", "Agile Software Development", "rekayasa perangkat lunak"),
    ("Prof. Hendra", "Reinforcement Learning Models", "machine learning"),
    ("Dr. Indra", "IoT Device Management", "IoT"),
    ("Dr. Joko", "Generative Adversarial Networks", "deep learning"),
]

# Expand each triple into the dict layout used by the rest of the script
data = [
    {"lecturer": lecturer, "journal": journal, "scope": scope}
    for lecturer, journal, scope in _records
]

# Split into training and test sets: the first 7 records train, the last 3 test
train_data, test_data = data[:7], data[7:]

# Wrap both splits as Hugging Face Dataset objects
train_dataset, test_dataset = (
    Dataset.from_list(split) for split in (train_data, test_data)
)

# 3. Initialize the Tokenizer and Model
# -------------------------------------
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Build an EncoderDecoder model with BERT as both the encoder and the decoder
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# A BERT2BERT model created this way has no seq2seq special tokens configured.
# The model derives decoder inputs by shifting `labels` to the right, which
# needs both decoder_start_token_id and pad_token_id; without them the forward
# pass with labels raises a ValueError. BERT has no dedicated BOS/EOS tokens,
# so [CLS] starts decoding and [SEP] ends it.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id

model.to(device)

# 4. Preprocessing Function
# -------------------------
# Tokenizes the inputs and the targets
def preprocess_function(examples):
    """Tokenize a batch of records into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with 'lecturer', 'journal' and 'scope' keys,
            each holding a list of strings of equal length (the layout
            produced by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the input texts (padded/truncated to 64
        tokens) with an added 'labels' entry: the target token ids
        (padded/truncated to 16 tokens) where every padding position is
        replaced by -100.
    """
    # Combine the lecturer name and journal title into the input text
    inputs = [
        f"Dosen: {lecturer}. Jurnal: {journal}"
        for lecturer, journal in zip(examples['lecturer'], examples['journal'])
    ]
    # The journal scope is the target text
    targets = examples['scope']

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=64)
    # Tokenize the targets. BERT uses the same tokenization for source and
    # target text, so the deprecated as_target_tokenizer() context manager is
    # not needed here.
    labels = tokenizer(targets, padding='max_length', truncation=True, max_length=16)

    # Mask padding positions with -100 so the loss ignores them; otherwise the
    # model is trained mostly to predict [PAD] for these short targets.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Run the preprocessing function over both splits (batched)
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model-facing columns, as PyTorch tensors
columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=columns)

# 5. Define the Training Arguments and the Trainer
# ------------------------------------------------
# `predict_with_generate` is only accepted by Seq2SeqTrainingArguments; passing
# it to the plain TrainingArguments raises a TypeError. The seq2seq variants
# are subclasses of TrainingArguments / Trainer, so everything else is unchanged.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='epoch',
    logging_steps=10,
    save_total_limit=2,
    predict_with_generate=True,
    # Mixed precision is only enabled when a CUDA device is present
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    disable_tqdm=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# 6. Train the Model
# ------------------
# Run training with the trainer configured above: it trains on train_dataset
# and evaluates on test_dataset according to the evaluation_strategy setting.
trainer.train()

# 7. Generate Outputs for the Test Data
# -------------------------------------
# Build the input texts from the test data (same template as in preprocessing)
test_inputs = [f"Dosen: {item['lecturer']}. Jurnal: {item['journal']}" for item in test_data]

# Tokenize the test inputs and move the tensors to the model's device
test_encodings = tokenizer(test_inputs, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)

# Make sure dropout is disabled for inference, whatever mode training left the
# model in.
model.eval()

# Generate outputs with the model. The special-token ids are passed explicitly
# because an encoder-decoder built from two BERT checkpoints has no start/end
# tokens for generation unless they are configured: [CLS] starts decoding,
# [SEP] ends it, and [PAD] fills finished sequences in the batch.
generated_ids = model.generate(
    input_ids=test_encodings['input_ids'],
    attention_mask=test_encodings['attention_mask'],
    max_length=16,
    num_beams=4,
    early_stopping=True,
    decoder_start_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id,
    pad_token_id=tokenizer.pad_token_id
)

# Decode the generated ids back into text
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Collect the reference texts (the original journal scopes)
reference_texts = [item['scope'] for item in test_data]

# 8. Evaluate with BERTScore
# --------------------------
# score() returns precision, recall and F1 with one entry per
# candidate/reference pair
P, R, F1 = score(generated_texts, reference_texts, lang='id', verbose=True)

# 9. Show the Per-Example Scores
# ------------------------------
examples = zip(test_inputs, generated_texts, reference_texts, P, R, F1)
for number, (source, generated, reference, prec, rec, f1) in enumerate(examples, start=1):
    report_lines = [
        f"Contoh {number}:",
        f"Input: {source}",
        f"Generated Scope: {generated}",
        f"Reference Scope: {reference}",
        f"Precision: {prec.item():.4f}",
        f"Recall: {rec.item():.4f}",
        # The trailing newline leaves a blank line between examples
        f"F1 Score: {f1.item():.4f}\n",
    ]
    print("\n".join(report_lines))

# 10. Compute and Show the Average Scores
# ---------------------------------------
avg_precision, avg_recall, avg_f1 = (
    metric.mean().item() for metric in (P, R, F1)
)

print("Skor Rata-rata:")
for label, value in (
    ("Precision", avg_precision),
    ("Recall", avg_recall),
    ("F1 Score", avg_f1),
):
    print(f"{label}: {value:.4f}")
