# In [1]:
from tinycss2 import tokenizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
import json
import os

def load_dataset(json_file):
    """Load a dataset from a JSON file.

    Args:
        json_file: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON content, exactly as parsed by the ``json`` module.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# compute_metrics: averaged ROUGE scores for one evaluation pass.
def compute_metrics(pred, tokenizer=None, plot_path='rouge_scores.png'):
    """Compute average ROUGE-1/2/L F-measures and save them as a bar chart.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``
            (the object ``Trainer`` passes to ``compute_metrics``).
            ``predictions`` may be raw logits (3-D) or already arg-maxed
            token ids (2-D).
        tokenizer: Tokenizer providing ``batch_decode``. When omitted, the
            module-level name ``tokenizer`` is used, as in the original
            single-argument call.
        plot_path: Where the bar chart of the averaged scores is written.

    Returns:
        A dict with the keys ``rouge1``, ``rouge2`` and ``rougeL``. All three
        are ``0.0`` when there is nothing to score.

    Raises:
        TypeError: If no object with a ``batch_decode`` method is available.
    """
    if tokenizer is None:
        # Backward-compatible fallback to the module-level name. Note that the
        # file-level ``from tinycss2 import tokenizer`` binds that name to a
        # CSS tokenizer module, which cannot decode token ids.
        tokenizer = globals().get('tokenizer')
    if not hasattr(tokenizer, 'batch_decode'):
        raise TypeError(
            "compute_metrics needs a tokenizer with a batch_decode() method; "
            "pass it via the 'tokenizer' argument."
        )

    predictions = pred.predictions
    if isinstance(predictions, tuple):
        # Models returning several outputs put the logits first.
        predictions = predictions[0]
    predictions = torch.as_tensor(predictions)
    pred_ids = predictions.argmax(dim=-1) if predictions.dim() == 3 else predictions
    labels = torch.as_tensor(pred.label_ids)

    # A causal LM's output at position i predicts the token at position i + 1,
    # so drop the last prediction and the first label to line the two up.
    pred_ids = pred_ids[:, :-1]
    labels = labels[:, 1:]

    # -100 marks positions excluded from the loss; it is not a valid token id
    # and cannot be decoded. Replace it (in labels and at the same positions
    # in the predictions) with a special token that decoding then skips.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
    ignored = labels == -100
    labels = labels.masked_fill(ignored, fill_id)
    pred_ids = pred_ids.masked_fill(ignored | (pred_ids < 0), fill_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # RougeScorer.score expects (target, prediction) in that order.
    rouge_scores = [
        scorer.score(label_text, pred_text)
        for pred_text, label_text in zip(pred_texts, label_texts)
    ]

    if not rouge_scores:
        # Nothing was evaluated: avoid dividing by zero and keep any old plot.
        return {name: 0.0 for name in metric_names}

    averages = {
        name: sum(score[name].fmeasure for score in rouge_scores) / len(rouge_scores)
        for name in metric_names
    }
    _save_rouge_plot(averages, plot_path)
    return averages


def _save_rouge_plot(averages, plot_path):
    """Draw the averaged ROUGE scores as a bar chart and save it to ``plot_path``."""
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        ax.bar(list(averages), list(averages.values()), color='blue')
        ax.set_title('Average ROUGE Scores')
        ax.set_xlabel('Metrics')
        ax.set_ylabel('Scores')
        try:
            fig.savefig(plot_path)
        except Exception as e:
            # Deliberately best-effort: a failed plot must not abort training.
            print(f"Error saat menyimpan visualisasi: {e}")
            print("Gagal menyimpan file visualisasi.")
        else:
            print(f"Visualisasi disimpan secara lokal: {plot_path}")
            print(f"File visualisasi berhasil disimpan di: {plot_path}")
    finally:
        # This runs on every evaluation; close the figure so they do not pile up.
        plt.close(fig)


class _ConversationDataset(torch.utils.data.Dataset):
    """Expose batched tokenizer output as one dict of tensors per example."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])


def _token_ids_from_logits(logits, labels):
    """Reduce logits to arg-max token ids before Trainer accumulates them."""
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# fine_tune_model: loads the dataset via load_dataset, fine-tunes the model
# and evaluates it on a held-out split during training.
def fine_tune_model(
    model_name="kalisai/Nusantara-0.8b-Indo-Chat",
    dataset_file='./output2.json',
    output_dir='./fine-tuned-model',
    max_length=128,
    eval_fraction=0.1,
):
    """Fine-tune a causal language model on prompt/response conversations.

    Each record of the JSON dataset is read as
    ``record['conversations'][0]['value']`` (prompt) and
    ``record['conversations'][1]['value']`` (response).

    Args:
        model_name: Model identifier handed to ``from_pretrained``.
        dataset_file: Path of the JSON dataset.
        output_dir: Directory for checkpoints, the final model and tokenizer.
        max_length: Token length every example is padded/truncated to. The
            default of 128 covers prompt and response together (the original
            code used 64 for each separately).
        eval_fraction: Share of the examples held out for evaluation. With
            ``0`` (or a single example) evaluation is switched off.

    Raises:
        FileNotFoundError: If ``dataset_file`` does not exist.
        ValueError: If the dataset contains no records.
    """
    # Read the data first so a wrong path fails before the model is loaded.
    data = load_dataset(dataset_file)
    if not data:
        raise ValueError(f"No conversations found in {dataset_file!r}")

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # padding='max_length' requires a pad token.
        tokenizer.pad_token = tokenizer.eos_token

    # DataCollatorForLanguageModeling(mlm=False) builds `labels` from
    # `input_ids`, so the response has to be part of the input sequence;
    # separately tokenized labels would be overwritten and never trained on.
    # NOTE(review): prompt and response are joined with a plain newline and an
    # explicit EOS; switch to the model's chat template if it expects one.
    eos = tokenizer.eos_token or ''
    texts = [
        conv['conversations'][0]['value'] + '\n' + conv['conversations'][1]['value'] + eos
        for conv in data
    ]
    encodings = tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    dataset = _ConversationDataset(encodings)

    # Step-wise evaluation needs an eval dataset; hold out a reproducible split.
    eval_size = 0
    if eval_fraction > 0 and len(dataset) > 1:
        eval_size = min(len(dataset) - 1, max(1, round(len(dataset) * eval_fraction)))
    if eval_size:
        train_split, eval_split = torch.utils.data.random_split(
            dataset,
            [len(dataset) - eval_size, eval_size],
            generator=torch.Generator().manual_seed(0),
        )
    else:
        train_split, eval_split = dataset, None
    run_eval = eval_split is not None

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=10,
        gradient_accumulation_steps=16,
        num_train_epochs=16,
        save_steps=500,
        save_total_limit=2,
        evaluation_strategy="steps" if run_eval else "no",
        eval_steps=250,
        logging_dir='./logs',
        report_to="none",
        fp16=torch.cuda.is_available(),
        # Picking the best checkpoint is only possible when evaluation runs.
        load_best_model_at_end=run_eval,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        # Hand the real tokenizer to compute_metrics instead of a global.
        compute_metrics=(
            (lambda pred: compute_metrics(pred, tokenizer=tokenizer)) if run_eval else None
        ),
        # Keep only token ids per batch; full-vocabulary logits for the whole
        # evaluation set would otherwise be held in memory.
        preprocess_logits_for_metrics=_token_ids_from_logits if run_eval else None,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    
# Start fine-tuning only when this file is run directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    fine_tune_model()


# Output of the cell above:
# FileNotFoundError: [Errno 2] No such file or directory: './output2.json'