In [1]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json

# Make sure only the CPU is used by hiding every CUDA device from this process.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# because nothing above touches CUDA yet — confirm, or move it above the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load dataset
def load_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Fine-tune the model
def fine_tune_model(model_name, dataset_file, output_dir, max_length=128):
    """Fine-tune a causal language model on prompt/response pairs and save it.

    Every record in the JSON file must contain a ``'conversations'`` list whose
    first element holds the prompt and whose second element holds the response
    (both under the ``'value'`` key). Prompt and response are joined into one
    training sequence so that the language-modelling loss covers the response.

    Args:
        model_name: Model identifier or local path passed to ``from_pretrained``.
        dataset_file: Path to the JSON file with the conversation records.
        output_dir: Directory for checkpoints, the final model and the tokenizer.
        max_length: Number of tokens each joined sequence is truncated/padded to.

    Raises:
        ValueError: If the dataset file contains no records.
    """
    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # padding='max_length' needs a pad token; fall back to EOS, or add a
    # dedicated token (and grow the embedding matrix) if there is no EOS either.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))

    # Load dataset
    data = load_dataset(dataset_file)
    if not data:
        raise ValueError(f"No training records found in {dataset_file!r}")

    # Join prompt and response into a single sequence. The collator below builds
    # the labels from `input_ids`, so a separately tokenized response would be
    # thrown away and the model would only ever be trained on the prompts.
    # NOTE(review): a plain newline separates the two turns; if the base model
    # expects a specific chat template, format the text accordingly — confirm.
    eos = tokenizer.eos_token or ''
    texts = []
    for record in data:
        turns = record['conversations']
        prompt = turns[0]['value']
        response = turns[1]['value']
        texts.append(f"{prompt}\n{response}{eos}")

    encodings = tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    class CustomDataset(torch.utils.data.Dataset):
        """Exposes the tokenized sequences one example at a time."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            # No 'labels' entry: the data collator derives it from input_ids.
            return {key: val[idx] for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings['input_ids'])

    dataset = CustomDataset(encodings)

    # Training arguments. Evaluation and load_best_model_at_end are left out on
    # purpose: no eval_dataset is handed to the Trainer, so there is nothing to
    # evaluate and no metric from which a "best" checkpoint could be chosen.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # small batch to limit memory use
        gradient_accumulation_steps=16,  # accumulate gradients for 16 steps
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=False,  # mixed precision is disabled because training runs on CPU
    )

    # Initialise the Trainer; with mlm=False the collator copies input_ids into
    # labels and masks padding positions out of the loss.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tune the model
    trainer.train()

    # Save the model and tokenizer side by side so they can be reloaded together
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Example usage
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Replace with the Nusantara-7b-Indo-Chat model
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path to the previously generated JSON file
output_dir = '../saved_model/fine-tuned-model'  # Directory where the fine-tuned model will be saved
fine_tune_model(model_name, dataset_file, output_dir)




Step,Training Loss,Validation Loss


hallo


In [2]:
from datasets import load_dataset

# NOTE(review): this import rebinds `load_dataset`, a name the first cell also
# defines as a JSON loader; re-run that cell before calling fine_tune_model again.

# Load the dataset
ds = load_dataset("akmalfairuz/indoprog")

# Export every split to its own CSV file, named after the split
for split, split_data in ds.items():
    csv_filename = f"indoprog_{split}.csv"
    split_data.to_csv(csv_filename, index=False)
    print(f"Dataset {split} berhasil diunduh dan disimpan sebagai {csv_filename}")


Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Dataset train berhasil diunduh dan disimpan sebagai indoprog_train.csv
