In [2]:
# Mengecek ketersediaan GPU di TensorFlow
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
gpu_available_tf = len(tf.config.list_physical_devices('GPU')) > 0
print("GPU is", "available" if gpu_available_tf else "NOT available", "in TensorFlow")

# Mengecek ketersediaan GPU di PyTorch
import torch

print("\nPyTorch version:", torch.__version__)
gpu_available_torch = torch.cuda.is_available()
print("GPU is", "available" if gpu_available_torch else "NOT available", "in PyTorch")

# Mengecek ketersediaan GPU di Transformers
from transformers import AutoModel

try:
    model = AutoModel.from_pretrained('bert-base-uncased')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("\nTransformers are using", "GPU" if device.type == "cuda" else "CPU")
except ImportError:
    print("\nTransformers library is not installed. Please install it to check GPU availability.")


TensorFlow version: 2.12.0
GPU is NOT available in TensorFlow

PyTorch version: 2.3.0
GPU is available in PyTorch


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]




Transformers are using GPU


In [1]:
import os
import json

def convert_to_finetune_format(folder_path, output_file):
    """
    Mengonversi file teks dalam folder menjadi format fine-tuning yang diperlukan untuk model.

    Parameters:
    folder_path (str): Path folder yang berisi file teks.
    output_file (str): Path file JSON output.
    """
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                for line in lines:
                    if line.strip():  # Lewati baris kosong
                        conversation = [
                            {"from": "human", "value": line.strip()},
                            {"from": "gpt", "value": ""}  # Placeholder untuk respons GPT
                        ]
                        data.append({
                            "id": filename.split('.')[0],
                            "conversations": conversation
                        })

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Penggunaan contoh
folder_path = '../Dataset/nlp_dataset'  # Ganti ini dengan path folder yang benar
output_file = '../Dataset/nusantara_dataset/output.json'  # Ganti ini dengan file JSON output yang diinginkan
convert_to_finetune_format(folder_path, output_file)


In [1]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model Nusantara-7b-Indo-Chat
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 GiB. GPU 

In [None]:

import matplotlib.pyplot as plt
from transformers import TrainerCallback


class PlotLossesCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        logs = logs.copy()
        loss = logs.get('loss')
        if loss is not None:
            state.log_history.append({'step': state.global_step, 'loss': loss})

    def plot_loss(self, state):
        steps = [log['step'] for log in state.log_history]
        losses = [log['loss'] for log in state.log_history]

        plt.plot(steps, losses)
        plt.xlabel('Steps')
        plt.ylabel('Loss')
        plt.title('Training Loss Over Time')
        plt.show()

# Tambahkan callback ini ke Trainer
plot_loss_callback = PlotLossesCallback()
trainer.add_callback(plot_loss_callback)

# Setelah training selesai, panggil fungsi plot_loss untuk visualisasi
plot_loss_callback.plot_loss(trainer.state)


In [1]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"  # Mengatur pemetaan otomatis perangkat, misalnya ke GPU jika tersedia
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,  # Sesuaikan batch size agar cocok dengan ukuran model dan memori GPU
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Gunakan mixed precision training jika menggunakan GPU untuk mempercepat dan mengurangi memori
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-1.8B-Indo-Chat"  # Ganti dengan model Nusantara-1.8B-Indo-Chat
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


  from .autonotebook import tqdm as notebook_tqdm
Downloading shards:  50%|█████     | 1/2 [16:52<16:52, 1012.95s/it]Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/cb/bb/cbbb461b8ee31c89150d67344db4ab1e1ddc009a21adecf328c38c40481a02d7/0964f7742a2ba55040f62f864d5073a3274c8db213ac79cd3db37f9d721ee9a5?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00002-of-00002.safetensors%3B+filename%3D%22model-00002-of-00002.safetensors%22%3B&Expires=1723718527&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMzcxODUyN319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2NiL2JiL2NiYmI0NjFiOGVlMzFjODkxNTBkNjczNDRkYjRhYjFlMWRkYzAwOWEyMWFkZWNmMzI4YzM4YzQwNDgxYTAyZDcvMDk2NGY3NzQyYTJiYTU1MDQwZjYyZjg2NGQ1MDczYTMyNzRjOGRiMjEzYWM3OWNkM2RiMzdmOWQ3MjFlZTlhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=WZPdcDJLN0awyRASkzVdSdWHAjVbxCRrSsxJpKQ96KybFjyy5BdqJ6vE%7Eo%7EldKpOxOoREdzs5XEkrpuWiNk8XVcq

ValueError: Tried to use `fp16` but it is not supported on cpu

In [3]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json
import matplotlib.pyplot as plt
from transformers import TrainerCallback

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length')

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,  # Ukuran batch yang lebih kecil
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Mixed precision training untuk menghemat memori
        gradient_accumulation_steps=2,  # Akumulasi gradien untuk efektifitas batch size
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Tambahkan callback untuk memantau loss
    plot_loss_callback = PlotLossesCallback()
    trainer.add_callback(plot_loss_callback)

    # Fine-tuning model
    trainer.train()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Visualisasi loss
    plot_loss_callback.plot_loss(trainer.state)


class PlotLossesCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        logs = logs.copy()
        loss = logs.get('loss')
        if loss is not None:
            state.log_history.append({'step': state.global_step, 'loss': loss})

    def plot_loss(self, state):
        steps = [log['step'] for log in state.log_history]
        losses = [log['loss'] for log in state.log_history]

        plt.plot(steps, losses)
        plt.xlabel('Steps')
        plt.ylabel('Loss')
        plt.title('Training Loss Over Time')
        plt.show()

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model yang kamu inginkan
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 

In [4]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json
import gc

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=128)
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=128)

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,  # Ubah dari 8 menjadi 2
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Aktifkan mixed precision training
        gradient_accumulation_steps=8,  # Mengakumulasi gradien dari beberapa batch sebelum memperbarui model
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Pembersihan memori
    gc.collect()
    torch.cuda.empty_cache()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model Nusantara-7b-Indo-Chat jika diperlukan
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [5]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json
import gc

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Ubah dari 2 menjadi 1 jika diperlukan
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Aktifkan mixed precision training
        gradient_accumulation_steps=8,  # Mengakumulasi gradien dari beberapa batch sebelum memperbarui model
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Pembersihan memori
    gc.collect()
    torch.cuda.empty_cache()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model Nusantara-7b-Indo-Chat jika diperlukan
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Ukuran batch yang lebih kecil
        gradient_accumulation_steps=16,  # Accumulate gradients for 16 steps
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Mixed precision training
        load_best_model_at_end=True,
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model Nusantara-7b-Indo-Chat
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)


ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: IntervalStrategy.NO
- Save strategy: IntervalStrategy.STEPS

In [7]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
import json

# Load dataset
def load_dataset(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Fungsi untuk fine-tune model
def fine_tune_model(model_name, dataset_file, output_dir):
    # Load model dan tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load dataset
    data = load_dataset(dataset_file)

    # Konversi data ke format yang dibutuhkan oleh model
    inputs = tokenizer([conv['conversations'][0]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)
    labels = tokenizer([conv['conversations'][1]['value'] for conv in data], return_tensors='pt', truncation=True, padding='max_length', max_length=64)

    # Membuat dataset
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels['input_ids'][idx]
            return item

        def __len__(self):
            return len(self.labels['input_ids'])

    dataset = CustomDataset(inputs, labels)

    # Mendefinisikan argumen training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Ukuran batch yang lebih kecil
        gradient_accumulation_steps=16,  # Accumulate gradients for 16 steps
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        evaluation_strategy="steps",  # Evaluasi dilakukan setiap beberapa langkah
        eval_steps=5_000,  # Jumlah langkah antara evaluasi
        logging_dir='./logs',
        report_to="none",
        fp16=True,  # Mixed precision training
        load_best_model_at_end=True,
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Fine-tuning model
    trainer.train()

    # Menyimpan model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Penggunaan contoh
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"  # Ganti dengan model Nusantara-7b-Indo-Chat
dataset_file = '../Dataset/nusantara_dataset/output2.json'  # Path ke file JSON yang dihasilkan sebelumnya
output_dir = '../saved_model/fine-tuned-model'  # Directory di mana model yang telah di fine-tune akan disimpan
fine_tune_model(model_name, dataset_file, output_dir)




RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
