In [None]:
import json
import os

import neptune
import torch
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling

# Menggunakan GPU jika tersedia, fallback ke CPU jika OutOfMemoryError terjadi
def load_model(model_name):
    """Load a causal language model, placing it on the GPU when possible.

    The checkpoint is loaded once. If CUDA is available the model is moved to
    the GPU; when that transfer runs out of memory the model is kept on the
    CPU instead. Hosts without CUDA use the CPU directly.

    Args:
        model_name: Model identifier or local path passed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The loaded model, on ``cuda`` or ``cpu``.

    Raises:
        RuntimeError: Any runtime error other than a GPU out-of-memory error.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Without this check `.to('cuda')` fails outright on CPU-only machines.
    if not torch.cuda.is_available():
        print("Model loaded on CPU")
        return model

    try:
        model = model.to('cuda')
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise
        print("Out of memory error on GPU. Loading model on CPU...")
        # A failed transfer can leave part of the weights on the GPU: move
        # everything back and release the cached allocations.
        model = model.to('cpu')
        torch.cuda.empty_cache()
    else:
        print("Model loaded on GPU")
    return model

# Load dataset dari file JSON
def load_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def fine_tune_model(model_name, dataset_file, output_dir):
    """Fine-tune a causal language model on prompt/response pairs and save it.

    Every record of the JSON dataset must expose the prompt at
    ``record['conversations'][0]['value']`` and the response at
    ``record['conversations'][1]['value']``. Hyperparameters are logged to the
    module-level Neptune ``run``, which is always stopped before returning,
    even when training fails.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        dataset_file: Path of the JSON dataset.
        output_dir: Directory receiving checkpoints, the final model and the
            tokenizer.

    Raises:
        ValueError: If the dataset contains no records.
    """
    class CustomDataset(torch.utils.data.Dataset):
        """Map-style dataset over one batch of tokenizer encodings."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings['input_ids'])

    try:
        # Load model and tokenizer
        model = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # padding='max_length' cannot work without a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        data = load_dataset(dataset_file)
        if not data:
            raise ValueError(f"Dataset is empty: {dataset_file}")

        # Prompt and response are tokenized as ONE sequence.
        # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
        # `input_ids`, so a separately tokenized response would be thrown away
        # and the model would only ever be trained on the prompts.
        # NOTE(review): a plain newline joins the two turns; switch to the
        # tokenizer's chat template if the model expects one.
        eos = tokenizer.eos_token or ""
        texts = [
            f"{conv['conversations'][0]['value']}\n{conv['conversations'][1]['value']}{eos}"
            for conv in data
        ]
        encodings = tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        dataset = CustomDataset(encodings)

        # evaluation_strategy="steps" and load_best_model_at_end need an
        # evaluation set, so hold out roughly 10% (at least one example).
        eval_size = max(1, len(dataset) // 10) if len(dataset) > 1 else 0
        if eval_size:
            train_dataset, eval_dataset = torch.utils.data.random_split(
                dataset,
                [len(dataset) - eval_size, eval_size],
                generator=torch.Generator().manual_seed(42),
            )
        else:
            train_dataset, eval_dataset = dataset, None
        evaluate = eval_dataset is not None

        # Log hyperparameters to Neptune
        params = {
            "model_name": model_name,
            "batch_size": 1,
            "num_train_epochs": 3,
            "learning_rate": 5e-5,
            "gradient_accumulation_steps": 16,
        }
        run["parameters"] = params

        # Define the training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=params["batch_size"],
            gradient_accumulation_steps=params["gradient_accumulation_steps"],
            num_train_epochs=params["num_train_epochs"],
            # Pass the logged value through so Neptune records what is used.
            learning_rate=params["learning_rate"],
            save_steps=500,
            save_total_limit=2,
            evaluation_strategy="steps" if evaluate else "no",
            eval_steps=250,
            logging_dir='./logs',
            report_to="none",
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=evaluate,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        )

        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
    finally:
        # Close the Neptune run even when loading or training raised.
        run.stop()

    print(f"Model fine-tuning selesai dan disimpan di: {output_dir}")

# Inisialisasi Neptune
run = neptune.init_run(
    project="gabrielbatavia/Pintarpath",
    # SECURITY: the API token is read from the environment instead of being
    # hard-coded. The token that was previously committed here is exposed and
    # should be revoked in Neptune.
    api_token=os.getenv("NEPTUNE_API_TOKEN"),
    name="Fine-tuning Nusantara Chat Model",
    tags=["fine-tuning", "NLP", "Transformers"],
    dependencies="infer",
    capture_hardware_metrics=True,
)

# Example usage
model_name = "kalisai/Nusantara-0.8b-Indo-Chat"
dataset_file = '../Dataset/nusantara_dataset/output2.json'
output_dir = '../saved_model/fine-tuned-model'
fine_tune_model(model_name, dataset_file, output_dir)


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/gabrielbatavia/Pintarpath/e/PIN-3


## Kode di GCP

In [None]:
import torch
import neptune
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import json
import os
from sklearn.metrics import accuracy_score
import subprocess

# Fungsi untuk mendownload dataset dari bucket GCS
def download_from_gcs(bucket_path, local_path):
    """Copy an object from Google Cloud Storage to a local path with ``gsutil cp``.

    Args:
        bucket_path: Source ``gs://`` URL.
        local_path: Destination path on the local filesystem.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero
            status.
    """
    # check=True: a failed copy used to be ignored, so the error only showed
    # up later when the missing dataset file was opened.
    subprocess.run(['gsutil', 'cp', bucket_path, local_path], check=True)

# Fungsi untuk mengupload model ke bucket GCS
def upload_to_gcs(local_path, bucket_path):
    """Recursively copy a local path to Google Cloud Storage with ``gsutil cp -r``.

    Args:
        local_path: Local file or directory to upload.
        bucket_path: Destination ``gs://`` URL.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero
            status.
    """
    # check=True: a failed upload must not pass silently, otherwise the
    # fine-tuned model may never reach the bucket without anyone noticing.
    subprocess.run(['gsutil', 'cp', '-r', local_path, bucket_path], check=True)

# Menggunakan GPU jika tersedia, fallback ke CPU jika OutOfMemoryError terjadi
def load_model(model_name):
    """Load a causal language model on the GPU when available, else on the CPU.

    If placing the model raises a ``RuntimeError`` mentioning
    "out of memory", the checkpoint is loaded again and kept on the CPU.
    Any other ``RuntimeError`` is re-raised.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.

    Returns:
        The loaded model.
    """
    try:
        loaded = AutoModelForCausalLM.from_pretrained(model_name)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = loaded.to(device)
        if torch.cuda.is_available():
            print("Model loaded on GPU")
        else:
            print("Model loaded on CPU")
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise e
        print("Out of memory error on GPU. Loading model on CPU...")
        model = AutoModelForCausalLM.from_pretrained(model_name).to('cpu')
    return model

# Load dataset dari file JSON
def load_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Fungsi evaluasi accuracy
def compute_accuracy(pred):
    """Compute next-token accuracy for a causal language model evaluation.

    The logits at position ``i`` predict the token at position ``i + 1``, so
    predictions are compared with the labels shifted by one. Positions whose
    label is ``-100`` (ignored by the loss) are excluded.

    Args:
        pred: Object with ``predictions`` (logits of shape
            ``(batch, seq, vocab)``, or a tuple whose first item is the
            logits) and ``label_ids`` (shape ``(batch, seq)``).

    Returns:
        dict: ``{"accuracy": float}``; ``0.0`` when no position is scored.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    predicted = logits.argmax(-1)[..., :-1]
    targets = pred.label_ids[..., 1:]
    scored = targets != -100

    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}
    correct = int(((predicted == targets) & scored).sum())
    return {"accuracy": correct / total}

def fine_tune_model(model_name, dataset_file, output_dir):
    """Fine-tune a causal language model on prompt/response pairs and save it.

    Every record of the JSON dataset must expose the prompt at
    ``record['conversations'][0]['value']`` and the response at
    ``record['conversations'][1]['value']``. Hyperparameters are logged to the
    module-level Neptune ``run``, which is always stopped before returning,
    even when training fails.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        dataset_file: Path of the JSON dataset.
        output_dir: Directory receiving checkpoints, the final model and the
            tokenizer.

    Raises:
        ValueError: If the dataset contains no records.
    """
    class CustomDataset(torch.utils.data.Dataset):
        """Map-style dataset over one batch of tokenizer encodings."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings['input_ids'])

    try:
        # Load model and tokenizer
        model = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # padding='max_length' cannot work without a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        data = load_dataset(dataset_file)
        if not data:
            raise ValueError(f"Dataset is empty: {dataset_file}")

        # Prompt and response are tokenized as ONE sequence.
        # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
        # `input_ids`, so a separately tokenized response would be thrown away
        # and the model would only ever be trained on the prompts.
        # NOTE(review): a plain newline joins the two turns; switch to the
        # tokenizer's chat template if the model expects one.
        eos = tokenizer.eos_token or ""
        texts = [
            f"{conv['conversations'][0]['value']}\n{conv['conversations'][1]['value']}{eos}"
            for conv in data
        ]
        encodings = tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        dataset = CustomDataset(encodings)

        # evaluation_strategy="steps", load_best_model_at_end and
        # compute_metrics need an evaluation set, so hold out roughly 10%
        # (at least one example).
        eval_size = max(1, len(dataset) // 10) if len(dataset) > 1 else 0
        if eval_size:
            train_dataset, eval_dataset = torch.utils.data.random_split(
                dataset,
                [len(dataset) - eval_size, eval_size],
                generator=torch.Generator().manual_seed(42),
            )
        else:
            train_dataset, eval_dataset = dataset, None
        evaluate = eval_dataset is not None

        # Log hyperparameters to Neptune
        params = {
            "model_name": model_name,
            "batch_size": 1,
            "num_train_epochs": 3,
            "learning_rate": 5e-5,
            "gradient_accumulation_steps": 16,
        }
        run["parameters"] = params

        # Define the training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=params["batch_size"],
            gradient_accumulation_steps=params["gradient_accumulation_steps"],
            num_train_epochs=params["num_train_epochs"],
            # Pass the logged value through so Neptune records what is used.
            learning_rate=params["learning_rate"],
            save_steps=500,
            save_total_limit=2,
            evaluation_strategy="steps" if evaluate else "no",
            eval_steps=250,
            logging_dir='./logs',
            report_to="none",
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=evaluate,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
            compute_metrics=compute_accuracy,
        )

        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
    finally:
        # Close the Neptune run even when loading or training raised.
        run.stop()

    print(f"Model fine-tuning selesai dan disimpan di: {output_dir}")

# Inisialisasi Neptune
run = neptune.init_run(
    name="Fine-tuning Nusantara Chat Model",
    project="gabrielbatavia/Pintarpath",
    # The token comes from the environment so it never lives in the notebook.
    api_token=os.getenv("NEPTUNE_API_TOKEN"),
    tags=["fine-tuning", "NLP", "Transformers"],
    capture_hardware_metrics=True,
    dependencies="infer",
)

# Example usage: GCS locations and their local counterparts.
bucket_dataset_path = 'gs://pintarpath-trying/output2.json'  # dataset in GCS
bucket_output_dir = 'gs://pintarpath-trying/saved_model/fine-tuned-model'  # model destination in GCS
local_dataset_file = './output2.json'  # local copy of the dataset
local_output_dir = './fine-tuned-model'  # local model directory, uploaded afterwards

# 1. Fetch the dataset from GCS.
download_from_gcs(bucket_dataset_path, local_dataset_file)

# 2. Fine-tune the model.
fine_tune_model(
    model_name="kalisai/Nusantara-0.8b-Indo-Chat",
    dataset_file=local_dataset_file,
    output_dir=local_output_dir,
)

# 3. Push the fine-tuned model to GCS.
upload_to_gcs(local_output_dir, bucket_output_dir)


# Improved version (ROUGE/F1 metrics and visualization logging)

In [None]:
import torch
import neptune
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import json
import os
from sklearn.metrics import accuracy_score, f1_score
from rouge_score import rouge_scorer
import subprocess
import matplotlib.pyplot as plt
from neptune.types import File

# Inisialisasi Neptune
run = neptune.init_run(
    name="Fine-tuning Nusantara Chat Model",
    project="gabrielbatavia/Pintarpath",
    # The token comes from the environment so it never lives in the notebook.
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
    tags=["fine-tuning", "NLP", "Transformers"],
    capture_hardware_metrics=True,
    dependencies="infer",
)

# Fungsi untuk mendownload dataset dari bucket GCS
def download_from_gcs(bucket_path, local_path):
    """Copy an object from Google Cloud Storage to a local path with ``gsutil cp``.

    Args:
        bucket_path: Source ``gs://`` URL.
        local_path: Destination path on the local filesystem.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero
            status.
    """
    # check=True: a failed copy used to be ignored, so the error only showed
    # up later when the missing dataset file was opened.
    subprocess.run(['gsutil', 'cp', bucket_path, local_path], check=True)

# Fungsi untuk mengupload file ke bucket GCS
def upload_to_gcs(local_path, bucket_path):
    """Copy a local file to Google Cloud Storage with ``gsutil cp``.

    A failed copy is reported on stdout and signalled through the return
    value instead of an exception, so that an auxiliary upload cannot abort a
    long training run.

    Args:
        local_path: Local file to upload.
        bucket_path: Destination ``gs://`` URL.

    Returns:
        bool: ``True`` when ``gsutil`` exited with status 0, else ``False``.
    """
    result = subprocess.run(['gsutil', 'cp', local_path, bucket_path])
    if result.returncode != 0:
        print(f"Upload to GCS failed (exit code {result.returncode}): {local_path} -> {bucket_path}")
        return False
    return True

# Menggunakan GPU jika tersedia, fallback ke CPU jika OutOfMemoryError terjadi
def load_model(model_name):
    """Load a causal language model on the GPU when available, else on the CPU.

    If placing the model raises a ``RuntimeError`` mentioning
    "out of memory", the checkpoint is loaded again and kept on the CPU.
    Any other ``RuntimeError`` is re-raised.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.

    Returns:
        The loaded model.
    """
    try:
        loaded = AutoModelForCausalLM.from_pretrained(model_name)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = loaded.to(device)
        if torch.cuda.is_available():
            print("Model loaded on GPU")
        else:
            print("Model loaded on CPU")
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise e
        print("Out of memory error on GPU. Loading model on CPU...")
        model = AutoModelForCausalLM.from_pretrained(model_name).to('cpu')
    return model

# Load dataset dari file JSON
def load_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Fungsi evaluasi ROUGE dan F1 Score
def compute_metrics(pred):
    """Compute accuracy, F1 and ROUGE for a causal language model evaluation.

    The logits at position ``i`` predict the token at position ``i + 1``, so
    predictions are compared with the labels shifted by one. Positions whose
    label is ``-100`` (ignored by the loss) are excluded from accuracy/F1 and
    replaced by the pad token before decoding, because ``-100`` is not a
    valid token id.

    Accuracy and weighted F1 are computed per token over the scored
    positions; ROUGE-1/2/L F-measures are averaged over the decoded
    sequences. Every metric is also appended to the module-level Neptune
    ``run``. Decoding uses the module-level ``tokenizer``.

    Args:
        pred: Object with ``predictions`` (logits of shape
            ``(batch, seq, vocab)``, or a tuple whose first item is the
            logits) and ``label_ids`` (shape ``(batch, seq)``).

    Returns:
        dict: ``accuracy``, ``f1``, ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = logits.argmax(-1)[:, :-1]
    labels = pred.label_ids[:, 1:]
    scored = labels != -100

    # Token-level accuracy and F1 over the scored positions only
    flat_labels = labels[scored]
    flat_preds = predictions[scored]
    if flat_labels.size:
        accuracy = float(accuracy_score(flat_labels, flat_preds))
        f1 = float(f1_score(flat_labels, flat_preds, average='weighted', zero_division=0))
    else:
        accuracy = f1 = 0.0

    # Decode predictions and labels into text; ignored positions become pad
    # tokens so that skip_special_tokens drops them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
    text_preds = predictions.copy()
    text_preds[~scored] = pad_id
    text_labels = labels.copy()
    text_labels[~scored] = pad_id
    decoded_preds = tokenizer.batch_decode(text_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(text_labels, skip_special_tokens=True)

    # Mean ROUGE F-measure per sequence
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    totals = dict.fromkeys(rouge_types, 0.0)
    for reference, candidate in zip(decoded_labels, decoded_preds):
        scores = scorer.score(reference, candidate)
        for key in rouge_types:
            totals[key] += scores[key].fmeasure
    n = len(decoded_preds)
    rouge_scores = {key: (total / n if n else 0.0) for key, total in totals.items()}

    metrics = {
        "accuracy": accuracy,
        "f1": f1,
        "rouge1": rouge_scores['rouge1'],
        "rouge2": rouge_scores['rouge2'],
        "rougeL": rouge_scores['rougeL'],
    }

    # Log the metrics to Neptune explicitly
    for metric_name, value in metrics.items():
        run[f"metrics/{metric_name}"].append(value)

    return metrics

# Modifikasi fungsi fine_tune_model untuk menyertakan tokenizer sebagai global variable
def fine_tune_model(model_name, dataset_file, output_dir):
    """Fine-tune a causal language model on prompt/response pairs and save it.

    Every record of the JSON dataset must expose the prompt at
    ``record['conversations'][0]['value']`` and the response at
    ``record['conversations'][1]['value']``. The tokenizer is published as the
    module-level ``tokenizer`` because ``compute_metrics`` decodes with it.
    Hyperparameters and an example histogram are logged to the module-level
    Neptune ``run``, which is always stopped before returning, even when
    training fails.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        dataset_file: Path of the JSON dataset.
        output_dir: Directory receiving checkpoints, the final model and the
            tokenizer.

    Raises:
        ValueError: If the dataset contains no records.
    """
    global tokenizer

    class CustomDataset(torch.utils.data.Dataset):
        """Map-style dataset over one batch of tokenizer encodings."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings['input_ids'])

    try:
        # Load model and tokenizer
        model = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # padding='max_length' cannot work without a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        data = load_dataset(dataset_file)
        if not data:
            raise ValueError(f"Dataset is empty: {dataset_file}")

        # Prompt and response are tokenized as ONE sequence.
        # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
        # `input_ids`, so a separately tokenized response would be thrown away
        # and the model would only ever be trained on the prompts.
        # NOTE(review): a plain newline joins the two turns; switch to the
        # tokenizer's chat template if the model expects one.
        eos = tokenizer.eos_token or ""
        texts = [
            f"{conv['conversations'][0]['value']}\n{conv['conversations'][1]['value']}{eos}"
            for conv in data
        ]
        encodings = tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        dataset = CustomDataset(encodings)

        # evaluation_strategy="steps", load_best_model_at_end and
        # compute_metrics need an evaluation set, so hold out roughly 10%
        # (at least one example).
        eval_size = max(1, len(dataset) // 10) if len(dataset) > 1 else 0
        if eval_size:
            train_dataset, eval_dataset = torch.utils.data.random_split(
                dataset,
                [len(dataset) - eval_size, eval_size],
                generator=torch.Generator().manual_seed(42),
            )
        else:
            train_dataset, eval_dataset = dataset, None
        evaluate = eval_dataset is not None

        # Log hyperparameters to Neptune
        params = {
            "model_name": model_name,
            "batch_size": 8,
            "num_train_epochs": 100,
            "learning_rate": 0.005,
            "gradient_accumulation_steps": 16,
        }
        run["parameters"] = params

        # Define the training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=params["batch_size"],
            gradient_accumulation_steps=params["gradient_accumulation_steps"],
            num_train_epochs=params["num_train_epochs"],
            # Pass the logged value through so Neptune records what is used.
            # NOTE(review): 0.005 is unusually high for fine-tuning; it was
            # previously logged but never applied. Confirm the intended value.
            learning_rate=params["learning_rate"],
            save_steps=500,
            save_total_limit=2,
            evaluation_strategy="steps" if evaluate else "no",
            eval_steps=250,
            logging_dir='./logs',
            report_to="none",
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=evaluate,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
            compute_metrics=compute_metrics,
        )

        # Run training
        trainer.train()

        # Save model and tokenizer first, so that a failure in the auxiliary
        # visualization/upload steps cannot lose the trained weights.
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # Example of logging a visualization to Neptune and GCS
        figure = plt.figure(figsize=(10, 6))
        plt.hist([1, 2, 1], bins=3)  # placeholder histogram, replace with a real visualization
        plt.title("Contoh Visualisasi Histogram")

        # Save the image locally and release the figure
        local_image_path = "histogram.png"
        plt.savefig(local_image_path)
        plt.close(figure)

        # Log the image to Neptune
        run["train/distribution"].upload(local_image_path)

        # Upload the image to GCS
        bucket_image_path = 'gs://pintarpath-trying/visualizations/histogram.png'
        upload_to_gcs(local_image_path, bucket_image_path)
    finally:
        # Close the Neptune run even when loading or training raised.
        run.stop()

    print(f"Model fine-tuning selesai dan disimpan di: {output_dir}")

# Contoh penggunaan
# GCS source of the dataset and the local working paths.
bucket_dataset_path = 'gs://pintarpath-trying/output2.json'  # dataset in GCS
local_output_dir = './fine-tuned-model'  # local directory for the fine-tuned model
local_dataset_file = './output2.json'  # local copy of the dataset
# bucket_output_dir = 'gs://pintarpath-trying/saved_model/fine-tuned-model'  # model destination in GCS

# 1. Fetch the dataset from GCS.
download_from_gcs(bucket_dataset_path, local_dataset_file)

# 2. Fine-tune the model.
fine_tune_model(
    model_name="kalisai/Nusantara-0.8b-Indo-Chat",
    dataset_file=local_dataset_file,
    output_dir=local_output_dir,
)

# 3. Model upload to GCS is disabled to save storage.
# upload_to_gcs(local_output_dir, bucket_output_dir)


In [None]:
import torch
import neptune
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import json
import os
from sklearn.metrics import accuracy_score, f1_score
from rouge_score import rouge_scorer
import subprocess
import matplotlib.pyplot as plt

# Inisialisasi Neptune
run = neptune.init_run(
    project="gabrielbatavia/Pintarpath",
    # An empty string is not a usable token. Read it from the environment, as
    # the other cells do, so the secret never lives in the notebook.
    api_token=os.getenv("NEPTUNE_API_TOKEN"),
    name="Fine-tuning Nusantara Chat Model",
    tags=["fine-tuning", "NLP", "Transformers"],
    dependencies="infer",
    capture_hardware_metrics=True,
)

# Fungsi untuk mendownload dataset dari bucket GCS
def download_from_gcs(bucket_path, local_path):
    """Copy an object from Google Cloud Storage to a local path with ``gsutil cp``.

    Args:
        bucket_path: Source ``gs://`` URL.
        local_path: Destination path on the local filesystem.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero
            status.
    """
    # check=True: a failed copy used to be ignored, so the error only showed
    # up later when the missing dataset file was opened.
    subprocess.run(['gsutil', 'cp', bucket_path, local_path], check=True)

# Fungsi untuk mengupload file ke bucket GCS
def upload_to_gcs(local_path, bucket_path):
    """Copy a local file to Google Cloud Storage with ``gsutil cp``.

    A failed copy is reported on stdout and signalled through the return
    value instead of an exception, so that an auxiliary upload cannot abort a
    long training run.

    Args:
        local_path: Local file to upload.
        bucket_path: Destination ``gs://`` URL.

    Returns:
        bool: ``True`` when ``gsutil`` exited with status 0, else ``False``.
    """
    result = subprocess.run(['gsutil', 'cp', local_path, bucket_path])
    if result.returncode != 0:
        print(f"Upload to GCS failed (exit code {result.returncode}): {local_path} -> {bucket_path}")
        return False
    return True

# Menggunakan GPU jika tersedia, fallback ke CPU jika OutOfMemoryError terjadi
def load_model(model_name):
    """Load a causal language model on the GPU when available, else on the CPU.

    If placing the model raises a ``RuntimeError`` mentioning
    "out of memory", the checkpoint is loaded again and kept on the CPU.
    Any other ``RuntimeError`` is re-raised.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.

    Returns:
        The loaded model.
    """
    try:
        loaded = AutoModelForCausalLM.from_pretrained(model_name)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = loaded.to(device)
        if torch.cuda.is_available():
            print("Model loaded on GPU")
        else:
            print("Model loaded on CPU")
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise e
        print("Out of memory error on GPU. Loading model on CPU...")
        model = AutoModelForCausalLM.from_pretrained(model_name).to('cpu')
    return model

# Load dataset dari file JSON
def load_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Fungsi evaluasi ROUGE dan F1 Score
def compute_metrics(pred):
    """Compute accuracy, F1 and ROUGE for a causal language model evaluation.

    The logits at position ``i`` predict the token at position ``i + 1``, so
    predictions are compared with the labels shifted by one. Positions whose
    label is ``-100`` (ignored by the loss) are excluded from accuracy/F1 and
    replaced by the pad token before decoding, because ``-100`` is not a
    valid token id.

    Accuracy and weighted F1 are computed per token over the scored
    positions; ROUGE-1/2/L F-measures are averaged over the decoded
    sequences. Every metric is also appended to the module-level Neptune
    ``run``. Decoding uses the module-level ``tokenizer``.

    Args:
        pred: Object with ``predictions`` (logits of shape
            ``(batch, seq, vocab)``, or a tuple whose first item is the
            logits) and ``label_ids`` (shape ``(batch, seq)``).

    Returns:
        dict: ``accuracy``, ``f1``, ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = logits.argmax(-1)[:, :-1]
    labels = pred.label_ids[:, 1:]
    scored = labels != -100

    # Token-level accuracy and F1 over the scored positions only
    flat_labels = labels[scored]
    flat_preds = predictions[scored]
    if flat_labels.size:
        accuracy = float(accuracy_score(flat_labels, flat_preds))
        f1 = float(f1_score(flat_labels, flat_preds, average='weighted', zero_division=0))
    else:
        accuracy = f1 = 0.0

    # Decode predictions and labels into text; ignored positions become pad
    # tokens so that skip_special_tokens drops them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
    text_preds = predictions.copy()
    text_preds[~scored] = pad_id
    text_labels = labels.copy()
    text_labels[~scored] = pad_id
    decoded_preds = tokenizer.batch_decode(text_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(text_labels, skip_special_tokens=True)

    # Mean ROUGE F-measure per sequence
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    totals = dict.fromkeys(rouge_types, 0.0)
    for reference, candidate in zip(decoded_labels, decoded_preds):
        scores = scorer.score(reference, candidate)
        for key in rouge_types:
            totals[key] += scores[key].fmeasure
    n = len(decoded_preds)
    rouge_scores = {key: (total / n if n else 0.0) for key, total in totals.items()}

    metrics = {
        "accuracy": accuracy,
        "f1": f1,
        "rouge1": rouge_scores['rouge1'],
        "rouge2": rouge_scores['rouge2'],
        "rougeL": rouge_scores['rougeL'],
    }

    # Log the metrics to Neptune explicitly
    for metric_name, value in metrics.items():
        run[f"metrics/{metric_name}"].append(value)

    return metrics

# Modifikasi fungsi fine_tune_model untuk menyertakan tokenizer sebagai global variable
def fine_tune_model(model_name, dataset_file, output_dir):
    """Fine-tune a causal language model on prompt/response pairs and save it.

    Every record of the JSON dataset must expose the prompt at
    ``record['conversations'][0]['value']`` and the response at
    ``record['conversations'][1]['value']``. The tokenizer is published as the
    module-level ``tokenizer`` because ``compute_metrics`` decodes with it.
    Hyperparameters and an example histogram are logged to the module-level
    Neptune ``run``, which is always stopped before returning, even when
    training fails.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        dataset_file: Path of the JSON dataset.
        output_dir: Directory receiving checkpoints, the final model and the
            tokenizer.

    Raises:
        ValueError: If the dataset contains no records.
    """
    global tokenizer

    class CustomDataset(torch.utils.data.Dataset):
        """Map-style dataset over one batch of tokenizer encodings."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}

        def __len__(self):
            return len(self.encodings['input_ids'])

    try:
        # Load model and tokenizer
        model = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # padding='max_length' cannot work without a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        data = load_dataset(dataset_file)
        if not data:
            raise ValueError(f"Dataset is empty: {dataset_file}")

        # Prompt and response are tokenized as ONE sequence.
        # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
        # `input_ids`, so a separately tokenized response would be thrown away
        # and the model would only ever be trained on the prompts.
        # NOTE(review): a plain newline joins the two turns; switch to the
        # tokenizer's chat template if the model expects one.
        eos = tokenizer.eos_token or ""
        texts = [
            f"{conv['conversations'][0]['value']}\n{conv['conversations'][1]['value']}{eos}"
            for conv in data
        ]
        encodings = tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        dataset = CustomDataset(encodings)

        # evaluation_strategy="steps", load_best_model_at_end and
        # compute_metrics need an evaluation set, so hold out roughly 10%
        # (at least one example).
        eval_size = max(1, len(dataset) // 10) if len(dataset) > 1 else 0
        if eval_size:
            train_dataset, eval_dataset = torch.utils.data.random_split(
                dataset,
                [len(dataset) - eval_size, eval_size],
                generator=torch.Generator().manual_seed(42),
            )
        else:
            train_dataset, eval_dataset = dataset, None
        evaluate = eval_dataset is not None

        # Log hyperparameters to Neptune
        params = {
            "model_name": model_name,
            "batch_size": 12,
            "num_train_epochs": 20,
            "learning_rate": 0.005,
            "gradient_accumulation_steps": 16,
        }
        run["parameters"] = params

        # Define the training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=params["batch_size"],
            gradient_accumulation_steps=params["gradient_accumulation_steps"],
            num_train_epochs=params["num_train_epochs"],
            # Pass the logged value through so Neptune records what is used.
            # NOTE(review): 0.005 is unusually high for fine-tuning; it was
            # previously logged but never applied. Confirm the intended value.
            learning_rate=params["learning_rate"],
            save_steps=500,
            save_total_limit=2,
            evaluation_strategy="steps" if evaluate else "no",
            eval_steps=250,
            logging_dir='./logs',
            report_to="none",
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=evaluate,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
            compute_metrics=compute_metrics,
        )

        # Run training
        trainer.train()

        # Save model and tokenizer first, so that a failure in the auxiliary
        # visualization/upload steps cannot lose the trained weights.
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # Example of logging a visualization to Neptune and GCS
        figure = plt.figure(figsize=(10, 6))
        plt.hist([1, 2, 1], bins=3)  # placeholder histogram, replace with a real visualization
        plt.title("Contoh Visualisasi Histogram")

        # Save the image locally and release the figure
        local_image_path = "histogram.png"
        plt.savefig(local_image_path)
        plt.close(figure)

        # Log the image to Neptune
        run["train/distribution"].upload(local_image_path)

        # Upload the image to GCS
        bucket_image_path = 'gs://pintarpath-trying/visualizations/histogram.png'
        upload_to_gcs(local_image_path, bucket_image_path)
    finally:
        # Close the Neptune run even when loading or training raised.
        run.stop()

    print(f"Model fine-tuning selesai dan disimpan di: {output_dir}")

# Contoh penggunaan
# GCS source of the dataset and the local working paths.
bucket_dataset_path = 'gs://pintarpath-trying/output2.json'  # dataset in GCS
local_output_dir = './fine-tuned-model'  # local directory for the fine-tuned model
local_dataset_file = './output2.json'  # local copy of the dataset

# 1. Fetch the dataset from GCS.
download_from_gcs(bucket_dataset_path, local_dataset_file)

# 2. Fine-tune the model.
fine_tune_model(
    model_name="kalisai/Nusantara-0.8b-Indo-Chat",
    dataset_file=local_dataset_file,
    output_dir=local_output_dir,
)

