

<a href="https://colab.research.google.com/drive/1xl9tb59sO4VI530TBETPROdJp-5AQrn0#scrollTo=JwE3dLFl4oTX" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script 1

In [None]:
# !pip install transformers torch accelerate

# The Bahasalab/Bahasa-4b-chat model is only available in PyTorch format

In [None]:
# Print the installed accelerate version; this also confirms the package imports in this runtime.
import accelerate
print(accelerate.__version__)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer for the chat model.
# NOTE(review): the tokenizer comes from the non-v2 repository while the
# weights below come from the -v2 repository; presumably both share one
# vocabulary -- confirm.
tokenizer = AutoTokenizer.from_pretrained("Bahasalab/Bahasa-4b-chat")

# Causal-LM weights; torch_dtype="auto" leaves the dtype choice to the library.
model = AutoModelForCausalLM.from_pretrained("Bahasalab/Bahasa-4b-chat-v2", torch_dtype="auto")
model = model.to(device)

In [None]:
# Conversation to send: one system instruction followed by one user turn.
messages = [
    dict(role="system", content="Kamu adalah asisten yang membantu seputar isu keuangan"),
    dict(role="user", content="siapa kamu"),
]

# Render the conversation into a single prompt string (not token ids) and
# append the marker that cues the assistant's reply.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Encode the prompt as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt")
model_inputs = model_inputs.to(device)

# Generate up to 512 new tokens, stopping early at the tokenizer's EOS id.
generated_ids = model.generate(
    input_ids=model_inputs["input_ids"],
    attention_mask=model_inputs["attention_mask"],
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
)

# generate() returns prompt + continuation; drop the prompt prefix from each row
# so only the newly produced tokens are decoded.
generated_ids = [
    sequence[prompt_ids.shape[0]:]
    for prompt_ids, sequence in zip(model_inputs["input_ids"], generated_ids)
]

# Turn the remaining ids back into text (special tokens removed) and show the reply.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_scheduler
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# Load dataset
dataset_path = '../data/generative-ai/finansial_dataset.csv'
data = pd.read_csv(dataset_path)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("Bahasalab/Bahasa-4b-chat")


In [None]:
# Dataset Class
class FinancialDataset(Dataset):
    """Prompt/response pairs encoded as single sequences for causal-LM fine-tuning.

    Each item is the prompt followed by the response (and an EOS token when
    the tokenizer defines one), padded or truncated to ``max_length``.
    Labels mirror ``input_ids`` position by position, as a causal LM expects,
    but prompt and padding positions are set to ``IGNORE_INDEX`` so the loss
    is computed only on response tokens.

    Args:
        data: Table with ``'prompt'`` and ``'response'`` columns, indexed
            positionally through ``.iloc``.
        tokenizer: Callable returning a mapping with an ``'input_ids'``
            sequence; ``eos_token_id`` / ``pad_token_id`` are used when set.
        max_length: Fixed length of every returned tensor.
    """

    # Label value that the cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of prompt/response rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode row ``idx`` and return 1-D ``input_ids``, ``attention_mask`` and ``labels``.

        All three tensors have length ``max_length``. If the prompt alone
        fills ``max_length`` the response is truncated away and every label
        is ``IGNORE_INDEX``; choose ``max_length`` large enough to avoid that.
        """
        row = self.data.iloc[idx]

        # Tokenize without padding so the two parts can be joined into one sequence.
        prompt_ids = list(self.tokenizer(row['prompt'])['input_ids'])
        # No special tokens here: the response continues the prompt mid-sequence.
        response_ids = list(
            self.tokenizer(row['response'], add_special_tokens=False)['input_ids']
        )

        # Close the response with EOS so the model learns where to stop.
        eos_id = getattr(self.tokenizer, 'eos_token_id', None)
        if eos_id is not None:
            response_ids.append(eos_id)

        input_ids = (prompt_ids + response_ids)[:self.max_length]
        # Prompt tokens are context only and must not contribute to the loss.
        labels = ([self.IGNORE_INDEX] * len(prompt_ids) + response_ids)[:self.max_length]

        # Right-pad to the fixed length. Padding is masked by position rather
        # than by token id, so a real EOS label survives even when pad == EOS.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0
        pad_len = self.max_length - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * pad_len
        input_ids = input_ids + [pad_id] * pad_len
        labels = labels + [self.IGNORE_INDEX] * pad_len

        # 1-D tensors: the DataLoader's default collation stacks them into
        # (batch, max_length) without a spurious middle dimension.
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

# Training data: wrap the table in the dataset class and iterate it in
# shuffled mini-batches (batch size deliberately reduced to 2).
train_dataset = FinancialDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Detect device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model setup: load the pretrained causal LM and move it to the device.
model = AutoModelForCausalLM.from_pretrained("Bahasalab/Bahasa-4b-chat").to(device)

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the values the transformers implementation used by default, so the
# optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate over every optimizer step, no warmup.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Fine-tuning loop. Every per-step loss is kept so it can be plotted later.
loss_values = []

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    print(f"Epoch {epoch + 1}/{num_epochs}")
    progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")
    for batch in progress_bar:
        # Move every tensor to the device; tensors with more than two
        # dimensions additionally have dimension 1 squeezed out.
        batch = {
            name: tensor.to(device) if tensor.dim() <= 2 else tensor.to(device).squeeze(1)
            for name, tensor in batch.items()
        }

        # Forward pass; the model returns the loss because labels are in the batch.
        outputs = model(**batch)
        loss = outputs.loss

        # Record the scalar loss once and reuse it for the running total.
        loss_values.append(loss.item())
        epoch_loss += loss_values[-1]

        # Backward pass, parameter update, gradient reset, then advance the LR schedule.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

        progress_bar.set_postfix(loss=loss_values[-1])

    # Mean loss over the batches of this epoch.
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_dataloader)}")

In [None]:
# Persist the fine-tuned artefacts side by side in one directory
# (created if it does not exist yet).
model_save_path = './saved_model'
os.makedirs(model_save_path, exist_ok=True)
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

# Loss curve: one point per optimizer step, in training order.
plt.plot(loss_values)
plt.title('Training Loss')
plt.ylabel('Loss')
plt.xlabel('Training Steps')
plt.show()


In [None]:
# Quick smoke test of the fine-tuned model.
def test_model(prompt):
    """Generate up to 50 new tokens for ``prompt`` and return the decoded text.

    The full generated sequence is decoded (special tokens removed), so the
    returned string includes the prompt itself followed by the continuation.
    """
    model.eval()
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=50,
    )
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

test_prompt = "Apa itu finansial?"
print(test_model(test_prompt))

In [None]:
# from google.colab import files
# import os

# # Directory where the model and tokenizer are saved
# model_save_path = './saved_model'

# # Create a zip file of the saved model directory
# os.system(f"zip -r saved_model.zip {model_save_path}")

# # Download the zip file to your local computer
# files.download("saved_model.zip")
