# Script 1

In [None]:
# !pip install transformers tensorflow matplotlib pandas

# Note: the Bahasalab/Bahasa-4b-chat model is likely available only in PyTorch format

In [1]:
# 1. Setup Environment
# !pip install torch transformers pandas matplotlib

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_scheduler
import matplotlib.pyplot as plt

# Load dataset
# The CSV path is relative to the working directory the notebook runs in.
# The dataset class defined below reads the 'context', 'question' and
# 'answer' columns of this frame.
dataset_path = '../data/generative-ai/final_dataset.csv'
data = pd.read_csv(filepath_or_buffer=dataset_path)

# 2. Preprocess Data
# Tokenizer for the same checkpoint name that the model-setup step loads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Bahasalab/Bahasa-4b-chat",
)

class FinancialDataset(Dataset):
    """Causal-LM fine-tuning samples built from a context/question/answer table.

    Every item encodes ONE sequence, ``prompt + " " + answer + EOS`` (where
    ``prompt`` is ``context + " " + question``), padded on the right to
    ``max_length``.  ``labels`` mirrors ``input_ids`` but holds
    ``IGNORE_INDEX`` on prompt and padding positions so that only the answer
    tokens contribute to the loss.

    The previous implementation tokenized the answer on its own and used it as
    ``labels``; those ids were not aligned with ``input_ids`` and padding was
    scored as a real target.

    Args:
        data: DataFrame with string columns 'context', 'question', 'answer'.
        tokenizer: Callable returning a mapping with an 'input_ids' list when
            called without ``return_tensors``; ``pad_token_id`` and
            ``eos_token_id`` attributes are used when present.
        max_length: Fixed sequence length of every returned tensor.
    """

    # Label value for positions that must not be scored.
    # NOTE(review): -100 is the ignore index documented for Hugging Face
    # causal-LM losses -- confirm for the checkpoint in use.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the training sample for row ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels', each a
            ``torch.long`` tensor of shape ``(1, max_length)`` (the leading
            axis matches what the tokenizer produced with
            ``return_tensors="pt"`` before this change).
        """
        row = self.data.iloc[idx]
        prompt = row['context'] + " " + row['question']
        answer = row['answer']

        prompt_ids = list(self.tokenizer(prompt)['input_ids'])
        # The answer continues the prompt: keep a separating space and do not
        # insert another set of special tokens in the middle of the sequence.
        answer_ids = list(self.tokenizer(" " + answer, add_special_tokens=False)['input_ids'])

        eos_id = getattr(self.tokenizer, 'eos_token_id', None)
        if eos_id is not None:
            # Reserve one slot so the model also learns where to stop.
            answer_ids = answer_ids[:self.max_length - 1] + [eos_id]
        else:
            answer_ids = answer_ids[:self.max_length]

        # If the pair is too long, drop the OLDEST prompt tokens so that the
        # supervised answer is never truncated away entirely.
        room = self.max_length - len(answer_ids)
        prompt_ids = prompt_ids[-room:] if room > 0 else []

        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0

        n_pad = self.max_length - len(prompt_ids) - len(answer_ids)
        input_ids = prompt_ids + answer_ids + [pad_id] * n_pad
        attention_mask = [1] * (len(prompt_ids) + len(answer_ids)) + [0] * n_pad
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + answer_ids
            + [self.IGNORE_INDEX] * n_pad
        )

        return {
            'input_ids': torch.tensor([input_ids], dtype=torch.long),
            'attention_mask': torch.tensor([attention_mask], dtype=torch.long),
            'labels': torch.tensor([labels], dtype=torch.long),
        }

# Wrap the dataframe in the dataset class and batch it for training.
train_dataset = FinancialDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 3. Fine-tune Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("Bahasalab/Bahasa-4b-chat").to(device)

# NOTE(review): `AdamW` is the name imported from transformers at the top of
# this cell; the later PyTorch cell uses torch.optim.AdamW -- consider aligning.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate over the whole run, no warm-up.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# One loss value per optimizer step; plotted after training.
loss_values = []

model.train()
for epoch in range(num_epochs):
    first_step_of_epoch = len(loss_values)
    for batch in train_dataloader:
        # Dataset items carry a leading axis of size 1, so collated tensors
        # are (batch, 1, seq). Drop ONLY that axis: a bare .squeeze() would
        # also remove the batch axis whenever the final batch has one sample.
        batch = {
            k: (v.squeeze(1) if v.dim() == 3 else v).to(device)
            for k, v in batch.items()
        }
        outputs = model(**batch)
        loss = outputs.loss
        loss_values.append(loss.item())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Plain-text progress report. The earlier code called .update(1) on a
    # `range` object, which raised AttributeError on the very first step.
    epoch_losses = loss_values[first_step_of_epoch:]
    if epoch_losses:
        mean_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"Epoch {epoch + 1}/{num_epochs} - "
              f"step {len(loss_values)}/{num_training_steps} - "
              f"mean loss {mean_loss:.4f}")

# 4. Plot the per-step training loss
plt.plot(loss_values)
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

# 5. Test Model
def test_model(prompt):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped in a fixed Indonesian system message plus the user
    turn, rendered with the tokenizer's chat template, and passed to
    ``model.generate`` with at most 50 new tokens.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The decoded reply only. The prompt tokens that ``generate`` returns in
        front of the continuation are stripped before decoding (previously the
        rendered system/user text was echoed back in the result).
    """
    # Relies on the module-level `model`, `tokenizer` and `device`.
    model.eval()
    messages = [
        {"role": "system", "content": "Kamu adalah asisten yang membantu seputar isu keuangan"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=50)
    # Keep only what was generated after the prompt.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    return response

# Smoke test: send one finance question through test_model and show the reply.
test_prompt = "Apa itu ROI?"
reply_text = test_model(test_prompt)
print(reply_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



: 

In [1]:
# Required Libraries
# !pip install torch transformers pandas matplotlib

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
import matplotlib.pyplot as plt

# Load dataset
# The CSV path is relative to the working directory the notebook runs in.
# The dataset class defined below reads the 'context', 'question' and
# 'answer' columns of this frame.
dataset_path = '../data/generative-ai/final_dataset.csv'
data = pd.read_csv(filepath_or_buffer=dataset_path)

# Tokenizer
# force_download=False is passed explicitly, as in the model-setup step below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Bahasalab/Bahasa-4b-chat",
    force_download=False,
)

# Dataset Class
class FinancialDataset(Dataset):
    """Causal-LM fine-tuning samples built from a context/question/answer table.

    Every item encodes ONE sequence, ``prompt + " " + answer + EOS`` (where
    ``prompt`` is ``context + " " + question``), padded on the right to
    ``max_length``.  ``labels`` mirrors ``input_ids`` but holds
    ``IGNORE_INDEX`` on prompt and padding positions so that only the answer
    tokens contribute to the loss.

    The previous implementation tokenized the answer on its own and used it as
    ``labels``; those ids were not aligned with ``input_ids`` and padding was
    scored as a real target.

    Args:
        data: DataFrame with string columns 'context', 'question', 'answer'.
        tokenizer: Callable returning a mapping with an 'input_ids' list when
            called without ``return_tensors``; ``pad_token_id`` and
            ``eos_token_id`` attributes are used when present.
        max_length: Fixed sequence length of every returned tensor.
    """

    # Label value for positions that must not be scored.
    # NOTE(review): -100 is the ignore index documented for Hugging Face
    # causal-LM losses -- confirm for the checkpoint in use.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the training sample for row ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels', each a
            ``torch.long`` tensor of shape ``(1, max_length)`` (the leading
            axis matches what the tokenizer produced with
            ``return_tensors="pt"`` before this change).
        """
        row = self.data.iloc[idx]
        prompt = row['context'] + " " + row['question']
        answer = row['answer']

        prompt_ids = list(self.tokenizer(prompt)['input_ids'])
        # The answer continues the prompt: keep a separating space and do not
        # insert another set of special tokens in the middle of the sequence.
        answer_ids = list(self.tokenizer(" " + answer, add_special_tokens=False)['input_ids'])

        eos_id = getattr(self.tokenizer, 'eos_token_id', None)
        if eos_id is not None:
            # Reserve one slot so the model also learns where to stop.
            answer_ids = answer_ids[:self.max_length - 1] + [eos_id]
        else:
            answer_ids = answer_ids[:self.max_length]

        # If the pair is too long, drop the OLDEST prompt tokens so that the
        # supervised answer is never truncated away entirely.
        room = self.max_length - len(answer_ids)
        prompt_ids = prompt_ids[-room:] if room > 0 else []

        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0

        n_pad = self.max_length - len(prompt_ids) - len(answer_ids)
        input_ids = prompt_ids + answer_ids + [pad_id] * n_pad
        attention_mask = [1] * (len(prompt_ids) + len(answer_ids)) + [0] * n_pad
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + answer_ids
            + [self.IGNORE_INDEX] * n_pad
        )

        return {
            'input_ids': torch.tensor([input_ids], dtype=torch.long),
            'attention_mask': torch.tensor([attention_mask], dtype=torch.long),
            'labels': torch.tensor([labels], dtype=torch.long),
        }

# Reduce Batch Size
train_dataset = FinancialDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Model Setup
# This cell pins the run to the CPU regardless of GPU availability.
device = torch.device("cpu")
model = AutoModelForCausalLM.from_pretrained("Bahasalab/Bahasa-4b-chat", force_download=False).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate over the whole run, no warm-up.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Training Loop
# One loss value per optimizer step; plotted after training.
loss_values = []

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Dataset items carry a leading axis of size 1, so collated tensors
        # are (batch, 1, seq). Drop ONLY that axis: a bare .squeeze() would
        # also remove the batch axis whenever the final batch has one sample.
        batch = {
            k: (v.squeeze(1) if v.dim() == 3 else v).to(device)
            for k, v in batch.items()
        }
        outputs = model(**batch)
        loss = outputs.loss
        loss_values.append(loss.item())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

# Plotting
plt.plot(loss_values)
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

# Test Model
def test_model(prompt):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped in a fixed Indonesian system message plus the user
    turn, rendered with the tokenizer's chat template, and passed to
    ``model.generate`` with at most 50 new tokens.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The decoded reply only. The prompt tokens that ``generate`` returns in
        front of the continuation are stripped before decoding (previously the
        rendered system/user text was echoed back in the result).
    """
    # Relies on the module-level `model`, `tokenizer` and `device`.
    model.eval()
    messages = [
        {"role": "system", "content": "Kamu adalah asisten yang membantu seputar isu keuangan"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=50)
    # Keep only what was generated after the prompt.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    return response

# Smoke test: send one finance question through test_model and show the reply.
test_prompt = "Apa itu ROI?"
reply_text = test_model(test_prompt)
print(reply_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



: 

# TensorFlow Support

In [3]:
# Required Libraries
# !pip install tensorflow transformers pandas matplotlib

import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFBertLMHeadModel, create_optimizer
import matplotlib.pyplot as plt

# Load dataset
# The CSV path is relative to the working directory the notebook runs in.
# The dataset factory defined below reads the 'context', 'question' and
# 'answer' columns of this frame.
dataset_path = '../data/generative-ai/final_dataset.csv'
data = pd.read_csv(filepath_or_buffer=dataset_path)

# Tokenizer
# Same checkpoint name as the TFBertLMHeadModel loaded in the model-setup step.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="cahya/bert-base-indonesian-522M",
    force_download=False,
)

# Dataset Preparation
class FinancialDataset(tf.data.Dataset):
    """Factory producing a ``tf.data.Dataset`` of tokenized (features, labels) pairs.

    ``__new__`` does not build an instance of this class: it returns the
    dataset created by ``tf.data.Dataset.from_generator``.  Each element is
    ``({'input_ids', 'attention_mask'}, labels)`` where the features encode
    ``context + " " + question`` and ``labels`` are the token ids of
    ``answer``; all three are int32 vectors of length ``max_length``.

    NOTE(review): the labels are tokenized independently of the inputs, so
    position i of ``labels`` is not the continuation of position i of
    ``input_ids`` -- confirm this is the intended target for the LM head.
    """

    def __new__(cls, data, tokenizer, max_length=512):
        def encode(text):
            # Truncate/pad to exactly max_length; the result has a leading
            # batch axis of size 1 that the generator strips with [0].
            return tokenizer(text, max_length=max_length, truncation=True, padding="max_length", return_tensors="tf")

        def gen():
            for position in range(len(data)):
                row = data.iloc[position]
                prompt_encoding = encode(row['context'] + " " + row['question'])
                answer_encoding = encode(row['answer'])
                features = {
                    'input_ids': prompt_encoding['input_ids'][0],
                    'attention_mask': prompt_encoding['attention_mask'][0],
                }
                yield features, answer_encoding['input_ids'][0]

        # Every yielded tensor shares the same fixed-length int32 spec.
        vector_spec = tf.TensorSpec(shape=(max_length,), dtype=tf.int32)
        signature = (
            {'input_ids': vector_spec, 'attention_mask': vector_spec},
            vector_spec,
        )
        return tf.data.Dataset.from_generator(gen, output_signature=signature)

# Training hyper-parameters. batch_size is defined before the dataset is
# batched so the pipeline and the step count below cannot drift apart
# (the batch size used to be hard-coded twice).
num_epochs = 3
batch_size = 4

# Create the dataset
train_dataset = FinancialDataset(data, tokenizer)
train_dataset = train_dataset.shuffle(len(data)).batch(batch_size)

# Model Setup
model = TFBertLMHeadModel.from_pretrained("cahya/bert-base-indonesian-522M", is_decoder=True)

# Optimizer and Loss
# .batch() keeps the final partial batch, so an epoch has ceil(N / batch_size)
# steps. Floor division under-counted the run whenever N was not a multiple of
# batch_size, ending the learning-rate schedule before training finished.
steps_per_epoch = (len(data) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * num_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5, num_train_steps=total_steps, num_warmup_steps=0
)

# Compile the model with custom loss function
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Mean sparse categorical cross-entropy over non-padding positions.

    Args:
        y_true: Integer target ids; positions equal to 0 are treated as
            padding and excluded from the average.
        y_pred: Unnormalized logits with a trailing vocabulary axis.
            NOTE(review): assumes Keras passes the raw logits tensor here --
            confirm against how the compiled model exposes its outputs.

    Returns:
        Scalar tensor: the summed loss of the unmasked positions divided by
        their count, or 0 when every position is masked.
    """
    # reduction="none" keeps one loss value per position. With the default
    # reduction the loss object returns an already-averaged scalar, so the
    # mask multiplication below had no effect on the result.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    # Mask out the padding tokens
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    per_position_loss = loss_fn(y_true, y_pred) * mask
    # divide_no_nan yields 0 instead of NaN for a batch with no real targets.
    return tf.math.divide_no_nan(tf.reduce_sum(per_position_loss), tf.reduce_sum(mask))

model.compile(optimizer=optimizer, loss=masked_sparse_categorical_crossentropy)

# Training Loop
history = model.fit(train_dataset, epochs=num_epochs)

# Plotting
# history.history['loss'] holds one value per EPOCH, not per optimizer step,
# so the x-axis is labelled accordingly (it used to read 'Training Steps').
loss_values = history.history['loss']
plt.plot(range(1, len(loss_values) + 1), loss_values, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

# Test Model
def test_model(prompt):
    """Generate the model's reply to a single user prompt (TensorFlow variant).

    The prompt is wrapped in a fixed Indonesian system message plus the user
    turn, rendered with the tokenizer's chat template, and passed to
    ``model.generate`` with at most 50 new tokens.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The decoded reply only. The prompt tokens that ``generate`` returns in
        front of the continuation are stripped before decoding (previously the
        rendered prompt was echoed back in the result).
    """
    # Relies on the module-level `model` and `tokenizer`.
    messages = [
        {"role": "system", "content": "Kamu adalah asisten yang membantu seputar isu keuangan"},
        {"role": "user", "content": prompt}
    ]
    # NOTE(review): assumes this tokenizer provides a chat template -- confirm
    # for the BERT checkpoint loaded in this cell.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="tf")
    generated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=50)
    # Keep only what was generated after the prompt.
    prompt_length = inputs['input_ids'].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    return response

# Smoke test: send one finance question through test_model and show the reply.
test_prompt = "Apa itu ROI?"
reply_text = test_model(test_prompt)
print(reply_text)


All model checkpoint layers were used when initializing TFBertLMHeadModel.

All the layers of TFBertLMHeadModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


Epoch 1/3


Epoch 2/3

KeyboardInterrupt: 