In [5]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
import os
import pickle
import logging
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm  


# Setup logging
logging.basicConfig(filename='train_model.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Setup local dataset path
file_path = '../Bert/dataSet/notes1.csv'  # Ganti dengan path lokal Anda
df = pd.read_csv(file_path)
print(df.head())

# Label encoding
label_dict = {label: idx for idx, label in enumerate(df['category'].unique())}
df['label'] = df['category'].replace(label_dict)

# Split dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values, df['label'].values, test_size=0.15, random_state=42, stratify=df['label'].values
)

df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Tokenize data
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased', do_lower_case=True)

def encode_data(df, data_type):
    return tokenizer.batch_encode_plus(
        df[df['data_type'] == data_type]['note'].values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  
        truncation=True,  
        max_length=256,
        return_tensors='pt'
    )


encoded_data_train = encode_data(df, 'train')
encoded_data_val = encode_data(df, 'val')

input_ids_train, attention_masks_train = encoded_data_train['input_ids'], encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df['data_type'] == 'train']['label'].values)

input_ids_val, attention_masks_val = encoded_data_val['input_ids'], encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df['data_type'] == 'val']['label'].values)

# Create TensorDatasets
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Setup DataLoader
batch_size = 3
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

# Load BERT model for sequence classification with pytorch
model = BertForSequenceClassification.from_pretrained(
    "indolem/indobertweet-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False
)

# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs
)

# Define evaluation metrics
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate(dataloader):
    model.eval()
    total_loss = 0
    predictions, true_vals = [], []

    for batch in dataloader:
        batch = tuple(b.to(device) for b in batch)
        with torch.no_grad():
            outputs = model(**{'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]})
        loss, logits = outputs[:2]
        total_loss += loss.item()
        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(batch[2].cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return avg_loss, predictions, true_vals

# Train and evaluate model
for epoch in range(1, epochs + 1):
    model.train()
    total_train_loss = 0
    
    # Initialize tqdm progress bar
    progress_bar = tqdm(dataloader_train, desc=f'Epoch {epoch}/{epochs}', leave=False, disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        outputs = model(**{'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]})
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Update tqdm progress bar description with the current loss
        progress_bar.set_postfix({'loss': loss.item()})

    avg_train_loss = total_train_loss / len(dataloader_train)
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    # Log metrics
    logger.info(f'Epoch {epoch}/{epochs}')
    logger.info(f'Training Loss: {avg_train_loss}')
    logger.info(f'Validation Loss: {val_loss}')
    logger.info(f'Validation F1 Score: {val_f1}')

    # Print metrics
    print(f'Epoch {epoch}/{epochs}')
    print(f'Training Loss: {avg_train_loss}')
    print(f'Validation Loss: {val_loss}')
    print(f'Validation F1 Score: {val_f1}')

    output_dir = 'HasilModel'
    os.makedirs(output_dir, exist_ok=True)
    
    # Save model in .pt format
    model_save_path = os.path.join(output_dir, f'finetuned_BERT_epoch_{epoch}.pt')
    torch.save(model.state_dict(), model_save_path)

# Save final model in .pt format
final_model_path = os.path.join(output_dir, 'model_final.pt')
torch.save(model.state_dict(), final_model_path)


                                             note           category
0       0075 0084 lunas nota baru 4pc r15h5 lgs06           Pinjaman
1                                1 ayam tgl 9 des  Makanan & Minuman
2            1 jt dp kerjaan 1 juta dp sewa mobil            Tagihan
3   1 juta dp makanan minggu 250 rb uang mingguan  Makanan & Minuman
4                                     1 kg daging  Makanan & Minuman


  df['label'] = df['category'].replace(label_dict)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/1:   0%|          | 345/342015 [10:29<178:46:34,  1.88s/it, loss=1.86] 