# Fine-Tuned BERT

### For Comments

In [None]:
!pip install -U transformers datasets scikit-learn pandas tqdm

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.53.1-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m

In [None]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

In [None]:
# Read the labelled comment training set directly from the CSV hosted on GitHub.
train = pd.read_csv(
    "https://raw.githubusercontent.com/Fahmi-mi/Dataset/refs/heads/main/datathon-ristek-ui-2025/input_instagram_brands/train_bert_comment.csv"
)

In [None]:
# Set the WANDB_DISABLED flag so Weights & Biases logging stays off during training.
os.environ["WANDB_DISABLED"] = "true"

# Drop every row that is missing either the comment text or its label.
df = train.dropna(how="any", subset=["comment", "predicted_label"])


In [None]:
# Hold out 20% of the cleaned comments for validation (fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["comment"].tolist(), df["predicted_label"].tolist(), test_size=0.2, random_state=42
)

# Map the label values to integer class ids.
# The encoder is fitted on ALL labels of the cleaned frame rather than on the
# training split only: the split is not stratified, so a rare class can end up
# exclusively in the validation split, and LabelEncoder.transform raises
# ValueError for labels it has not seen during fit.
label_encoder = LabelEncoder()
label_encoder.fit(df["predicted_label"].tolist())
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
label_names = label_encoder.classes_.tolist()

# Tokenization: truncate over-long texts and pad each split to its longest sequence.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Dataset PyTorch
class KomentarDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (here: the tokenizer output).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # The label is written last, so it wins over any same-named feature.
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so that Trainer can index them as PyTorch datasets.
train_dataset = KomentarDataset(train_encodings, train_labels_encoded)
val_dataset = KomentarDataset(val_encodings, val_labels_encoded)

# Load the model and set up training.
# id2label / label2id are stored in the model config so the saved (and later
# uploaded) model reports the real class names instead of generic LABEL_<n>.
id2label = {class_id: str(name) for class_id, name in enumerate(label_names)}
label2id = {name: class_id for class_id, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Without an evaluation strategy the eval_dataset passed to Trainer is never
    # used; evaluate once per epoch so the validation loss is actually reported.
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to=[],  # no external experiment trackers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training.
trainer.train()

# Save the fine-tuned model and the tokenizer side by side.
save_dir = "./model_finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,2.04
20,1.5564
30,1.303
40,1.4085
50,1.4243
60,1.2137
70,1.1882
80,1.2788
90,1.287
100,1.2362


('./model_finetuned/tokenizer_config.json',
 './model_finetuned/special_tokens_map.json',
 './model_finetuned/vocab.txt',
 './model_finetuned/added_tokens.json')

In [None]:
import json

# Store the class names next to the model so predictions can be decoded later.
with open(f"{save_dir}/label_names.json", mode="w") as label_file:
    label_file.write(json.dumps(label_names))

print("✅ label_names berhasil disimpan ke model_finetuned/label_names.json")


✅ label_names berhasil disimpan ke model_finetuned/label_names.json


In [None]:
# Archive the saved model folder. If model_finetuned.zip already exists, zip
# updates it in place (the captured output shows "updating:" entries).
!zip -r model_finetuned.zip model_finetuned

updating: model_finetuned/ (stored 0%)
updating: model_finetuned/config.json (deflated 55%)
updating: model_finetuned/special_tokens_map.json (deflated 42%)
updating: model_finetuned/tokenizer_config.json (deflated 75%)
updating: model_finetuned/vocab.txt (deflated 53%)
updating: model_finetuned/model.safetensors (deflated 7%)
  adding: model_finetuned/label_names.json (deflated 32%)


In [None]:
from huggingface_hub import notebook_login, upload_folder

# Log in to the Hugging Face Hub interactively from the notebook.
notebook_login()

# Push the contents of the saved model folder to the root of the model repository.
upload_folder(
    folder_path="./model_finetuned",
    repo_id="AzrilFahmiardi/instagram-comments-classifier-bert",
    path_in_repo=".",
    commit_message="Upload model komentar (fine-tuned BERT dengan safetensors)",
)

### For Captions

In [None]:
!pip install -U transformers datasets scikit-learn pandas tqdm

In [None]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)


In [None]:

# Set the WANDB_DISABLED flag so Weights & Biases logging stays off during training.
os.environ["WANDB_DISABLED"] = "true"

# Load the labelled caption data and drop rows missing the caption text or its label.
df = pd.read_csv("https://raw.githubusercontent.com/Fahmi-mi/Dataset/refs/heads/main/datathon-ristek-ui-2025/input_instagram_brands/train_bert_caption.csv")
df = df.dropna(subset=["post_caption", "label_1"])

# Split data: 80% train, 20% validation (fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["post_caption"].tolist(), df["label_1"].tolist(), test_size=0.2, random_state=42
)

# Encode labels as integer class ids.
# The encoder is fitted on ALL labels of the cleaned frame rather than on the
# training split only: the split is not stratified, so a rare class can end up
# exclusively in the validation split, and LabelEncoder.transform raises
# ValueError for labels it has not seen during fit.
label_encoder = LabelEncoder()
label_encoder.fit(df["label_1"].tolist())
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
label_names = label_encoder.classes_.tolist()

# Tokenize the captions: truncate over-long texts and pad each split to its longest sequence.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Dataset PyTorch
class CaptionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized captions with their class labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (here: the tokenizer output).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        features = dict(
            (feature_name, torch.tensor(column[idx]))
            for feature_name, column in self.encodings.items()
        )
        # The label is written last, so it wins over any same-named feature.
        features["labels"] = torch.tensor(self.labels[idx])
        return features

# Wrap the tokenized splits so that Trainer can index them as PyTorch datasets.
train_dataset = CaptionDataset(train_encodings, train_labels_encoded)
val_dataset = CaptionDataset(val_encodings, val_labels_encoded)

# Load the BERT model and set up the Trainer.
# id2label / label2id are stored in the model config so the saved (and later
# uploaded) model reports the real class names instead of generic LABEL_<n>.
id2label = {class_id: str(name) for class_id, name in enumerate(label_names)}
label2id = {name: class_id for class_id, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    # NOTE: this is the same directory as save_dir below, so the per-epoch
    # checkpoint folders written by Trainer sit next to the final model files.
    output_dir="./caption_model",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Without an evaluation strategy the eval_dataset passed to Trainer is never
    # used; evaluate once per epoch so the validation loss is actually reported.
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to=[],  # no external experiment trackers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model.
trainer.train()

# Save the model, the tokenizer, and the label names.
save_dir = "./caption_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Store the list of label names next to the model so predictions can be decoded later.
import json
with open(f"{save_dir}/label_names.json", "w") as f:
    json.dump(label_names, f)

print("✅ Model & label disimpan di:", save_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.5086
20,1.3946
30,1.1182
40,1.1279
50,0.9944
60,0.8876


Step,Training Loss
10,1.5086
20,1.3946
30,1.1182
40,1.1279
50,0.9944
60,0.8876
70,0.81


✅ Model & label disimpan di: ./caption_model


In [None]:
from huggingface_hub import notebook_login, upload_folder, create_repo

# Log in to the Hugging Face Hub interactively from the notebook.
notebook_login()

CAPTION_REPO_ID = "AzrilFahmiardi/instagram-caption-classifier-bert"

# Make sure the target repository exists; exist_ok=True makes this a no-op
# when it has already been created.
create_repo(CAPTION_REPO_ID, exist_ok=True)

upload_folder(
    repo_id=CAPTION_REPO_ID,
    folder_path="./caption_model",
    path_in_repo=".",
    # ./caption_model is also the Trainer output_dir, so it contains the
    # per-epoch checkpoint-* folders; keep those out of the published repo.
    ignore_patterns=["checkpoint-*"],
    # This cell publishes the caption model, not the comment model.
    commit_message="Upload model caption (format .safetensors)"
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…