In [1]:
!pip install -U transformers datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [1]:
# In /root/SpatialRGPT/category_classifier/train_classifier.py

import torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

import os
import shutil

In [2]:
# 1. Configuration
MODEL_NAME = "distilbert-base-uncased"
TRAIN_FILE = "/content/classifier_train_data.jsonl"
VAL_FILE = "/content/classifier_val_data.jsonl"
OUTPUT_DIR = "checkpoints/category_classifier_model"

# 2. Create Dataset
# This makes sure there's no ambiguity about the file locations
TRAIN_FILE_ABS = os.path.abspath(TRAIN_FILE)
VAL_FILE_ABS = os.path.abspath(VAL_FILE)

print(f"Attempting to load train file from: {TRAIN_FILE_ABS}")
print(f"Attempting to load validation file from: {VAL_FILE_ABS}")

# --- Check if files exist before loading ---
if not os.path.exists(TRAIN_FILE_ABS) or not os.path.exists(VAL_FILE_ABS):
    print("\nERROR: One or both of the data files do not exist at the specified paths. Please check your paths and filenames.")
else:
    # --- Load Datasets using absolute paths ---
    try:
        raw_datasets = load_dataset('json',
                                    data_files={
                                        'train': TRAIN_FILE_ABS,
                                        'validation': VAL_FILE_ABS
                                    })
        print("\nDataset loaded successfully!")
        print(raw_datasets)
    except Exception as e:
        print(f"\nAn error occurred: {e}")

Attempting to load train file from: /content/classifier_train_data.jsonl
Attempting to load validation file from: /content/classifier_val_data.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]


Dataset loaded successfully!
DatasetDict({
    train: Dataset({
        features: ['text', 'category'],
        num_rows: 499083
    })
    validation: Dataset({
        features: ['text', 'category'],
        num_rows: 1942
    })
})


In [3]:
# 3. Create Label Mappings
# This will map string labels like "distance" to integer IDs like 0, 1, 2...
print("Creating class labels...")
class_label_feature = ClassLabel(names=raw_datasets['train'].unique('category'))
label2id = {label: i for i, label in enumerate(class_label_feature.names)}
id2label = {i: label for i, label in enumerate(class_label_feature.names)}
num_labels = len(class_label_feature.names)

print(f"Found {num_labels} unique categories.")
print(f"Label mapping (label2id): {label2id}")

Creating class labels...
Found 4 unique categories.
Label mapping (label2id): {'distance': 0, 'mcq': 1, 'left_right': 2, 'count': 3}


In [4]:
# 4. Preprocess and Tokenize Data
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    # Tokenize the texts
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True, max_length=128, return_tensors='pt')
    # Map text labels to integer labels
    tokenized_inputs["label"] = [label2id[label] for label in examples["category"]]
    return tokenized_inputs

print("Tokenizing datasets...")
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, batch_size=10000)

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing datasets...


Map:   0%|          | 0/499083 [00:00<?, ? examples/s]

Map:   0%|          | 0/1942 [00:00<?, ? examples/s]

In [5]:
# 5. Load Pre-trained Model
print(f"Loading model '{MODEL_NAME}' for sequence classification...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

Loading model 'distilbert-base-uncased' for sequence classification...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 6. Define Metrics for Evaluation
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1_macro': f1_score(p.label_ids, preds, average='macro'),
    }

# 7. Define Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-5,
    per_device_train_batch_size=512, # Adjust based on A100 memory
    per_device_eval_batch_size=512,
    num_train_epochs=2, # Usually 2-4 epochs is enough for fine-tuning
    weight_decay=0.01,
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    push_to_hub=False, # Set to True if you want to save to Hugging Face Hub
    report_to="wandb" # Or "none"
)

# 8. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [7]:
# 9. Train the Model
print("\n--- Starting Fine-tuning ---")
trainer.train()

# 10. Evaluate the Final Model
print("\n--- Final Evaluation ---")
eval_results = trainer.evaluate()
print(eval_results)

# 11. Save the Final Model & Tokenizer
print(f"\nSaving final model and tokenizer to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("\n--- Classifier Training Complete ---")




--- Starting Fine-tuning ---


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mminhdv0201[0m ([33mminhdv0201-ho-chi-minh-city-university-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.0005,0.000319,1.0,1.0
2,0.0002,0.000157,1.0,1.0



--- Final Evaluation ---


{'eval_loss': 0.00015749165322631598, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_runtime': 3.9421, 'eval_samples_per_second': 492.63, 'eval_steps_per_second': 1.015, 'epoch': 2.0}

Saving final model and tokenizer to checkpoints/category_classifier_model

--- Classifier Training Complete ---


In [57]:
from huggingface_hub import login, whoami
import os

# Overwrite Colab's token by setting your own token in the environment
os.environ["HF_TOKEN"] = "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"
os.environ["HF_HUB_TOKEN"] = "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"

# Login explicitly using your token (this updates your local config too)
login(token="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", write_permission=True)

# Verify login
user_info = whoami()
print(f"\n✅ Successfully logged in as: {user_info['name']}")
whoami()


Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.



✅ Successfully logged in as: DangMinh21


{'type': 'user',
 'id': '667534d24b1e66191682a4ef',
 'name': 'DangMinh21',
 'fullname': 'Dang Van Minh',
 'isPro': False,
 'avatarUrl': '/avatars/f8ee5be957ba535f3acfeaad879bb174.svg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'download-spatialrgpt',
   'role': 'fineGrained',
   'createdAt': '2025-06-02T03:12:45.205Z',
   'fineGrained': {'canReadGatedRepos': True,
    'global': [],
    'scoped': [{'entity': {'_id': '667534d24b1e66191682a4ef',
       'type': 'user',
       'name': 'DangMinh21'},
      'permissions': ['repo.content.read', 'repo.write']}]}}}}

In [58]:
# Define your repository name
hub_model_repo = "aicity-challenge-question-classifier"

# Push the model and tokenizer
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DangMinh21/category_classifier_model/commit/fd48b97635bfc692042f2a8e2009255e66db7baf', commit_message='End of training', commit_description='', oid='fd48b97635bfc692042f2a8e2009255e66db7baf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/DangMinh21/category_classifier_model', endpoint='https://huggingface.co', repo_type='model', repo_id='DangMinh21/category_classifier_model'), pr_revision=None, pr_num=None)

In [60]:
# In create_submission.py, you can now load your classifier from the Hub:

from transformers import pipeline

# Your Hugging Face username and the repo name you chose
HUB_MODEL_ID = f"DangMinh21/category_classifier_model"

print(f"Loading category classifier from the Hub: {HUB_MODEL_ID}")
category_classifier = pipeline("text-classification", model=HUB_MODEL_ID, device=0)

def infer_category_with_model(question: str) -> str:
    # The pipeline handles tokenization for you
    prediction = category_classifier(question, top_k=1)
    if prediction:
        return prediction[0]['label']
    return "unknown" # Fallback

question = "Considering the pallets <mask> <mask> <mask> <mask> <mask> <mask>, how much distance is there between the rightmost pallet and the shelf <mask>?"

infer_category_with_model(question)

Loading category classifier from the Hub: DangMinh21/category_classifier_model


config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


'distance'

In [61]:
question = "Can you determine if the pallet <mask> is to the left of the pallet <mask> based on the current viewing angle?"

infer_category_with_model(question)

'left_right'

In [62]:
question = "Among the buffer region <mask>, the buffer region <mask>, and the buffer region <mask>, which one appears on the rightmost side?"

infer_category_with_model(question)

'mcq'

In [63]:
question = "Using the buffer masks <mask> <mask> <mask> and pallet masks <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, how many pallets are situated in the buffer region closest to the shelf on the right among <mask> <mask>?"

infer_category_with_model(question)

'count'