In [1]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the base cuisine dataset by its Hugging Face repo id.
# Later cells read its "original" and "augmented" splits.
dataset = load_dataset(path="scottymcgee/food-text-dataset")

In [3]:
# Load the Vietnamese examples from a local CSV file.
# Asking for split="train" returns the Dataset itself rather than a dict of splits.
vietnamese_dataset = load_dataset(
    "csv",
    data_files="vietnamese_food_english.csv",
    split="train",
)

In [4]:
# Merge the two splits of the base dataset into one pool.
base_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ("original", "augmented")]
)

# Append the Vietnamese rows, then shuffle with a fixed seed so the order is reproducible.
updated_train = concatenate_datasets([base_dataset, vietnamese_dataset])
updated_train = updated_train.shuffle(seed=42)

In [5]:
# Hold out 15% of the combined data for evaluation; the seed makes the split reproducible.
# NOTE(review): the pool mixes the "original" and "augmented" splits. If augmented rows are
# variants of original rows, near-duplicates can land on both sides of this random split and
# inflate the evaluation metrics -- confirm how the augmented split was produced.
updated_splits = updated_train.train_test_split(test_size=0.15, seed=42)
train_dataset, test_dataset = updated_splits["train"], updated_splits["test"]

In [6]:
# Distinct labels of the base dataset, in sorted order so the ids assigned later are stable.
unique_labels = sorted(set(base_dataset["label"]))


In [7]:
# Add every label found in the Vietnamese data that the base label list lacks.
# Deriving the additions from the data (instead of hard-coding "Vietnamese") guarantees that
# each label encode_label will meet has an id. New labels are appended after the existing
# ones, in sorted order, so the ids of the base labels do not move.
# The set difference also makes this cell safe to re-run: nothing is appended twice.
unique_labels.extend(sorted(set(vietnamese_dataset["label"]) - set(unique_labels)))

print("✅ Old + new labels:", unique_labels)

✅ Old + new labels: ['American', 'Chinese', 'Italian', 'Mexican', 'Thai', 'Vietnamese']


In [8]:
# Build the two lookup tables the model config needs:
# label name -> integer id (position in unique_labels), and the inverse.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = {idx: name for name, idx in label2id.items()}

print("Label2id mapping:", label2id)

Label2id mapping: {'American': 0, 'Chinese': 1, 'Italian': 2, 'Mexican': 3, 'Thai': 4, 'Vietnamese': 5}


In [9]:
# Encode string labels to numeric
def encode_label(example):
    """Replace the string label of one example with its integer id.

    Args:
        example: Mapping with a "label" entry holding either a label name
            (a key of ``label2id``) or an already-encoded integer id.

    Returns:
        The same ``example`` object, with "label" set to the integer id.

    Raises:
        KeyError: If the label is a name that ``label2id`` does not contain.
    """
    label = example["label"]

    # Already-encoded ids pass through unchanged, so re-running the mapping
    # cell on an encoded dataset is harmless instead of raising KeyError.
    if isinstance(label, int):
        return example

    if label not in label2id:
        raise KeyError(
            f"Unknown label {label!r}; expected one of {sorted(label2id)}"
        )

    example["label"] = label2id[label]
    return example

In [10]:
# Swap the string labels for integer ids in both splits (train first, then test).
train_dataset, test_dataset = (
    split.map(encode_label) for split in (train_dataset, test_dataset)
)

In [11]:
# Checkpoint that is fine-tuned further below; its tokenizer is loaded from the same repo
# so the text is encoded the way the checkpoint expects.
model_name = "maryzhang/24679-text-distilbert-food-cuisine-classifier"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [12]:
# Preprocess text
def preprocess_function(examples):
    """Tokenize the "text" entry of a batch of examples.

    Args:
        examples: Mapping whose "text" entry holds the text(s) to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and every sequence padded to exactly 256 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)


In [13]:
# batched=True hands preprocess_function a batch of rows per call,
# so the tokenizer receives a list of texts instead of a single string.
tokenized_train = train_dataset.map(function=preprocess_function, batched=True)
tokenized_test = test_dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 171/171 [00:00<00:00, 3019.94 examples/s]


In [14]:
# # Load model with updated labels
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=len(unique_labels),
#     id2label=id2label,
#     label2id=label2id
# )
# This led to a classifier size mismatch (the checkpoint's head has 5 outputs, 6 labels are needed).

In [15]:
# Load the fine-tuned checkpoint with a classification head sized for the new label set.
# The label arguments override the corresponding values of the checkpoint's config.
# ignore_mismatched_sizes=True makes the loader skip the checkpoint's smaller classifier
# weights and initialise a fresh head of the right size, so the head must be trained
# before the model is used for predictions.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

print("✅ Model loaded with reinitialized classification head for", len(unique_labels), "labels.")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at maryzhang/24679-text-distilbert-food-cuisine-classifier and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded with reinitialized classification head for 6 labels.


In [16]:
# Metrics
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true class ids).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    labels = pred.label_ids
    # The predicted class is the column with the highest score in each row.
    preds = np.argmax(pred.predictions, axis=1)

    # zero_division=0 scores a class without predicted (or true) samples as 0 --
    # the value scikit-learn's default already uses -- but without emitting an
    # UndefinedMetricWarning on every evaluation while the new head is untrained.
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

In [20]:
training_args = TrainingArguments(
    # --- run control ---
    seed=42,
    do_train=True,                 # enable training
    do_eval=True,                  # enable evaluation
    num_train_epochs=3,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=False,                    # mixed-precision training disabled
    # --- logging ---
    logging_dir="./logs",          # where TensorBoard logs go
    logging_first_step=True,
    logging_steps=100,             # log every 100 steps
    # --- checkpointing ---
    # NOTE(review): the recorded run logs fewer than 200 steps, so the 500-step
    # interval below is never reached there -- confirm this is intended.
    output_dir="./results",
    overwrite_output_dir=True,     # allow overwriting checkpoints
    save_steps=500,                # save checkpoint every 500 steps
    save_total_limit=1,            # keep only the last checkpoint
)


In [21]:
# Trainer setup
# The tokenizer is passed as `processing_class`, which replaces the deprecated
# `tokenizer` keyword of Trainer.
# NOTE(review): assumes transformers >= 4.46, where `processing_class` exists --
# confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [22]:
# Train and evaluate
# Fine-tune on the trainer's training set, then score the resulting model once
# on its evaluation set.
trainer.train()
# evaluate() returns a dict of metrics; the values produced by compute_metrics
# are reported with an "eval_" prefix (see the printed output).
results = trainer.evaluate()
print(results)



Step,Training Loss
1,1.9255
100,0.5084




{'eval_loss': 0.04271969571709633, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 8.9831, 'eval_samples_per_second': 19.036, 'eval_steps_per_second': 1.225, 'epoch': 3.0}


In [23]:
# Persist the fine-tuned model to a local directory so it can be reloaded later.
trainer.save_model(output_dir="./distilbert-food-cuisine-updated")