# Multiclass Classification of Cultural Items
**Homework 1 - Multilingual Natural Language Processing**

*By Joshua Edwin & Clemens Kubach*

Add general project description here at the end.

## Installs
Check that all required dependencies are installed as defined in `pyproject.toml`. Follow the `README.md` for more detailed instructions.

## Machine Setup

### Imports

In [4]:
import pandas as pd
from datasets import DatasetDict, Dataset
import logging
import os

from evaluate import CombinedEvaluations

try:
    from google.colab import userdata  # type: ignore

    IN_COLAB = True
except ImportError:
    IN_COLAB = False

logger = logging.getLogger(__name__)

from pathlib import Path

# Derive the repository layout from the current working directory
# (abspath("") resolves to the cwd): the repository root is taken to be four
# levels above it.
_utils_dir = Path(os.path.abspath("")).parent
_src_dir = _utils_dir.parent.parent
REPO_ROOT = _src_dir.parent

# Directories below the repository root for data, logs and W&B files.
DATA_DIR, LOG_DIR, WANDB_DIR = (
    REPO_ROOT / folder for folder in ("data", "logs", "wandb")
)

# Configure the W&B project name and output directory via the environment.
os.environ.update(
    {
        "WANDB_PROJECT": "mnlp-h1-lm",
        "WANDB_DIR": str(WANDB_DIR),
    }
)

In [5]:
import wandb
# Authenticate this session with Weights & Biases; the call returns True on
# success, which is what the cell output shows.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkubach[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Huggingface Login and Loading Data
To access the dataset, there are three options available that will be tried in the following fallback-order:


1.   From HF via HF_TOKEN secret/envvar if set.
2.   From HF via inserting the HF token manually in the login dialog.
3.   From local `./train.csv` and `./valid.csv` files.

Afterwards, the HF dataset instance and the train and validation dataframes can be accessed via `hf_dataset`, `df_train` and `df_val`.

In [6]:
from os import environ

from datasets import load_dataset
from huggingface_hub import login
from datasets.exceptions import DatasetNotFoundError
from huggingface_hub.errors import HfHubHTTPError


def extract_dev_subsets_from_hf_dataset(
    ds: DatasetDict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Convert the ``train`` and ``validation`` splits of *ds* to dataframes.

    Returns:
        ``(train, validation)``: the silver-labeled training set followed by
        the gold-labeled dev set.
    """
    silver_train, gold_validation = (
        pd.DataFrame(ds[split]) for split in ("train", "validation")
    )
    return silver_train, gold_validation


def read_hf_token() -> str | None:
    """Return the Hugging Face token configured for this environment, if any.

    In Colab the token is read from the ``HF_TOKEN`` notebook secret, elsewhere
    from the ``HF_TOKEN`` environment variable.

    Returns:
        The token, or ``None`` when no token is configured or accessible.
    """
    if not IN_COLAB:
        return environ.get("HF_TOKEN")

    # Colab's secret store reports a missing secret, or a notebook that was not
    # granted access to it, through its own exception classes rather than
    # KeyError. They are looked up defensively in case a Colab release does
    # not define them.
    secret_errors = tuple(
        exc_type
        for exc_type in (
            KeyError,
            getattr(userdata, "SecretNotFoundError", None),
            getattr(userdata, "NotebookAccessError", None),
        )
        if exc_type is not None
    )
    try:
        return userdata.get("HF_TOKEN")
    except secret_errors:
        return None


def do_blocking_hf_login():
    """Log in to the Hugging Face Hub and wait until the login has completed.

    The token returned by ``read_hf_token`` is handed to ``login``. Without a
    token the manual login dialog is used; since that login is non-blocking,
    the user is asked to confirm with enter once it is done.

    Login failures are reported on stdout instead of being raised.
    """
    try:
        token = read_hf_token()
        login(token=token)
        if token is None:
            # The manual login does not block; wait here until the user
            # confirms that it has been completed.
            input("Press enter to finish login!")
    except (HfHubHTTPError, DatasetNotFoundError):
        print(
            "Login via HF_TOKEN secret/envvar and via manual login widget failed "
            "or not authorized."
        )


# Perform the Hugging Face login when this cell is executed.
do_blocking_hf_login()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
def load_train_val_data() -> tuple[DatasetDict, pd.DataFrame, pd.DataFrame]:
    """Load the train/validation data from the Hub, falling back to local CSVs.

    Returns:
        The dataset as a ``DatasetDict`` with ``train`` and ``validation``
        splits, followed by those two splits as dataframes.

    Raises:
        FileNotFoundError: If the Hub access failed and ``train.csv`` or
            ``valid.csv`` cannot be found in the current working directory.
    """
    try:
        _ds = load_dataset("sapienzanlp/nlp2025_hw1_cultural_dataset")
    except (HfHubHTTPError, DatasetNotFoundError) as e:
        logger.error(
            "Something went wrong during HF dataset access: %s. "
            "Falling back to local files:",
            e,
        )
    else:
        _df_train, _df_val = extract_dev_subsets_from_hf_dataset(_ds)
        return _ds, _df_train, _df_val

    try:
        train_df = pd.read_csv("train.csv")
        valid_df = pd.read_csv("valid.csv")
    except FileNotFoundError as err:
        # Keep the original error as the cause so the missing path stays visible.
        raise FileNotFoundError(
            "Tried to access the dataset from Huggingface "
            "(via HF_TOKEN secret/envvar and manual auth) and from the local disk "
            "(via train.csv and valid.csv in the cwd) without success."
        ) from err

    # Mirror the split names of the Hub dataset for the local fallback.
    dataset_dict = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df),
            "validation": Dataset.from_pandas(valid_df),
        }
    )
    return dataset_dict, train_df, valid_df


# Load the dataset once and keep both the HF instance and the dataframes.
hf_dataset, df_train, df_val = load_train_val_data()

# Preview the first rows of each split.
for _heading, _frame in (
    ("\nTrain Set:", df_train),
    ("\nValidation Set:", df_val),
):
    print(_heading)
    display(_frame.head())


Train Set:


Unnamed: 0,item,name,description,type,category,subcategory,label
0,http://www.wikidata.org/entity/Q32786,916,2012 film by M. Mohanan,entity,films,film,cultural exclusive
1,http://www.wikidata.org/entity/Q371,!!!,American dance-punk band from California,entity,music,musical group,cultural representative
2,http://www.wikidata.org/entity/Q3729947,¡Soborno!,Mort & Phil comic,entity,comics and anime,comics,cultural representative
3,http://www.wikidata.org/entity/Q158611,+44,American band,entity,music,musical group,cultural representative
4,http://www.wikidata.org/entity/Q280375,1 Monk Street,"building in Monmouth, Wales",entity,architecture,building,cultural exclusive



Validation Set:


Unnamed: 0,item,name,description,type,category,subcategory,label
0,http://www.wikidata.org/entity/Q15786,1. FC Nürnberg,"German sports club based in Nuremberg, Bavaria",entity,sports,sports club,cultural representative
1,http://www.wikidata.org/entity/Q268530,77 Records,UK record label,entity,music,record label,cultural exclusive
2,http://www.wikidata.org/entity/Q216153,A Bug's Life,1998 animated film directed by John Lasseter a...,entity,comics and anime,animated film,cultural representative
3,http://www.wikidata.org/entity/Q593,A Gang Story,2011 film by Olivier Marchal,entity,films,film,cultural exclusive
4,http://www.wikidata.org/entity/Q192185,Aaron Copland,"American composer, composition teacher, writer...",entity,performing arts,choreographer,cultural representative


In [None]:
# Show the distinct label strings that occur in the training split.
df_train["label"].unique()

In [None]:
# Overview of the available splits and of the columns in each dataframe.
print("HF dataset instance keys:", [*hf_dataset.keys()])
print("Train columns:", [*df_train.columns])
print("Val columns:", [*df_val.columns])

## LM-based Approach

In [None]:
from enum import IntEnum


class Labels(IntEnum):
    """Integer class ids of the three label categories."""

    cultural_agnostic = 0
    cultural_representative = 1
    cultural_exclusive = 2


# The label strings use spaces where the enum member names use underscores.
LABEL2ID = {member.name.replace("_", " "): member.value for member in Labels}
# Inverse mapping from class id back to the label string.
ID2LABEL = {class_id: text for text, class_id in LABEL2ID.items()}

In [None]:
from transformers import PreTrainedTokenizer, AutoTokenizer, DataCollatorWithPadding


class PreProcessor:
    """Callable that tokenizes text columns and encodes labels for a batch.

    An instance can be passed to a batched ``map`` over a dataset; it forwards
    each batch to ``preprocess_function`` with its own tokenizer and fields.
    """

    def __init__(
            self,
            tokenizer_name: str,
            agg_in_fields: tuple[str, ...] = ("name", "description"),
    ):
        # Name handed to AutoTokenizer.from_pretrained.
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Columns that are tokenized together as one model input.
        self.agg_in_fields = agg_in_fields

    def __call__(self, samples):
        """Preprocess *samples* with the configured tokenizer and fields."""
        return self.preprocess_function(samples, self.tokenizer, self.agg_in_fields)

    @staticmethod
    def preprocess_function(
            samples,
            tokenizer: PreTrainedTokenizer,
            agg_in_fields: tuple[str, ...] = ("name", "description"),
    ):
        """Tokenize the input fields of a batch and map its labels to class ids.

        Args:
            samples: Batch of columns; must contain every column named in
                ``agg_in_fields`` plus a ``label`` column of label strings.
            tokenizer: Tokenizer that is called with the selected columns.
            agg_in_fields: One column (single sequence) or two columns
                (sequence pair) to feed to the tokenizer.

        Returns:
            The tokenizer output with an added ``labels`` entry holding the
            integer ids looked up in ``LABEL2ID`` (not one-hot vectors).

        Raises:
            ValueError: If ``agg_in_fields`` does not name one or two columns.
            KeyError: If a label string is not a key of ``LABEL2ID``.
        """
        if not 1 <= len(agg_in_fields) <= 2:
            # The tokenizer call takes a text and an optional text pair as
            # inputs; further positional arguments are not additional input
            # texts, so extra fields would be silently lost.
            raise ValueError(
                "agg_in_fields must name one or two columns, "
                f"got {len(agg_in_fields)}: {agg_in_fields!r}"
            )
        to_tokenize = [samples[col] for col in agg_in_fields]
        # Truncate so that no input exceeds the maximum length supported by
        # the tokenizer's model.
        input_samples = tokenizer(*to_tokenize, truncation=True, padding=True)
        input_samples["labels"] = [LABEL2ID[label] for label in samples["label"]]
        return input_samples

# Tokenizer-backed preprocessor and a collator that pads with the same tokenizer.
preprocessor = PreProcessor("distilbert/distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=preprocessor.tokenizer)

# Preprocess all splits batch-wise, then expose only the tokenizer's model
# inputs and the labels, formatted as torch tensors.
preprocessed_hf_dataset = hf_dataset.map(preprocessor, batched=True)
required_columns = [*preprocessor.tokenizer.model_input_names, "labels"]
preprocessed_hf_dataset.set_format(type="torch", columns=required_columns)

preprocessed_hf_dataset


In [None]:
import evaluate
import numpy as np


class Evaluator:
    """Compute accuracy, F1, precision and recall for class predictions.

    Instances are callable with an ``(predictions, labels)`` pair and return
    one flat dict containing the results of all four metrics.
    """

    def __init__(self, average: str = "macro"):
        """Load the metric implementations.

        Args:
            average: Averaging mode passed to F1, precision and recall.
                Defaults to ``"macro"``: with exactly one label per sample,
                micro-averaged precision, recall and F1 are all equal to
                accuracy, so micro averaging adds no information here.
        """
        self.average = average
        self.accuracy = evaluate.load("accuracy")
        self.f1 = evaluate.load("f1")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")

    def __call__(self, eval_pred):
        """Return all metric results for one evaluation pass.

        Args:
            eval_pred: Pair ``(predictions, labels)``; the predicted class of
                each sample is the argmax of ``predictions`` along axis 1.

        Returns:
            Dict combining the result entries of accuracy, F1, precision and
            recall, in that order.
        """
        scores, labels = eval_pred
        predictions = np.argmax(scores, axis=1)

        results = dict(
            self.accuracy.compute(predictions=predictions, references=labels)
        )
        for metric in (self.f1, self.precision, self.recall):
            results.update(
                metric.compute(
                    predictions=predictions,
                    references=labels,
                    average=self.average,
                )
            )
        return results


# Instantiate once; the metric implementations are loaded in the constructor.
evaluator = Evaluator()

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Checkpoint that is loaded for sequence classification.
MODEL_NAME = "distilbert/distilbert-base-uncased"

# Both label mappings are passed along when loading the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

In [None]:
from datetime import datetime

# A timestamp suffix keeps the run name distinct across repeated runs.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
unique_run_name = f"{MODEL_NAME}-{timestamp}"

training_args = TrainingArguments(
    # Converted explicitly because LOG_DIR / MODEL_NAME is a Path object.
    output_dir=str(LOG_DIR / MODEL_NAME),
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    num_train_epochs=20,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the best checkpoint is restored
    # when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="wandb",
    run_name=unique_run_name,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_hf_dataset["train"],
    eval_dataset=preprocessed_hf_dataset["validation"],
    processing_class=preprocessor.tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluator,
)

try:
    trainer.train()
finally:
    # Close the W&B run even if training fails or is interrupted, so the run
    # does not stay open and leak into a later training attempt.
    wandb.finish()