1. Setup

In [24]:
!pip install -q transformers datasets evaluate scikit-learn

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive

2. Mount Drive

In [25]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


3. Paths

In [26]:
dataset_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Dataset/'
RAW_TEST_CSV_PATH = dataset_path + 'test_unlabeled.csv'
MODEL_DIR = '/content/drive/MyDrive/path_to_model/lm_based/model'  # Update this if needed

4. Load CSV

In [27]:
df = pd.read_csv(RAW_TEST_CSV_PATH)
print(df.head())

                                      item                        name  \
0  http://www.wikidata.org/entity/Q2427430  Northeast Flag Replacement   
1   http://www.wikidata.org/entity/Q125482                        imam   
2    http://www.wikidata.org/entity/Q15789            FC Bayern Munich   
3   http://www.wikidata.org/entity/Q582496                   Fome Zero   
4   http://www.wikidata.org/entity/Q572811               Anthony Award   

                                         description          type  \
0  Zhang Xueliang's announcement on 29 December 1...       concept   
1                        Islamic leadership position       concept   
2       association football club in Munich, Germany  named entity   
3  program intended to eradicate hunger and extre...  named entity   
4  awards given at Bouchercon for mystery literature  named entity   

                  category        subcategory  
0                  History   historical event  
1  philosophy and religion   religious

# 5. Load model and tokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
model.eval()

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/path_to_model/lm_based/model'. Use `repo_type` argument if needed.

# 6. Label map (used during training)

In [None]:
labels = ["cultural agnostic", "cultural representative", "cultural exclusive"]
id2label = {i: label for i, label in enumerate(labels)}

# 7. Load caches (optional)

In [None]:
wikidata_cache_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Dataset/wikidata_cache_ultra.pkl'
summary_cache_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Dataset/wiki_summary_cache.pkl'

wikidata_cache = pickle.load(open(wikidata_cache_path, "rb")) if os.path.exists(wikidata_cache_path) else {}
summary_cache = pickle.load(open(summary_cache_path, "rb")) if os.path.exists(summary_cache_path) else {}

# 8. Build enrichment helpers (same as in training)

In [None]:
def build_text(x):
    summary = summary_cache.get(x["item"], "")
    meta = wikidata_cache.get(x["item"], {})
    fields = [
        f"[ATTACHMENT] {meta.get('attachment', 0)}",
        f"[SPREAD] {meta.get('spread', 0)}",
        f"[SPECIFICITY] {meta.get('specificity', 0)}",
        f"[LANGUAGES] {meta.get('n_languages', 0)}",
        f"[INSTANCEOF] {meta.get('n_instanceof', 0)}",
        f"[SUBCLASSOF] {meta.get('n_subclassof', 0)}",
        f"[DESCRIBEDBY] {meta.get('n_describedby', 0)}",
        f"[CATEGORY] {x['category']}",
        f"[TYPE] {x['type']}",
        f"[SUBCATEGORY] {x.get('subcategory', '')}",
        f"[NAME] {x['name']}",
        f"[DESC] {x['description']}",
        f"[WIKI] {summary}"
    ]
    return " | ".join(fields)

df["text"] = df.apply(build_text, axis=1)

# 9. Tokenize all at once

In [16]:
encodings = tokenizer(df["text"].tolist(), padding=True, truncation=True, return_tensors="pt")

NameError: name 'tokenizer' is not defined

In [17]:
# 10. Run inference

In [None]:
with torch.no_grad():
    outputs = model(**encodings)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    preds = torch.argmax(probs, dim=1)

# 11. Add predictions back to DataFrame

In [None]:
df["predicted_label_id"] = preds.numpy()
df["predicted_label"] = df["predicted_label_id"].map(id2label)

# 12. Save predictions

In [None]:
output_path = dataset_path + "test_predictions.csv"
df.to_csv(output_path, index=False)
print(f"Saved predictions to {output_path}")