In [None]:
URL_FILE_1 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/dataset_parser.py"
URL_FILE_2 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/wiki_extractor.py"
URL_FILE_3 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/custom_dataset.py"
URL_FILE_4 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/enwiki_features.py"

print("Downloading file 1...")
!wget {URL_FILE_1}

print("\nDownloading file 2...")
!wget {URL_FILE_2}

print("\nDownloading file 3...")
!wget {URL_FILE_3}

print("\nDownloading file 4...")
!wget {URL_FILE_4}

print("\nDownload completati. Contenuto della directory corrente:")

Downloading file 1...
--2025-05-04 19:15:16--  https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/dataset_parser.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9788 (9.6K) [text/plain]
Saving to: ‘dataset_parser.py.1’


2025-05-04 19:15:16 (68.1 MB/s) - ‘dataset_parser.py.1’ saved [9788/9788]


Downloading file 2...
--2025-05-04 19:15:16--  https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/wiki_extractor.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

In [None]:
!pip install xgboost pandas scikit-learn wikidata



# **VERY IMPORTANT: If the Colab notebook doesn’t mount the shared folder or doesn’t load the files correctly, you may need to add the shared project folder as a shortcut in your own Google Drive.**

In [None]:
# Imports
import os
import xgboost as xgb
import pandas as pd
import numpy as np

# Custom classes
from custom_dataset import CustomData
from enwiki_features import enwiki_augment                                      # wikipedia page statistics (5 features)

# Drive
# Mount Google Drive so that the shared project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Base folder of the datasets. Later cells build file paths by plain string concatenation
# (dataset_path + 'file.csv'), so the trailing slash is required.
dataset_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Dataset/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#the column type used in train and valid contain only 'entity' and 'concept' meanwhile test_unlabeld contain also 'named entity', we map 'named entity' to 'entity'
def _norm_type(val):
        s = str(val).lower()
        if "entity" in s:
            return "entity"
        if "concept" in s:
            return "concept"
        return val  # leave as‑is

In [None]:
# File paths
RAW_TEST_CSV        = dataset_path + 'test_unlabeled.csv'
PROCESSED_TEST_CSV  = dataset_path + 'test_unlabeled_processed.csv'

# Flag to force the recomputation even when a cached processed file exists
FORCE_REPROCESS = False

df_processed = None

# --- Try to load the already processed file, if it exists ---
if not FORCE_REPROCESS and os.path.exists(PROCESSED_TEST_CSV):
    print(f"Loading preprocessed data: {PROCESSED_TEST_CSV}...")
    try:
        df_processed = pd.read_csv(PROCESSED_TEST_CSV)
        df_processed['type'] = df_processed['type'].apply(_norm_type)           # normalize type column
        print("Loading data done.")
    except Exception as e:
        print(f"Loading error {PROCESSED_TEST_CSV}, starting recomputing: {e}")
        df_processed = None  # Force recomputing

# --- If not existing or forced to, recompute the features ---
if df_processed is None:
    print("Processing test data non labeled...")
    try:
        # 1. Load the initial CSV file
        df_test_raw = pd.read_csv(RAW_TEST_CSV)
        df_test_raw['type'] = df_test_raw['type'].apply(_norm_type)             # normalize type column

        # 2. Instantiate preprocessor and add features
        processor_test = CustomData(df_test_raw)
        df_test_featured = processor_test.add_feature()

        if df_test_featured is not None:
            # 3. Apply preprocess (encoding, scaling, etc.)
            df_processed = processor_test.preprocess_data(df_test_featured)

            if df_processed is not None:
                # 4. Save result on an output CSV.
                # A failed write (e.g. Drive not writable) must not throw away the
                # features that were just computed: only the cache is lost.
                try:
                    df_processed.to_csv(PROCESSED_TEST_CSV, index=False)
                    print(f"Test data processed and saved in {PROCESSED_TEST_CSV}.")
                except OSError as e:
                    print(f"WARNING: could not save processed data to {PROCESSED_TEST_CSV}: {e}")
            else:
                print("WARNING: Test data is None or empty, saving failed.")
        else:
            print("ERROR: Failed to add preprocessed features.")

    except Exception as e:
        print(f"ERROR during data preprocessing: {e}")
        df_processed = None

# --- final check ---
if df_processed is None:
    print("\nERROR: could not load preprocessed data.")
    # Stop here with an explicit error instead of failing on `None.head()` below.
    raise RuntimeError(
        f"Processed test data unavailable: could neither load {PROCESSED_TEST_CSV} "
        f"nor rebuild it from {RAW_TEST_CSV}."
    )

print("\n--- Test processed ---")
print(f"Shape of preprocessed data: {df_processed.shape}")

df_processed.head()

Loading preprocessed data: /content/drive/MyDrive/CulturalIA_shared_folder/Dataset/test_unlabeled_processed.csv...
Loading data done.

--- Test processed ---
Shape of preprocessed data: (300, 33)


Unnamed: 0,item,type,number_sitelinks,sitelinks_translation_entropy,number_claims,po_P495,po_P1343,po_P2596,po_P17,number_of_P31,...,category_Music,category_Performing arts,category_Visual Arts,category_architecture,category_food,category_media,category_philosophy and religion,category_politics,category_sports,category_transportation
0,http://www.wikidata.org/entity/Q2427430,0,17,3.969816,7,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,http://www.wikidata.org/entity/Q125482,0,99,4.254674,53,0,1,0,0,2,...,0,0,0,0,0,0,1,0,0,0
2,http://www.wikidata.org/entity/Q15789,1,112,4.12854,97,0,0,0,1,2,...,0,0,0,0,0,0,0,0,1,0
3,http://www.wikidata.org/entity/Q582496,1,12,0.816689,6,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,http://www.wikidata.org/entity/Q572811,1,11,2.732159,10,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:

# File paths
# NOTE: PROCESSED_TEST_CSV is rebound here and now points to the *augmented* CSV,
# not to the processed CSV used by the previous cell.
RAW_TEST_CSV        = dataset_path + 'test_unlabeled.csv'
PROCESSED_TEST_CSV  = dataset_path + 'test_unlabeled_augmented.csv'

# Flag to force data augmentation recomputing
FORCE_REPROCESS = False

df_aug = None

# Load the augmented data if present. A missing file is an ordinary cache miss, so it is
# checked up front rather than reported as an error.
if not FORCE_REPROCESS and os.path.exists(PROCESSED_TEST_CSV):
    try:
        df_aug = pd.read_csv(PROCESSED_TEST_CSV)
        print(f"Augmented data shape: {df_aug.shape}")
    except Exception as e:
        print(f"ERROR - could not load augmented data, trying to preprocess it: {e}")
        df_aug = None

# data augmentation from wikipedia scraping
if FORCE_REPROCESS or df_aug is None:
    # The third argument (10) is forwarded as-is; its meaning is defined in
    # enwiki_features.py -- TODO confirm and promote it to a named constant.
    enwiki_augment(RAW_TEST_CSV, PROCESSED_TEST_CSV, 10)
    try:
        df_aug = pd.read_csv(PROCESSED_TEST_CSV)
        print(f"Augmented data shape: {df_aug.shape}")
    except Exception as e:
        print(f"ERROR - could not load augmented data: {e}")
        df_aug = None

# Stop with an explicit error instead of failing on `None.head()` below.
if df_aug is None:
    raise RuntimeError(f"Augmented test data unavailable: could not load {PROCESSED_TEST_CSV}.")

# Display the first few rows
df_aug.head()

Augmented data shape: (300, 11)


Unnamed: 0,item,name,type,category,subcategory,title,page_length,num_links,mean_sitelinks_count,median_sitelinks_count,std_sitelinks_count
0,http://www.wikidata.org/entity/Q2427430,Northeast Flag Replacement,concept,History,historical event,Northeast Flag Replacement,2156,18,68.055556,45.5,55.538247
1,http://www.wikidata.org/entity/Q125482,imam,concept,philosophy and religion,religious leader,Imam,5134,62,71.516667,56.5,69.720033
2,http://www.wikidata.org/entity/Q15789,FC Bayern Munich,named entity,sports,sports club,FC Bayern Munich,49066,369,48.81768,33.0,53.145687
3,http://www.wikidata.org/entity/Q582496,Fome Zero,named entity,politics,government agency,Fome Zero,4791,25,56.583333,40.0,62.625818
4,http://www.wikidata.org/entity/Q572811,Anthony Award,named entity,Literature,literary award,Anthony Awards,428,5,15.5,15.0,8.616844


In [None]:
# Column names of the test set processed with CustomData
list(df_processed.columns)

['item',
 'type',
 'number_sitelinks',
 'sitelinks_translation_entropy',
 'number_claims',
 'po_P495',
 'po_P1343',
 'po_P2596',
 'po_P17',
 'number_of_P31',
 'sum_cultural_claims',
 'po_P172',
 'po_P1268',
 'po_P136',
 'category_Biology',
 'category_Books',
 'category_Comics and Anime',
 'category_Fashion',
 'category_Films',
 'category_Geography',
 'category_Gestures and habits',
 'category_History',
 'category_Literature',
 'category_Music',
 'category_Performing arts',
 'category_Visual Arts',
 'category_architecture',
 'category_food',
 'category_media',
 'category_philosophy and religion',
 'category_politics',
 'category_sports',
 'category_transportation']

In [None]:
# Column names of the test set augmented through Wikipedia scraping
list(df_aug.columns)

['item',
 'name',
 'type',
 'category',
 'subcategory',
 'title',
 'page_length',
 'num_links',
 'mean_sitelinks_count',
 'median_sitelinks_count',
 'std_sitelinks_count']

In [None]:
# Merge the features of the two CSVs by item.
# 'type', 'category' and 'subcategory' are dropped from df_aug before merging:
# df_processed already carries its own 'type' and one-hot 'category_*' columns.
df_val_aug_concat = df_processed.merge(
    df_aug.drop(columns=['type', 'category', 'subcategory']),
    on='item',
    how='inner'
)

# An inner join silently drops every item that is missing from one of the two frames
# (and multiplies rows when 'item' is not unique). Every test item needs a prediction,
# so make any change in the number of rows visible.
if len(df_val_aug_concat) != len(df_processed):
    print(
        f"WARNING: merge changed the number of rows: {len(df_processed)} processed, "
        f"{len(df_aug)} augmented, {len(df_val_aug_concat)} merged."
    )

# ensure that all the columns (the feature names) are lower case
df_val_aug_concat.columns = df_val_aug_concat.columns.str.lower()

# display the final list of features
features = df_val_aug_concat.columns.tolist()
for name in features:
    print(name)

item
type
number_sitelinks
sitelinks_translation_entropy
number_claims
po_p495
po_p1343
po_p2596
po_p17
number_of_p31
sum_cultural_claims
po_p172
po_p1268
po_p136
category_biology
category_books
category_comics and anime
category_fashion
category_films
category_geography
category_gestures and habits
category_history
category_literature
category_music
category_performing arts
category_visual arts
category_architecture
category_food
category_media
category_philosophy and religion
category_politics
category_sports
category_transportation
name
title
page_length
num_links
mean_sitelinks_count
median_sitelinks_count
std_sitelinks_count


In [None]:
# Work on a copy so the merged frame stays untouched
df = df_val_aug_concat.copy()

# Identifier / text columns that must not be converted
skip = ['item', 'name', 'title']

# Every remaining column, in frame order, is converted to a numeric dtype
to_numeric = df.columns[~df.columns.isin(skip)].tolist()

# Values that cannot be parsed as numbers become NaN (errors='coerce')
df[to_numeric] = df[to_numeric].apply(lambda column: pd.to_numeric(column, errors='coerce'))

print("Final shape:")
print(f"  Data: {df.shape}")

Final shape:
  Data: (300, 40)


Load the model, run inference, and get the output predictions.

In [None]:
# Load the XGBoost model
models_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Models/'
model_path = models_path + 'xgb_best_model_77.json'
booster = xgb.Booster()
booster.load_model(model_path)

# Without stored feature names the test columns cannot be aligned with the model;
# fail with a readable message instead of a TypeError on iterating None.
if booster.feature_names is None:
    raise ValueError(f"Model {model_path} has no stored feature names; cannot align the test columns.")

# Ensure that all the feature names are lower case and match the data set columns
booster.feature_names = [f.lower() for f in booster.feature_names]

# XGBoost is sensitive to column order: select the columns in the model's own order
feat_cols = booster.feature_names

# Report every feature the model expects but the test frame lacks, all at once
missing_feats = [c for c in feat_cols if c not in df.columns]
if missing_feats:
    raise ValueError(f"Test data is missing features expected by the model: {missing_feats}")

# Convert test data to DMatrix for inference
dtest = xgb.DMatrix(df[feat_cols])

# Run inference
# NOTE(review): argmax over axis 1 assumes predict() returns one score per class for
# each row, i.e. a 2-D (n_samples, n_classes) array -- confirm against the model objective.
y_pred_raw = booster.predict(dtest)
y_pred = np.argmax(y_pred_raw, axis=1)

Beware of the label mapping: a wrong mapping would write the wrong labels to the output file and lead to a very poor accuracy when the model is evaluated.





In [None]:
# Translate the predicted class indices into their textual labels.
# The position in the tuple is the class index.
# NOTE(review): the index order must match the encoding used at training time -- confirm.
label_mapping = dict(enumerate((
    'cultural agnostic',        # class 0
    'cultural representative',  # class 1
    'cultural exclusive',       # class 2
)))

# An index without an entry in the mapping raises KeyError
df['label'] = list(map(label_mapping.__getitem__, y_pred))

WARNING: Executing this cell will overwrite the predictions CSV in the "Outputs" folder of the shared project folder.

In [None]:
# Destination of the predictions CSV (an existing file at this path is overwritten)
output_path = '/content/drive/MyDrive/CulturalIA_shared_folder/Outputs/CulturalIA_output_modello2.csv'

# Only the identifying columns and the predicted label go into the output file
keep_cols = ['item', 'name', 'title', 'label']
df = df.loc[:, keep_cols]

# Write without the index column
df.to_csv(output_path, index=False)

print(f"Saved predictions to {output_path}")

Saved predictions to /content/drive/MyDrive/CulturalIA_shared_folder/Outputs/CulturalIA_output_modello2.csv
