# CultureScope: Cultural Specificity Classification

**Task:** Classify items into three cultural specificity categories:
- `cultural agnostic` - Universally known, no specific cultural ownership
- `cultural representative` - Associated with a culture but known globally
- `cultural exclusive` - Known primarily within a specific culture

**Model:** XGBoost with Wikipedia/Wikidata feature extraction

**HuggingFace Model:** [ArchitRastogi/CultureScope-XGBoost](https://huggingface.co/ArchitRastogi/CultureScope-XGBoost)

---

## Notebook Structure

- **Part 1:** Full pipeline with Wikipedia feature extraction at inference time
- **Part 2:** Fallback approach without external API calls

---

## Setup

In [None]:
# Install required packages
!pip install -q datasets huggingface_hub joblib xgboost lightgbm catboost scikit-learn pandas numpy requests hf_transfer

In [2]:
import os
import json
import time
import warnings
import requests
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import joblib
import pickle
from typing import Dict, Optional, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
from huggingface_hub import hf_hub_download, login, whoami
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

warnings.filterwarnings('ignore')

print("Setup complete.")

Setup complete.


## Configuration

In [None]:
# ----- CONFIGURATION -----
# All user-editable settings for the notebook live in this cell.

# Input: Choose one of the following options
USE_HUGGINGFACE_DATASET = True  # Set to False to use local CSV file
INPUT_CSV_PATH = "test.csv"     # Path to input CSV (used if USE_HUGGINGFACE_DATASET=False)

# HuggingFace dataset configuration
HF_DATASET_NAME = "sapienzanlp/nlp2025_hw1_cultural_dataset"  # Dataset name on HuggingFace
HF_DATASET_SPLIT = "Enter split name"                               # Split to use

# The dataset is gated, so a HuggingFace token is needed. It is read from the
# HF_TOKEN environment variable so that the secret does not have to be pasted
# into (and accidentally committed with) the notebook. The name is defined
# unconditionally so later cells can reference it whichever input mode is used.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

if USE_HUGGINGFACE_DATASET:
    if HF_TOKEN:
        login(token=HF_TOKEN)
    # With no token supplied, whoami() relies on previously stored credentials
    # and raises if there are none, so the problem surfaces here rather than
    # later during the dataset download.
    print("Logged in as:", whoami())

# Model configuration
HF_MODEL_REPO = "ArchitRastogi/CultureScope-XGBoost"  # Model repository for Part 1
MODEL_FILENAME = "best_model.pkl"                     # Model file name

HF_MODEL_REPO_PART2 = "ArchitRastogi/CultureScope-Ensemble-NoWiki"  # Model repository for Part 2
MODEL_FILENAME_PART2 = "ensemble_simple.pkl"                        # Model file name for Part 2

# Output configuration
OUTPUT_CSV_PATH = "predictions.csv"  # Output file path

# Feature extraction settings
MAX_WORKERS = 16        # Parallel workers for API calls
REQUEST_TIMEOUT = 10    # API request timeout in seconds

print(f"Input: {'HuggingFace Dataset' if USE_HUGGINGFACE_DATASET else INPUT_CSV_PATH}")
print(f"Output: {OUTPUT_CSV_PATH}")


## Load Test Data

In [4]:
def load_test_data(use_hf: bool = True, csv_path: Optional[str] = None) -> pd.DataFrame:
    """
    Load test data from HuggingFace Hub or local CSV file.

    Args:
        use_hf: If True, load from HuggingFace Hub (uses the notebook-level
            HF_DATASET_NAME, HF_DATASET_SPLIT and HF_TOKEN settings)
        csv_path: Path to local CSV file (required if use_hf=False)

    Returns:
        DataFrame with test data. Every column listed in ``required_cols``
        is guaranteed to exist, and all missing values are replaced by ''.

    Raises:
        ValueError: If use_hf is False and no csv_path was given.
    """
    if use_hf:
        print(f"Loading dataset from HuggingFace: {HF_DATASET_NAME}")
        # An empty token string is turned into None so the library falls back
        # to its default credential lookup instead of sending a blank token.
        dataset = load_dataset(HF_DATASET_NAME, split=HF_DATASET_SPLIT, token=HF_TOKEN or None)
        df = dataset.to_pandas()
    else:
        if csv_path is None:
            # Fail with an actionable message rather than pandas' generic
            # "Invalid file path or buffer object type" error.
            raise ValueError("csv_path must be provided when use_hf=False")
        print(f"Loading dataset from CSV: {csv_path}")
        df = pd.read_csv(csv_path)

    # Ensure required columns exist (absent ones are created as empty strings)
    required_cols = ['item', 'name', 'description', 'type', 'category', 'subcategory']
    for col in required_cols:
        if col not in df.columns:
            df[col] = ''

    # Fill missing values
    df = df.fillna('')

    print(f"Loaded {len(df)} samples.")
    return df


# Load test data
# The source (HuggingFace Hub vs. local CSV) is selected by the configuration cell.
test_df = load_test_data(use_hf=USE_HUGGINGFACE_DATASET, csv_path=INPUT_CSV_PATH)
# Preview the first rows as the cell output
test_df.head()

Loading dataset from HuggingFace: sapienzanlp/nlp2025_hw1_cultural_dataset
Loaded 300 samples.


Unnamed: 0,item,name,description,type,category,subcategory,label
0,http://www.wikidata.org/entity/Q15786,1. FC Nürnberg,"German sports club based in Nuremberg, Bavaria",entity,sports,sports club,cultural representative
1,http://www.wikidata.org/entity/Q268530,77 Records,UK record label,entity,music,record label,cultural exclusive
2,http://www.wikidata.org/entity/Q216153,A Bug's Life,1998 animated film directed by John Lasseter a...,entity,comics and anime,animated film,cultural representative
3,http://www.wikidata.org/entity/Q593,A Gang Story,2011 film by Olivier Marchal,entity,films,film,cultural exclusive
4,http://www.wikidata.org/entity/Q192185,Aaron Copland,"American composer, composition teacher, writer...",entity,performing arts,choreographer,cultural representative


---

# Part 1: With Wikipedia Feature Extraction

This approach extracts features from Wikipedia and Wikidata APIs at inference time to enrich the input data with cultural and geographic metadata.

## 1.1 Feature Extraction Module

In [5]:
class WikiDataExtractor:
    """
    Extract features from Wikipedia and Wikidata for cultural classification.

    Features extracted:
    - Wikipedia: language count, page length, categories, external links,
      coordinates flag
    - Wikidata: statements, cultural properties, geographic properties

    Extraction is best-effort: any feature that cannot be fetched keeps the
    default value from ``_get_default_features``. The two sources are fetched
    independently, so a failure of one does not discard the other.
    """

    # Cultural property IDs in Wikidata
    CULTURAL_PROPERTIES = {
        'P17', 'P495', 'P2596', 'P27', 'P1412', 'P37',
        'P103', 'P361', 'P131', 'P625'
    }

    # Geographic property IDs in Wikidata
    GEOGRAPHIC_PROPERTIES = {
        'P17', 'P131', 'P625', 'P30', 'P47', 'P150',
        'P36', 'P1376'
    }

    def __init__(self, timeout: int = 10):
        """
        Args:
            timeout: Per-request timeout in seconds, passed to every HTTP call
        """
        self.timeout = timeout
        self.session = requests.Session()
        # Set proper User-Agent header
        self.session.headers.update({
            'User-Agent': 'CulturalClassifier/1.0 (Educational Research Project; Python/requests)'
        })

    def extract_features(self, item_name: str, wikidata_id: Optional[str] = None) -> Dict:
        """
        Extract all features for a single item.

        Never raises for fetch/parse problems: failures are logged at DEBUG
        level and the affected features keep their default values.

        Args:
            item_name: Name of the item to search
            wikidata_id: Optional Wikidata ID (e.g., 'Q12345'); when omitted
                or empty, the ID is looked up by searching for item_name

        Returns:
            Dictionary of extracted features (same keys as
            ``_get_default_features``)
        """
        # Local import keeps this class self-contained within its notebook cell.
        import logging
        log = logging.getLogger(__name__)

        features = self._get_default_features()

        # Wikipedia and Wikidata are handled in separate try blocks so that an
        # error from Wikipedia no longer prevents the Wikidata lookup.
        try:
            features.update(self._get_wikipedia_features(item_name))
        except Exception as exc:  # best-effort boundary: keep defaults
            log.debug("Wikipedia features unavailable for %r: %s", item_name, exc)

        try:
            qid = wikidata_id or self._search_wikidata(item_name)
            if qid:
                features.update(self._get_wikidata_features(qid))
        except Exception as exc:  # best-effort boundary: keep defaults
            log.debug("Wikidata features unavailable for %r: %s", item_name, exc)

        return features

    def _get_default_features(self) -> Dict:
        """Return default feature values (all zero) for every raw feature."""
        return {
            'num_languages': 0,
            'en_page_length': 0,
            'num_categories': 0,
            'num_external_links': 0,
            'has_coordinates': 0,
            'num_statements': 0,
            'statement_diversity': 0,
            'num_cultural_properties': 0,
            'num_geographic_properties': 0,
            'has_country': 0,
            'has_origin_country': 0,
            'has_culture_property': 0,
            'num_identifiers': 0
        }

    def _get_wikipedia_features(self, title: str) -> Dict:
        """
        Extract features from the English Wikipedia API for an exact title.

        Returns:
            Dictionary with the Wikipedia feature keys, or an empty dictionary
            when the response contains no page or the page is marked missing.

        Raises:
            Whatever the HTTP session raises, including HTTP error statuses
            (via ``raise_for_status``).
        """
        features = {}

        # One query returns language links, categories, external links,
        # coordinates and basic page info.
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            'action': 'query',
            'titles': title,
            'prop': 'langlinks|categories|extlinks|coordinates|info',
            'lllimit': 'max',
            'cllimit': 'max',
            'ellimit': 'max',
            'format': 'json'
        }

        response = self.session.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        pages = data.get('query', {}).get('pages', {})
        if pages:
            # Only the first returned page is inspected.
            page = next(iter(pages.values()))
            if 'missing' not in page:
                features['num_languages'] = len(page.get('langlinks', []))
                features['en_page_length'] = page.get('length', 0)
                features['num_categories'] = len(page.get('categories', []))
                features['num_external_links'] = len(page.get('extlinks', []))
                features['has_coordinates'] = 1 if 'coordinates' in page else 0

        return features

    def _search_wikidata(self, query: str) -> Optional[str]:
        """
        Search for Wikidata entity ID.

        Returns:
            The ID of the top search hit, or None when there are no hits.
        """
        url = "https://www.wikidata.org/w/api.php"
        params = {
            'action': 'wbsearchentities',
            'search': query,
            'language': 'en',
            'limit': 1,
            'format': 'json'
        }

        response = self.session.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        results = data.get('search', [])
        return results[0]['id'] if results else None

    def _get_wikidata_features(self, qid: str) -> Dict:
        """
        Extract features from the Wikidata entity with the given ID.

        Returns:
            Dictionary with the Wikidata feature keys (all zero when the
            entity has no claims).
        """
        features = {}

        url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()

        entities = data.get('entities', {})
        entity = entities.get(qid)
        if entity is None and len(entities) == 1:
            # Wikidata can answer a merged/redirected ID with a payload keyed
            # by the redirect target's ID; use that single entity instead of
            # silently reporting zero claims.
            entity = next(iter(entities.values()))
        claims = (entity or {}).get('claims', {})

        # Count statements and properties
        all_properties = set(claims.keys())
        features['num_statements'] = sum(len(v) for v in claims.values())
        features['statement_diversity'] = len(all_properties)

        # Cultural and geographic properties
        features['num_cultural_properties'] = len(all_properties & self.CULTURAL_PROPERTIES)
        features['num_geographic_properties'] = len(all_properties & self.GEOGRAPHIC_PROPERTIES)

        # Specific property flags
        features['has_country'] = 1 if 'P17' in claims else 0
        features['has_origin_country'] = 1 if 'P495' in claims else 0
        features['has_culture_property'] = 1 if 'P2596' in claims else 0

        # "Identifier" count = properties whose numeric ID is above 200.
        # NOTE(review): this is a rough heuristic; presumably it mirrors the
        # training-time extraction, so confirm before changing it.
        identifiers = [p for p in all_properties if p.startswith('P') and
                       p[1:].isdigit() and int(p[1:]) > 200]
        features['num_identifiers'] = len(identifiers)

        return features


print("WikiDataExtractor class defined.")

WikiDataExtractor class defined.


## 1.2 Extract Features for Test Data

In [6]:
def extract_features_parallel(df: pd.DataFrame, max_workers: int = 16) -> pd.DataFrame:
    """
    Extract Wikipedia/Wikidata features for all items in parallel.

    Rows are tracked by position rather than by index label, so the result
    lines up with ``df`` even when its index is unsorted or has duplicates.
    An empty ``df`` yields an empty frame with the feature columns.

    Args:
        df: DataFrame with 'name' and optionally 'item' columns
        max_workers: Number of parallel workers

    Returns:
        DataFrame with one row of raw features per input row, in input order
        and carrying ``df``'s index
    """
    # A single extractor (and therefore a single HTTP session) is shared by
    # all worker threads.
    extractor = WikiDataExtractor(timeout=REQUEST_TIMEOUT)
    feature_columns = list(extractor._get_default_features())
    total = len(df)

    def process_row(row):
        item_name = row['name']
        wikidata_id = None

        # Extract Wikidata ID from item URI if available
        if 'item' in row and row['item']:
            item_str = str(row['item'])
            if 'wikidata.org' in item_str:
                # rstrip so a trailing slash does not produce an empty ID
                wikidata_id = item_str.rstrip('/').split('/')[-1]

        return extractor.extract_features(item_name, wikidata_id)

    print(f"Extracting features for {total} items (workers={max_workers})...")
    start_time = time.time()

    # Slot i holds the features of the i-th input row; slots are filled as
    # futures complete, in arbitrary order.
    ordered = [None] * total

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_row, row): pos
            for pos, (_, row) in enumerate(df.iterrows())
        }

        for done, future in enumerate(as_completed(futures), start=1):
            pos = futures[future]
            try:
                ordered[pos] = future.result()
            except Exception:
                # Keep the pipeline going: a failed row gets default features.
                ordered[pos] = extractor._get_default_features()

            # Progress update every 50 items
            if done % 50 == 0 or done == total:
                elapsed = time.time() - start_time
                # Guard against a zero elapsed time on coarse clocks.
                rate = done / elapsed if elapsed > 0 else float('inf')
                print(f"  Progress: {done}/{total} ({rate:.1f} items/sec)")

    # Rows are already in input order; attach the caller's index positionally.
    features_df = pd.DataFrame(ordered, index=df.index, columns=feature_columns)

    elapsed = time.time() - start_time
    print(f"Feature extraction complete. Time: {elapsed:.1f}s")

    return features_df


# Extract raw features
# Runs the Wikipedia/Wikidata extraction for every row of test_df.
raw_features_df = extract_features_parallel(test_df, max_workers=MAX_WORKERS)
# Test if API extraction works for ONE item
sample_item = test_df.iloc[2]  # A Bug's Life
print(f"\nTesting: {sample_item['name']}")
print(f"Item URL: {sample_item['item']}")

# NOTE(review): the recorded cell output also shows "QID: ..." and an
# "Extracted features:" listing, but no code here prints them and this
# extractor is never used below — the rest of the smoke test appears to be
# missing from this cell.
extractor = WikiDataExtractor(timeout=10)

Extracting features for 300 items (workers=16)...
  Progress: 50/300 (50.3 items/sec)
  Progress: 100/300 (63.5 items/sec)
  Progress: 150/300 (70.6 items/sec)
  Progress: 200/300 (73.0 items/sec)
  Progress: 250/300 (74.4 items/sec)
  Progress: 300/300 (75.9 items/sec)
Feature extraction complete. Time: 4.0s

Testing: A Bug's Life
Item URL: http://www.wikidata.org/entity/Q216153
QID: Q216153

Extracted features:
  num_languages: 63
  en_page_length: 82095
  num_categories: 38
  num_external_links: 169
  num_statements: 194
  statement_diversity: 132
  num_cultural_properties: 1
  has_origin_country: 1
  num_identifiers: 122


## 1.3 Feature Engineering Module

In [8]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the model input matrix from raw Wikipedia/Wikidata features.

    The input is not modified. Any raw column that is absent is treated as
    all zeros. The result contains exactly 33 columns, in the fixed order
    listed at the end of this function, with remaining NaNs replaced by 0.

    Args:
        df: Raw features as produced by extract_features_parallel

    Returns:
        DataFrame of engineered features (same index as ``df``)
    """
    work = df.copy()

    # Raw inputs the derived features rely on; create the missing ones as 0.
    raw_inputs = (
        'num_languages', 'en_page_length', 'num_categories', 'num_external_links',
        'num_statements', 'num_identifiers', 'statement_diversity',
        'num_cultural_properties', 'num_geographic_properties',
        'has_coordinates', 'has_country', 'has_culture_property', 'has_origin_country',
    )
    for raw_name in raw_inputs:
        if raw_name not in work.columns:
            work[raw_name] = 0

    # ------- Log transforms: log_<name> = log(1 + <name>) -------
    count_like = (
        'num_languages', 'en_page_length', 'num_statements', 'num_categories',
        'num_external_links', 'num_identifiers', 'statement_diversity',
    )
    for base in count_like:
        work[f'log_{base}'] = np.log1p(work[base])

    log_langs = work['log_num_languages']
    log_length = work['log_en_page_length']
    log_stmts = work['log_num_statements']
    log_cats = work['log_num_categories']
    log_links = work['log_num_external_links']
    n_cultural = work['num_cultural_properties']
    n_geographic = work['num_geographic_properties']
    country_flag = work['has_country'].astype(int)

    # ------- Ratio features (denominators shifted by 1 to avoid /0) -------
    diversity_plus_one = work['statement_diversity'] + 1
    length_plus_one = work['en_page_length'] + 1
    work['cultural_ratio'] = n_cultural / diversity_plus_one
    work['geographic_ratio'] = n_geographic / diversity_plus_one
    work['identifier_ratio'] = work['num_identifiers'] / (work['num_statements'] + 1)
    work['categories_per_page'] = work['num_categories'] / length_plus_one
    work['external_links_per_page'] = work['num_external_links'] / length_plus_one

    # ------- Interaction features -------
    work['languages_x_statements'] = log_langs * log_stmts
    work['languages_x_page_length'] = log_langs * log_length
    work['has_country_x_languages'] = country_flag * log_langs
    work['has_country_x_statements'] = country_flag * log_stmts
    work['cultural_x_geographic'] = n_cultural * n_geographic

    # ------- Composite scores (fixed hand-set weights) -------
    work['global_reach_score'] = log_langs * 0.5 + log_length * 0.3 + log_links * 0.2
    work['cultural_specificity_score'] = (
        n_cultural * 2.0 +
        n_geographic * 1.5 +
        country_flag * 1.0 +
        work['has_origin_country'].astype(int) * 1.0 +
        work['has_culture_property'].astype(int) * 2.0
    )
    work['info_richness_score'] = (
        log_stmts * 0.4 +
        work['log_statement_diversity'] * 0.4 +
        work['log_num_identifiers'] * 0.2
    )
    work['page_quality_score'] = log_length * 0.4 + log_cats * 0.3 + log_links * 0.3

    # ------- Binary thresholds on the raw counts -------
    threshold_masks = {
        'is_highly_global': work['num_languages'] > 20,
        'is_niche': work['num_languages'] < 10,
        'has_long_page': work['en_page_length'] > 10000,
        'has_many_statements': work['num_statements'] > 30,
    }
    for flag_name, mask in threshold_masks.items():
        work[flag_name] = mask.astype(int)

    # ------- Polynomial features -------
    # Note: the "squared languages" feature squares the log-transformed count.
    work['num_languages_squared'] = log_langs ** 2
    work['cultural_specificity_squared'] = work['cultural_specificity_score'] ** 2

    # Output column order - 33 features
    ordered_columns = [
        'log_num_languages', 'log_en_page_length', 'log_num_categories',
        'log_num_external_links', 'log_num_statements', 'log_num_identifiers',
        'log_statement_diversity', 'num_cultural_properties', 'num_geographic_properties',
        'has_coordinates', 'has_country', 'has_culture_property', 'has_origin_country',
        'cultural_ratio', 'geographic_ratio', 'identifier_ratio',
        'categories_per_page', 'external_links_per_page',
        'languages_x_statements', 'languages_x_page_length',
        'has_country_x_languages', 'has_country_x_statements', 'cultural_x_geographic',
        'global_reach_score', 'cultural_specificity_score',
        'info_richness_score', 'page_quality_score',
        'is_highly_global', 'is_niche', 'has_long_page', 'has_many_statements',
        'num_languages_squared', 'cultural_specificity_squared',
    ]

    return work[ordered_columns].fillna(0)

# Use it on the raw wiki features
# `engineered_features` is the frame the Part 1 prediction cell reads from.
engineered_features = engineer_features(raw_features_df)
print("Engineered features shape:", engineered_features.shape)

Engineered features shape: (300, 33)


In [10]:
# Apply feature engineering
# NOTE(review): this recomputes the same result as `engineered_features` from
# the previous cell; the Part 1 prediction cell reads `engineered_features`,
# not `features_df`.
features_df = engineer_features(raw_features_df)

print(f"Engineered features shape: {features_df.shape}")
print(f"Feature columns: {len(features_df.columns)}")

Engineered features shape: (300, 33)
Feature columns: 33


## 1.4 Load Model and Predict

In [11]:
# Download model from HuggingFace Hub
print(f"Downloading model from {HF_MODEL_REPO}...")
model_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=MODEL_FILENAME)
print(f"Model downloaded to: {model_path}")

# Load the model
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load bundles from a repository you trust.
model = joblib.load(model_path)
print("Model loaded successfully.")

Downloading model from ArchitRastogi/CultureScope-XGBoost...
Model downloaded to: /workspace/.cache/huggingface/hub/models--ArchitRastogi--CultureScope-XGBoost/snapshots/7ed86538361dcc5140c4f3bf938158bef072c7b9/best_model.pkl
Model loaded successfully.


In [12]:
# Unpack model bundle (best_model.pkl / cultural_classifier_tuned.pkl)
# The bundle is a dict holding the booster plus the preprocessing objects.
booster       = model["model"]
scaler        = model["scaler"]
label_encoder = model["label_encoder"]
feature_names = model["feature_names"]

# 1) Align engineered features with training feature order
# (raises KeyError if the bundle names a feature that was not engineered)
X = engineered_features[feature_names].fillna(0)
X_scaled = scaler.transform(X)

# 2) XGBoost prediction
dtest = xgb.DMatrix(X_scaled, feature_names=feature_names)
proba = booster.predict(dtest)

# Class indices -> labels
# NOTE(review): argmax over axis 1 assumes predict() returns one score per
# class for each row (2-D output) — confirm against the booster's objective.
pred_indices = np.argmax(proba, axis=1)
pred_labels = label_encoder.inverse_transform(pred_indices)

print("Sample predictions:", pred_labels[:10])

# 3) Build dataframe with predictions
pred_df = test_df.copy()

# overwrite / create prediction column named 'label' (this is the predicted label)
# Any ground-truth 'label' column copied from test_df is replaced here.
pred_df["label"] = pred_labels

print("\nPrediction distribution:")
print(pred_df["label"].value_counts())


Sample predictions: ['cultural representative' 'cultural exclusive' 'cultural representative'
 'cultural agnostic' 'cultural representative' 'cultural exclusive'
 'cultural representative' 'cultural exclusive' 'cultural representative'
 'cultural agnostic']

Prediction distribution:
label
cultural agnostic          187
cultural exclusive          67
cultural representative     46
Name: count, dtype: int64


## 1.5 Save Predictions (Part 1)

In [13]:
# Columns we want in the final CSV, in order
output_columns = [
    "item",
    "name",
    "description",
    "type",
    "category",
    "subcategory",
    "label",  # predicted label
]

# Keep only those columns (will error if any are missing, which is good to catch)
final_pred_df = pred_df[output_columns]

# Save to CSV
# index=False: the row index is not written as a column.
final_pred_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Predictions saved to: {OUTPUT_CSV_PATH}")
# Show the first saved row as the cell output
final_pred_df.head(1)


Predictions saved to: predictions.csv


Unnamed: 0,item,name,description,type,category,subcategory,label
0,http://www.wikidata.org/entity/Q15786,1. FC Nürnberg,"German sports club based in Nuremberg, Bavaria",entity,sports,sports club,cultural representative


In [14]:
# Display sample predictions
# Shows name, category and predicted label for up to the first 12 rows.
print("\nSample predictions:")
final_pred_df[['name', 'category', 'label']].head(12)


Sample predictions:


Unnamed: 0,name,category,label
0,1. FC Nürnberg,sports,cultural representative
1,77 Records,music,cultural exclusive
2,A Bug's Life,comics and anime,cultural representative
3,A Gang Story,films,cultural agnostic
4,Aaron Copland,performing arts,cultural representative
5,Aarwangen Castle,architecture,cultural exclusive
6,abaya,fashion,cultural representative
7,Academy of San Carlos,visual arts,cultural exclusive
8,Africa,geography,cultural representative
9,African American literature,literature,cultural agnostic


---

# Part 2: Without External API Calls

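This fallback approach uses a pre-trained ensemble model (XGBoost, LightGBM and CatBoost) for when external API access is not available. Instead of querying Wikipedia/Wikidata, it builds simple features from the item metadata — name and description lengths and word counts, plus type and category indicators — and fills the Wikipedia/Wikidata feature columns with constant placeholder values.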

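**Note:** The `ensemble_simple.pkl` file is downloaded automatically from the HuggingFace Hub (`ArchitRastogi/CultureScope-Ensemble-NoWiki`) in the next cell; no local copy is required in the working directory.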

## 2.1 Load Model and Configuration

In [15]:
# Download model from HuggingFace Hub
print(f"Downloading model from {HF_MODEL_REPO_PART2}...")
ENSEMBLE_MODEL_PATH = hf_hub_download(repo_id=HF_MODEL_REPO_PART2, filename=MODEL_FILENAME_PART2)
print(f"Model downloaded to: {ENSEMBLE_MODEL_PATH}")

print("Loading ensemble model...")
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load bundles from a repository you trust.
with open(ENSEMBLE_MODEL_PATH, 'rb') as f:
    model_data = pickle.load(f)
print("Model loaded successfully.")

# Part 2 writes to its own file so Part 1's predictions are not overwritten.
OUTPUT_CSV_PATH_PART2 = "predictions_no_api.csv"

print(f"Output Path set to {OUTPUT_CSV_PATH_PART2}")

Downloading model from ArchitRastogi/CultureScope-Ensemble-NoWiki...
Model downloaded to: /workspace/.cache/huggingface/hub/models--ArchitRastogi--CultureScope-Ensemble-NoWiki/snapshots/430a4cbcfc660ac9cd51ce93dd6fecf83f71b4a9/ensemble_simple.pkl
Loading ensemble model...
Model loaded successfully.
Output Path set to predictions_no_api.csv


## 2.2 Load Ensemble Model and Prepare Feature Engineering Function

In [16]:
# Extract model components
# NOTE(review): `scaler`, `label_encoder` and `feature_names` are the same
# global names the Part 1 prediction cell assigns. After this cell runs they
# refer to the Part 2 objects, so re-run the Part 1 cell from its top (it
# re-unpacks its own bundle) before predicting with Part 1 again.
xgb_model = model_data['xgb_model']
lgb_model = model_data['lgb_model']
catboost_model = model_data['catboost_model']
scaler = model_data['scaler']
label_encoder = model_data['label_encoder']
feature_names = model_data['feature_names']
# Per-model weights used for the weighted average of class probabilities
# (keys read below: 'xgb', 'lgb', 'catboost').
ensemble_weights = model_data['ensemble_weights']

print("Model loaded successfully")
print(f"  Feature count: {len(feature_names)}")
print(f"  Classes: {label_encoder.classes_}")
print(f"  Ensemble weights: XGB={ensemble_weights['xgb']:.3f}, "
      f"LGB={ensemble_weights['lgb']:.3f}, "
      f"CAT={ensemble_weights['catboost']:.3f}")


def create_basic_features(df):
    """
    Build the API-free feature frame from the item metadata columns.

    Reads the 'name', 'description', 'type' and 'category' columns of ``df``
    and produces 29 columns: four text-size features, two type indicators,
    ten category indicators and thirteen constant placeholders standing in
    for the Wikipedia/Wikidata features.

    Placeholder values: columns starting with ``num_`` are set to 1, every
    other placeholder to 0.
    NOTE(review): the original comment described the placeholders as zeros;
    the ``num_`` columns being 1 presumably matches how the model was
    trained — confirm before changing.
    """
    print("Creating basic features from available data...")

    name_text = df['name'].fillna('').astype(str)
    desc_text = df['description'].fillna('').astype(str)

    features = pd.DataFrame()

    # Text size: character counts and whitespace-separated word counts
    features['name_length'] = name_text.str.len()
    features['desc_length'] = desc_text.str.len()
    features['name_word_count'] = name_text.str.split().str.len()
    features['desc_word_count'] = desc_text.str.split().str.len()

    # Item type indicators
    for kind in ('entity', 'concept'):
        features[f'type_{kind}'] = (df['type'] == kind).astype(int)

    # One indicator per tracked category; spaces become underscores in the name
    tracked_categories = (
        'music', 'films', 'sports', 'literature', 'visual arts',
        'architecture', 'media', 'history', 'politics', 'food',
    )
    for category in tracked_categories:
        column = 'cat_' + category.replace(" ", "_")
        features[column] = (df['category'] == category).astype(int)

    # Constant stand-ins for the enriched features the model expects
    placeholder_columns = (
        'num_languages', 'en_page_length', 'num_statements', 'num_categories',
        'num_external_links', 'num_identifiers', 'statement_diversity',
        'num_cultural_properties', 'num_geographic_properties',
        'has_coordinates', 'has_country', 'has_culture_property', 'has_origin_country',
    )
    for column in placeholder_columns:
        features[column] = 1 if column.startswith('num_') else 0

    # Fill NaN with 0
    features = features.fillna(0)

    print(f"  Created {len(features.columns)} features")
    return features

print("\nFeature engineering function ready")

Model loaded successfully
  Feature count: 29
  Classes: ['cultural agnostic' 'cultural exclusive' 'cultural representative']
  Ensemble weights: XGB=0.339, LGB=0.326, CAT=0.335

Feature engineering function ready


## 2.3  Make Predictions and Save Results

In [17]:
# Create basic features from test data
print(f"Making predictions on {len(test_df)} samples...")
X_test = create_basic_features(test_df)

# Put the columns in the order stored in the model bundle (as Part 1 does)
# instead of relying on the creation order inside create_basic_features.
# The reorder is applied only when the two name sets match exactly; otherwise
# the original positional behaviour is kept and a warning is printed.
expected_columns = list(feature_names)
if set(X_test.columns) == set(expected_columns):
    X_test = X_test[expected_columns]
else:
    print("Warning: engineered column names differ from the model's feature_names; "
          "using positional column order.")

# Scale features
print("Scaling features...")
X_test_scaled = scaler.transform(X_test)

# Get predictions from all models
print("Running ensemble prediction...")
dtest = xgb.DMatrix(X_test_scaled, feature_names=feature_names)

xgb_pred_proba = xgb_model.predict(dtest)
lgb_pred_proba = lgb_model.predict(X_test_scaled)
cat_pred_proba = catboost_model.predict_proba(X_test_scaled)

# Weighted ensemble
# NOTE(review): the sum assumes all three outputs are per-class probability
# arrays of the same shape — confirm for the LightGBM model in particular.
ensemble_pred_proba = (
    ensemble_weights['xgb'] * xgb_pred_proba +
    ensemble_weights['lgb'] * lgb_pred_proba +
    ensemble_weights['catboost'] * cat_pred_proba
)

# Get final predictions
y_pred_encoded = np.argmax(ensemble_pred_proba, axis=1)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

print("Predictions complete")

# Create output DataFrame with same format as Part 1
output_df = test_df[['item', 'name', 'description', 'type', 'category', 'subcategory']].copy()
output_df['label'] = y_pred

# Add probability columns (one per class, in label-encoder class order)
for i, class_label in enumerate(label_encoder.classes_):
    output_df[f'prob_{class_label}'] = ensemble_pred_proba[:, i]

# Save predictions
output_df.to_csv(OUTPUT_CSV_PATH_PART2, index=False)
print(f"\nPredictions saved to: {OUTPUT_CSV_PATH_PART2}")

# Show prediction distribution
print("\nPrediction distribution:")
print(output_df['label'].value_counts())

# Display sample predictions
print("\nSample predictions:")
display(output_df[['name', 'category', 'label']].head(12))

# Set output_part2 for final summary cell
output_part2 = OUTPUT_CSV_PATH_PART2

Making predictions on 300 samples...
Creating basic features from available data...
  Created 29 features
Scaling features...
Running ensemble prediction...
Predictions complete

Predictions saved to: predictions_no_api.csv

Prediction distribution:
label
cultural agnostic          134
cultural exclusive         124
cultural representative     42
Name: count, dtype: int64

Sample predictions:


Unnamed: 0,name,category,label
0,1. FC Nürnberg,sports,cultural exclusive
1,77 Records,music,cultural exclusive
2,A Bug's Life,comics and anime,cultural representative
3,A Gang Story,films,cultural representative
4,Aaron Copland,performing arts,cultural representative
5,Aarwangen Castle,architecture,cultural exclusive
6,abaya,fashion,cultural agnostic
7,Academy of San Carlos,visual arts,cultural representative
8,Africa,geography,cultural exclusive
9,African American literature,literature,cultural exclusive


---

## Summary

This notebook provides two approaches for cultural specificity classification:

| Approach | Model | Features | Output File |
|----------|-------|----------|-------------|
| Part 1 | XGBoost (Tuned) | Wikipedia/Wikidata | `predictions.csv` |
| Part 2 | Ensemble | Text-only | `predictions_no_api.csv` |

**Recommended:** Use Part 1 for the best results (validation accuracy 0.6367, macro F1 0.6029, versus 0.5867 and 0.5558 for Part 2).

In [18]:
# Final output summary
print("\n" + "="*50)
print("OUTPUT FILES")
print("="*50)
print(f"\nPart 1 predictions: {OUTPUT_CSV_PATH}")
# Part 2 is optional. `output_part2` only exists once the Part 2 prediction
# cell has run, so it is looked up defensively: referencing the bare name
# would raise NameError when Part 2 was skipped.
part2_path = globals().get("output_part2")
if part2_path is not None:
    print(f"Part 2 predictions: {part2_path}")
print("\nDone.")


OUTPUT FILES

Part 1 predictions: predictions.csv
Part 2 predictions: predictions_no_api.csv

Done.


## Results on Validation Set

### Part 1
Number of samples: 300
Accuracy: 0.6367
Macro F1: 0.6029

Per-class metrics:
                         precision    recall  f1-score   support

      cultural agnostic       0.60      0.96      0.74       117
     cultural exclusive       0.64      0.57      0.60        76
cultural representative       0.78      0.34      0.47       107

               accuracy                           0.64       300
              macro avg       0.67      0.62      0.60       300
           weighted avg       0.68      0.64      0.61       300

### Part 2 
Number of samples: 300
Accuracy: 0.5867
Macro F1: 0.5558

Per-class metrics:
                         precision    recall  f1-score   support

      cultural agnostic       0.68      0.78      0.73       117
     cultural exclusive       0.47      0.76      0.58        76
cultural representative       0.64      0.25      0.36       107

               accuracy                           0.59       300
              macro avg       0.60      0.60      0.56       300
           weighted avg       0.61      0.59      0.56       300

### Script to Evaluate Predictions Against Ground Truth

In [None]:
import argparse
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, classification_report


def load_data(valid_path: str, pred_path: str):
    """
    Read ground truth and predictions and pair them up by `item`.

    Both CSV files must contain an `item` column (the join key) and a
    `label` column. Only items present in both files are kept (inner join);
    a warning is printed when the number of joined rows differs from the
    number of ground-truth rows.

    Args:
        valid_path: Path to the ground-truth CSV
        pred_path: Path to the predictions CSV

    Returns:
        Tuple ``(true_labels, predicted_labels)`` of aligned pandas Series

    Raises:
        ValueError: If the two files share no `item` values.
    """
    truth_frame = pd.read_csv(valid_path)
    guess_frame = pd.read_csv(pred_path)

    # Keep just the join key and the label, renamed so the two sides differ
    truth = truth_frame[["item", "label"]].rename(columns={"label": "true_label"})
    guesses = guess_frame[["item", "label"]].rename(columns={"label": "pred_label"})

    paired = truth.merge(guesses, on="item", how="inner")
    n_paired = len(paired)
    n_expected = len(truth)

    if n_paired == 0:
        raise ValueError("No overlapping `item` ids between valid.csv and predictions.csv.")

    if n_paired != n_expected:
        print(
            f"Warning: only {n_paired} / {n_expected} items from valid.csv "
            f"were found in predictions.csv"
        )

    return paired["true_label"], paired["pred_label"]


def main(argv=None):
    """
    Evaluate a predictions CSV against a ground-truth CSV and print metrics.

    Prints the number of evaluated samples, accuracy, macro F1 and the
    per-class classification report.

    Args:
        argv: Optional list of command-line arguments (e.g.
            ``["--valid", "valid.csv", "--predictions", "predictions.csv"]``).
            When None, the arguments are taken from ``sys.argv`` as before.
            Passing an explicit list allows calling this function from a
            notebook or another script.
    """
    parser = argparse.ArgumentParser(description="Evaluate predictions against ground truth.")
    parser.add_argument("--valid", default="valid.csv", help="Path to valid.csv (ground truth)")
    parser.add_argument(
        "--predictions", default="predictions.csv", help="Path to predictions.csv"
    )
    args = parser.parse_args(argv)

    y_true, y_pred = load_data(args.valid, args.predictions)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)

    # F1 for 3-class (macro treats all classes equally)
    f1_macro = f1_score(y_true, y_pred, average="macro")

    print("Number of samples:", len(y_true))
    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {f1_macro:.4f}")
    print("\nPer-class metrics:")
    print(classification_report(y_true, y_pred))


if __name__ == "__main__":
    main()
