# 00 Data Validation

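Validates the canonical processed tables and cross-matches units, traits, and items against the reference lists. Writes cleaned tables to `data/processed/cleaned/` and saves rows that fail the cross-match checks to `data/processed/invalid/` for manual review. No exploratory plotting is done here.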

In [None]:

# Setup
from pathlib import Path
import pandas as pd

# Locate the project root: the nearest directory, starting at the working
# directory and walking up through its parents, that contains any marker entry.
MARKERS = {"requirements.txt", "Projectplan.md", ".git"}
cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if any((candidate / marker).exists() for marker in MARKERS)
    ),
    cwd,  # no marker found anywhere up the tree: use the working directory
)

# Input locations: prefer the "canonical_original" subfolder when it exists,
# otherwise read straight from data/processed.
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DEFAULT_CANONICAL = DATA_PROCESSED / "canonical_original"
if DEFAULT_CANONICAL.exists():
    PROCESSED_DIR = DEFAULT_CANONICAL
else:
    PROCESSED_DIR = DATA_PROCESSED

# Reference files (units_s16/items_s16/traits_s16) are read from the same
# directory as the processed tables.
REF_DIR = PROCESSED_DIR

# Output locations, created up front so later cells can write to them.
CLEANED_DIR = DATA_PROCESSED / "cleaned"
INVALID_DIR = DATA_PROCESSED / "invalid"
for output_dir in (CLEANED_DIR, INVALID_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Processed dir: {PROCESSED_DIR}")
print(f"Cleaned output: {CLEANED_DIR}")
print(f"Invalid records saved to: {INVALID_DIR}")


In [None]:

# Processed tables (canonical outputs), one DataFrame per CSV
participants = pd.read_csv(PROCESSED_DIR / "participants.csv")
traits = pd.read_csv(PROCESSED_DIR / "traits.csv")
units = pd.read_csv(PROCESSED_DIR / "units.csv")

# Reference tables used to decide which names count as valid
units_ref = pd.read_csv(REF_DIR / "units_s16.csv")
traits_ref = pd.read_csv(REF_DIR / "traits_s16.csv")
items_ref = pd.read_csv(REF_DIR / "items_s16.csv")

# Valid unit names: every non-null entry of the reference 'name' column
valid_units = set(units_ref['name'].dropna())

# Valid trait names: use 'name_corrected' when the reference has it,
# otherwise fall back to 'name'
if 'name_corrected' in traits_ref.columns:
    valid_traits = set(traits_ref['name_corrected'].dropna())
else:
    valid_traits = set(traits_ref['name'].dropna())

# Valid item names: the 'name' column plus every column whose label starts
# with 'comp', with surrounding whitespace stripped
item_name_cols = [
    label for label in items_ref.columns
    if label == 'name' or label.startswith('comp')
]
valid_items = set()
for label in item_name_cols:
    cleaned_names = items_ref[label].dropna().astype(str).str.strip()
    valid_items |= set(cleaned_names)


In [None]:

# Dataset overview: table shapes first, then distinct match and player counts
for label, table in (
    ("Participants:", participants),
    ("Traits:", traits),
    ("Units:", units),
):
    print(label, table.shape)

for label, key in (("Matches:", 'match_id'), ("Players:", 'puuid')):
    print(label, participants[key].nunique())


In [None]:

# Schema checks: for each table, report any required column that is absent.
# This only prints a status line per table; it does not stop execution.
required = {
    "participants": ["match_id", "puuid", "placement", "level", "last_round", "is_win"],
    "traits": ["match_id", "puuid", "trait_id", "num_units", "tier_current"],
    "units": ["match_id", "puuid", "unit_name", "unit_tier", "rarity", "item_0", "item_1", "item_2"],
}

# Explicit name -> DataFrame mapping instead of a dynamic `locals()` lookup,
# so the check does not depend on which scope this code happens to run in.
tables = {
    "participants": participants,
    "traits": traits,
    "units": units,
}

for name, cols in required.items():
    df = tables[name]
    missing = [c for c in cols if c not in df.columns]
    status = "OK" if not missing else f"MISSING {missing}"
    print(f"{name}: {status}")


In [None]:

# Range checks with failures printed
def _outside(values, low, high):
    """Return a boolean mask of entries that are not within [low, high]."""
    return ~values.between(low, high, inclusive='both')


placement_mask = _outside(participants['placement'], 1, 8)
level_mask = _outside(participants['level'], 1, 10)
unit_tier_mask = _outside(units['unit_tier'], 1, 3)

# (check name, source table, offending-row mask, columns to show)
range_checks = (
    ('placement_range', participants, placement_mask, ['match_id','puuid','placement']),
    ('level_range', participants, level_mask, ['match_id','puuid','level']),
    ('unit_tier_range', units, unit_tier_mask, ['match_id','puuid','unit_name','unit_tier']),
)

# Keep only the checks that actually have offending rows
failed = {
    check: table.loc[mask, shown_cols]
    for check, table, mask, shown_cols in range_checks
    if mask.any()
}

if failed:
    for check, offending in failed.items():
        print(f"[FAIL] {check}: {len(offending)} rows")
        display(offending)
else:
    print("[OK] All range checks passed")


In [None]:

# Cross-match validity checks and offending rows (traits vs name_corrected; items vs item names or components or observed items)
item_cols = [c for c in units.columns if c.startswith('item_')]
key_cols = ['match_id', 'puuid', 'unit_name']

# Reshape to one row per (unit, item slot). `stack` is called without
# `dropna=` because the newer pandas stack implementation (opt-in via
# `future_stack=True` since pandas 2.1) does not accept that argument; empty
# slots are dropped explicitly so the result is the same under either
# implementation.
items_long = (
    units[key_cols + item_cols]
    .set_index(key_cols)
    .stack()
    .dropna()
    .reset_index()
)
items_long.columns = ['match_id','puuid','unit_name','slot','item']
# Trim whitespace on string entries only; non-string values pass through.
items_long['item'] = items_long['item'].apply(lambda x: x.strip() if isinstance(x, str) else x)

observed_items = set(items_long['item'].dropna())
# NOTE(review): every observed item is added to the valid set here, so the
# item check below cannot flag any row. Kept as-is because the cleaning rules
# describe "observed set" as accepted; confirm this is the intended policy.
valid_items_all = valid_items.union(observed_items)

invalid_units = units[~units['unit_name'].isin(valid_units)]
invalid_traits = traits[~traits['trait_id'].isin(valid_traits)]
invalid_items_rows = items_long[~items_long['item'].isin(valid_items_all)]

print(f"Invalid units: {len(invalid_units)} rows")
if len(invalid_units):
    display(invalid_units[['match_id','puuid','unit_name']])

print(f"Invalid traits (compared to name_corrected): {len(invalid_traits)} rows")
if len(invalid_traits):
    display(invalid_traits[['match_id','puuid','trait_id']])

print(f"Invalid items (slot entries not matching item names/components/observed set): {len(invalid_items_rows)} rows")
if len(invalid_items_rows):
    display(invalid_items_rows[['match_id','puuid','unit_name','slot','item']])

# Save invalids for manual review (a file is written only when rows exist)
if len(invalid_units):
    invalid_units.to_csv(INVALID_DIR / "invalid_units.csv", index=False)
if len(invalid_traits):
    invalid_traits.to_csv(INVALID_DIR / "invalid_traits.csv", index=False)
if len(invalid_items_rows):
    invalid_items_rows.to_csv(INVALID_DIR / "invalid_items.csv", index=False)


In [None]:

# Cleaning rules (based on checks):
# - Drop unit rows with unit_tier outside 1-3.
# - Drop unit rows with unknown unit_name.
# - Trim item slots; keep items if in reference (full names/components) or observed set, else set to NA.
# - Drop trait rows with unknown trait_id (matched against name_corrected list).
# - Keep participants/traits tied to players that still have at least one unit after cleaning.

# Apply both unit row filters in one pass, then copy so the item-column
# assignments below modify an independent frame rather than a view of `units`.
tier_in_range = units['unit_tier'].between(1, 3, inclusive='both')
known_unit = units['unit_name'].isin(valid_units)
units_clean = units[tier_in_range & known_unit].copy()

for col in [c for c in units_clean.columns if c.startswith('item_')]:
    # Trim whitespace on string entries only; non-string values pass through.
    units_clean[col] = units_clean[col].apply(lambda x: x.strip() if isinstance(x, str) else x)
    # Missing slots are left alone; anything else must be in the accepted set.
    mask_valid = units_clean[col].isin(valid_items_all) | units_clean[col].isna()
    units_clean.loc[~mask_valid, col] = pd.NA

traits_clean = traits[traits['trait_id'].isin(valid_traits)].copy()

# (match_id, puuid) pairs that still own at least one unit after cleaning;
# the inner merges keep only participants/traits belonging to those pairs.
valid_players = units_clean[['match_id','puuid']].drop_duplicates()
participants_clean = participants.merge(valid_players, on=['match_id','puuid'], how='inner')
traits_clean = traits_clean.merge(valid_players, on=['match_id','puuid'], how='inner')

# Save cleaned tables into CLEANED_DIR, the same directory the setup cell
# creates and reports, so the output location is defined in a single place.
p_out = CLEANED_DIR / "participants.csv"
t_out = CLEANED_DIR / "traits.csv"
u_out = CLEANED_DIR / "units.csv"
participants_clean.to_csv(p_out, index=False)
traits_clean.to_csv(t_out, index=False)
units_clean.to_csv(u_out, index=False)

print("Saved cleaned tables:")
print(p_out)
print(t_out)
print(u_out)


In [None]:

# Post-clean counts: row totals per cleaned table, then distinct matches left
for label, cleaned in (
    ("Participants (clean):", participants_clean),
    ("Traits (clean):", traits_clean),
    ("Units (clean):", units_clean),
):
    print(label, len(cleaned))

remaining_matches = participants_clean['match_id'].nunique()
print("Matches (clean):", remaining_matches)
print("Matches (clean):", participants_clean['match_id'].nunique())
