# 01 Exploratory Analysis

EDA on cleaned TFT match data: quick validation, descriptive stats, distributions, correlations, and early performance patterns.


In [None]:

# Setup
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Project root: the first directory, starting at the working directory and
# walking up through its parents, that contains any of the marker entries.
MARKERS = {"requirements.txt", "Projectplan.md", ".git"}
cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if any((candidate / marker).exists() for marker in MARKERS)
    ),
    cwd,  # no marker found in any ancestor: fall back to the working directory
)

# Input locations.
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
CLEANED_DIR = DATA_PROCESSED / "cleaned"
CANONICAL_DIR = DATA_PROCESSED / "canonical_original"

# Output locations, created up front so plotting cells can save into them.
OUTPUTS_DIR = PROJECT_ROOT / "outputs"
FIGURES_DIR = OUTPUTS_DIR / "figures"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Display and plotting defaults used by the remaining cells.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 180)
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 120})
sns.set_theme(style="whitegrid", palette="muted")

print(f"Project root: {PROJECT_ROOT}")
print(f"Cleaned dir: {CLEANED_DIR}")


In [None]:

# Read the cleaned match tables.
participants = pd.read_csv(CLEANED_DIR / "participants.csv")
traits = pd.read_csv(CLEANED_DIR / "traits.csv")
units = pd.read_csv(CLEANED_DIR / "units.csv")

# Read the canonical reference tables that serve as lookups.
units_ref = pd.read_csv(CANONICAL_DIR / "units_s16.csv")
traits_ref = pd.read_csv(CANONICAL_DIR / "traits_s16.csv")
items_ref = pd.read_csv(CANONICAL_DIR / "items_s16.csv")

# Known unit names.
valid_units = set(units_ref['name'].dropna())

# Known trait names: prefer the corrected-name column when the reference has one.
if 'name_corrected' in traits_ref.columns:
    valid_traits = set(traits_ref['name_corrected'].dropna())
else:
    valid_traits = set(traits_ref['name'].dropna())

# Known item names: every non-null value of the 'name' column and of any column
# whose header starts with 'comp', taken as whitespace-stripped strings.
item_name_cols = [c for c in items_ref.columns if c == 'name' or c.startswith('comp')]
valid_items = set().union(
    *(items_ref[c].dropna().astype(str).str.strip() for c in item_name_cols)
)


In [None]:

# Flatten items to one row per (unit row, filled item slot); match_id/puuid are
# retained for per-player aggregation.
item_id_cols = ['match_id', 'puuid', 'unit_name']
item_cols = [c for c in units.columns if c.startswith('item_')]

if item_cols:
    # melt + dropna replaces stack(dropna=True): the `dropna` argument belongs to
    # the legacy stack implementation (deprecated since pandas 2.1) and is rejected
    # by its replacement, so the stack form is not portable across pandas versions.
    items_long = (
        units[item_id_cols + item_cols]
        .melt(
            id_vars=item_id_cols,
            value_vars=item_cols,
            var_name='slot',
            value_name='item',
            ignore_index=False,  # keep the source row label for the re-sort below
        )
        .dropna(subset=['item'])
        # melt emits slot-by-slot; a stable sort on the source row label restores
        # the row-by-row order (slots in column order within each unit row).
        .sort_index(kind='mergesort')
        .reset_index(drop=True)
    )
else:
    # No item_* columns at all: produce an empty frame with the expected schema
    # so downstream cells still find the columns they reference.
    items_long = pd.DataFrame(columns=item_id_cols + ['slot', 'item'])

# Trim surrounding whitespace on string values; non-string values pass through unchanged.
items_long['item'] = items_long['item'].map(lambda v: v.strip() if isinstance(v, str) else v)


In [None]:

# Quick validation: collect the rows of each cleaned table whose unit, trait,
# or item name is absent from the matching reference set, then report counts.
unit_is_known = units['unit_name'].isin(valid_units)
trait_is_known = traits['trait_id'].isin(valid_traits)
item_is_known = items_long['item'].isin(valid_items)

invalid_units = units.loc[~unit_is_known]
invalid_traits = traits.loc[~trait_is_known]
invalid_items_rows = items_long.loc[~item_is_known]

n_bad_units = len(invalid_units)
n_bad_traits = len(invalid_traits)
n_bad_items = len(invalid_items_rows)

print(f"Cleaned invalid units: {n_bad_units}")
print(f"Cleaned invalid traits: {n_bad_traits}")
print(f"Cleaned invalid items: {n_bad_items}")


In [None]:

# Descriptive stats for numeric columns.
# Each observation is one unit row, so placement/level are repeated once per
# unit a player fielded.
numeric_cols = ['placement', 'level', 'unit_tier']
player_keys = ['match_id', 'puuid']

# One left merge brings in both participant columns. The result is used directly
# instead of being assigned back column-by-column, which relied on the merge
# output's fresh positional index lining up with the index of `units`.
# validate='many_to_one' raises if `participants` repeats a (match_id, puuid)
# key, since that would duplicate unit rows and distort the statistics.
unit_numeric = units[player_keys + ['unit_tier']].merge(
    participants[player_keys + ['placement', 'level']],
    on=player_keys,
    how='left',
    validate='many_to_one',
)[numeric_cols]

summary_stats = unit_numeric.describe()
summary_stats


In [None]:

# Trait distribution (frequency of active trait appearances)
trait_counts = (
    traits['trait_id']
    .value_counts()
    .rename_axis('trait_id')
    .reset_index(name='count')
)

# Join on the same reference column the valid-trait lookup uses: the corrected
# name when the reference provides it, otherwise the plain name. Selecting
# 'name_corrected' unconditionally would raise KeyError in the fallback case.
trait_key = 'name_corrected' if 'name_corrected' in traits_ref.columns else 'name'

# NOTE(review): a left merge repeats a trait's count once per matching reference
# row -- confirm traits_ref holds a single row per trait name.
trait_counts = trait_counts.merge(
    traits_ref[[trait_key, 'rank']],
    left_on='trait_id',
    right_on=trait_key,
    how='left',
)
trait_counts.head(10)


In [None]:

# Item distribution: how often each item occurs across all flattened item slots,
# most frequent first.
item_counts = (
    items_long['item']
    .value_counts()
    .rename_axis('item')
    .reset_index(name='count')
)
item_counts.head(10)


In [None]:

# Placement correlation with basic player/unit features
board_keys = ['match_id', 'puuid']

# Per-player aggregates (one value per match_id/puuid pair).
unit_counts = units.groupby(board_keys).size().rename('units_per_board')
item_slots_per_player = items_long.groupby(board_keys).size().rename('items_per_board')
unit_tier_avg = units.groupby(board_keys)['unit_tier'].mean().rename('avg_unit_tier')

player_df = (
    participants
    .merge(unit_counts, on=board_keys, how='left')
    .merge(item_slots_per_player, on=board_keys, how='left')
    .merge(unit_tier_avg, on=board_keys, how='left')
    # A player absent from the unit/item tables has zero units/items, not an
    # unknown number. Left as NaN those players would be excluded from the
    # correlations involving the count columns. avg_unit_tier stays NaN because
    # an average over no units is undefined.
    .fillna({'units_per_board': 0, 'items_per_board': 0})
)

corr_cols = ['placement', 'level', 'units_per_board', 'items_per_board', 'avg_unit_tier']
corr_matrix = player_df[corr_cols].corr()

# Explicit figure/axes so the heatmap, title and saved file all refer to the
# same figure regardless of pyplot's current-figure state.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix')
fig.tight_layout()
fig_corr_path = FIGURES_DIR / "01_corr_matrix.png"
fig.savefig(fig_corr_path, bbox_inches='tight')
print(f"Saved correlation heatmap to {fig_corr_path}")
corr_matrix


In [None]:

# Item heatmap: units vs items, restricted to the most frequent items.
# The cut-off lives in one place so the selection and the figure title cannot
# drift apart when it is changed.
TOP_N_ITEMS = 25

item_freq = items_long['item'].value_counts()
top_items = item_freq.head(TOP_N_ITEMS).index

# Rows: units, columns: items, cells: number of times the unit held the item
# (0 where the combination never occurs).
heat_df = (
    items_long[items_long['item'].isin(top_items)]
    .groupby(['unit_name', 'item']).size()
    .unstack(fill_value=0)
)

# Height grows with the number of units so row labels stay legible.
fig, ax = plt.subplots(figsize=(14, max(6, 0.2 * len(heat_df))))
sns.heatmap(heat_df, cmap='Blues', cbar_kws={'label': 'count'}, ax=ax)
ax.set_title(f'Item frequency by unit (top {TOP_N_ITEMS} items)')
ax.set_xlabel('Item')
ax.set_ylabel('Unit')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
heatmap_path = FIGURES_DIR / "01_items_heatmap.png"
fig.savefig(heatmap_path, bbox_inches='tight')
print(f"Saved item heatmap to {heatmap_path}")


In [None]:

# Identify interesting patterns (top units/items by lowest avg placement)

# Minimum number of placement observations a unit/item needs to be listed.
# 0 keeps every group; raise it to hide entries whose average rests on only a
# handful of appearances.
MIN_APPEARANCES = 0


def _avg_placement_by(rows: pd.DataFrame, key: str, min_count: int = 0) -> pd.DataFrame:
    """Summarise placement per value of `key`, best (lowest) average first.

    Parameters
    ----------
    rows : DataFrame containing `key` and a 'placement' column.
    key : column to group by.
    min_count : groups with fewer non-null placements than this are dropped.

    Returns
    -------
    DataFrame indexed by `key` with columns 'count' (non-null placements) and
    'avg_placement', sorted ascending by 'avg_placement'.
    """
    stats = (
        rows.groupby(key)['placement']
        .agg(['count', 'mean'])
        .rename(columns={'mean': 'avg_placement'})
        .sort_values('avg_placement')
    )
    return stats[stats['count'] >= min_count]


player_keys = ['match_id', 'puuid']
placements = participants[player_keys + ['placement']]

unit_perf = units.merge(placements, on=player_keys, how='left')
unit_perf_stats = _avg_placement_by(unit_perf, 'unit_name', MIN_APPEARANCES)
print("Top 10 units by lowest avg placement:")
print(unit_perf_stats.head(10))

items_perf = items_long.merge(placements, on=player_keys, how='left')
item_perf_stats = _avg_placement_by(items_perf, 'item', MIN_APPEARANCES)
print("Top 10 items by lowest avg placement:")
print(item_perf_stats.head(10))
