# 01 Exploratory Analysis

Initial EDA on processed TFT match data: descriptive stats, trait/item distributions, and simple correlations.


In [None]:

# Setup
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Locate the project root: the nearest directory (starting at the working directory and
# walking up) that contains one of the marker files. Falls back to the working directory.
MARKERS = {"requirements.txt", "Projectplan.md", ".git"}
cwd = Path.cwd()
PROJECT_ROOT = next(
    (path for path in [cwd, *cwd.parents] if any((path / m).exists() for m in MARKERS)),
    cwd,
)

# Pick the processed-data directory: prefer the canonical export, otherwise the first
# sub-directory (in sorted order) that contains participants.csv.
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DEFAULT_CANONICAL = DATA_PROCESSED / "canonical_original"
if DEFAULT_CANONICAL.exists():
    PROCESSED_DIR = DEFAULT_CANONICAL
elif DATA_PROCESSED.is_dir():
    # Sorted so the fallback does not depend on the filesystem's listing order.
    candidates = sorted(p for p in DATA_PROCESSED.iterdir() if (p / "participants.csv").exists())
    PROCESSED_DIR = candidates[0] if candidates else DATA_PROCESSED
else:
    # data/processed is missing entirely; iterdir() would raise here. Keep the expected
    # location so the load cell fails with a clear "file not found" path instead.
    PROCESSED_DIR = DATA_PROCESSED

DATA_RAW = PROJECT_ROOT / "data" / "raw"
OUTPUTS_DIR = PROJECT_ROOT / "outputs"
FIGURES_DIR = OUTPUTS_DIR / "figures"
for out_dir in (OUTPUTS_DIR, FIGURES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Display and plotting defaults for the rest of the notebook.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 180)
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 120})
sns.set_theme(style="whitegrid", palette="muted")

print(f"Project root: {PROJECT_ROOT}")
print(f"Processed dir: {PROCESSED_DIR}")


In [None]:

# Load processed canonical tables
# Match-level tables come from the processed directory chosen in the setup cell.
participants, traits, units = (
    pd.read_csv(PROCESSED_DIR / filename)
    for filename in ("participants.csv", "traits.csv", "units.csv")
)
# Reference tables are read from the raw data directory.
items_ref, traits_ref = (
    pd.read_csv(DATA_RAW / filename)
    for filename in ("items_s16.csv", "traits_s16.csv")
)


In [None]:

# Descriptive stats for numeric columns
# One row per unit: unit_tier comes from `units`, placement and level are looked up from
# the owning participant. NOTE: because rows are units, placement/level statistics are
# weighted by how many units each board fielded.
numeric_cols = ['placement', 'level', 'unit_tier']
unit_numeric = units[['match_id', 'puuid', 'unit_tier']].merge(
    participants[['match_id', 'puuid', 'placement', 'level']],
    on=['match_id', 'puuid'],
    how='left',
    # Each unit must match at most one participant row; duplicated participant keys
    # would otherwise multiply unit rows and silently skew the statistics.
    validate='many_to_one',
)[['unit_tier', 'placement', 'level']]

summary_stats = unit_numeric[numeric_cols].describe()
summary_stats


In [None]:

# Trait distribution (frequency of active trait appearances)
# Count rows per trait_id, most frequent first.
trait_counts = (
    traits['trait_id']
    .value_counts()
    .rename_axis('trait_id')
    .reset_index(name='count')
)
# Left-join against the reference names; the `_merge` indicator column shows which
# trait ids have a matching entry in traits_ref.
trait_counts = trait_counts.merge(
    traits_ref[['name']],
    left_on='trait_id',
    right_on='name',
    how='left',
    indicator=True,
)
trait_counts.head(10)


In [None]:

# Item distribution (flatten item slots)
# Reshape the item_* slot columns of `units` into one row per equipped item. The board
# keys (match_id, puuid) are carried along so later cells can aggregate per player and
# join the result to `participants`.
item_cols = [c for c in units.columns if c.startswith('item_')]
items_long = (
    units.melt(
        id_vars=['match_id', 'puuid', 'unit_name'],
        value_vars=item_cols,
        var_name='slot',
        value_name='item',
        ignore_index=False,
    )
    .dropna(subset=['item'])      # drop empty slots
    .sort_index(kind='stable')    # restore unit order, slots in column order within a unit
    .reset_index(drop=True)
)
items_long['item'] = items_long['item'].astype(str)

# Frequency of each item across all units and slots, most frequent first.
item_counts = items_long['item'].value_counts().reset_index()
item_counts.columns = ['item','count']
item_counts.head(10)


In [None]:

# Placement correlation with basic player/unit features
# Every feature is aggregated to one row per board, i.e. per (match_id, puuid).
board_keys = ['match_id', 'puuid']
unit_counts = units.groupby(board_keys).size().rename('units_per_board')
# Count filled item slots straight from `units`, which always carries the board keys.
# A board that fielded units but no items gets 0 here rather than a missing value.
item_slots_per_player = (
    units.assign(items_per_board=units[item_cols].notna().sum(axis=1))
    .groupby(board_keys)['items_per_board']
    .sum()
)
unit_tier_avg = units.groupby(board_keys)['unit_tier'].mean().rename('avg_unit_tier')

# Left joins keep every participant; participants without any unit rows get NaN features.
player_df = (
    participants
    .merge(unit_counts, on=board_keys, how='left')
    .merge(item_slots_per_player, on=board_keys, how='left')
    .merge(unit_tier_avg, on=board_keys, how='left')
)

corr_cols = ['placement','level','units_per_board','items_per_board','avg_unit_tier']
corr_matrix = player_df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix')
fig.tight_layout()
fig_corr_path = FIGURES_DIR / "01_corr_matrix.png"
fig.savefig(fig_corr_path, bbox_inches='tight')
print(f"Saved correlation heatmap to {fig_corr_path}")
corr_matrix


In [None]:

# Item heatmap: units vs items (all slots treated as entries, filtered to items in items_s16)
# Only entries whose item name appears in the reference table are kept.
valid_items = set(items_ref['name'].dropna())
items_long_valid = items_long.loc[lambda df: df['item'].isin(valid_items)]

# Restrict the heatmap to the 25 most frequent valid items.
item_totals = items_long_valid['item'].value_counts()
top_items = item_totals.head(25).index
heat_df = (
    items_long_valid.loc[lambda df: df['item'].isin(top_items)]
    .groupby(['unit_name','item'])
    .size()
    .unstack(fill_value=0)
)

# Figure height grows with the number of units so row labels stay readable.
plt.figure(figsize=(14, max(6, 0.2 * len(heat_df))))
heat_ax = sns.heatmap(heat_df, cmap='Blues', cbar_kws={'label': 'count'})
heat_ax.set_title('Item frequency by unit (top 25 items)')
heat_ax.set_xlabel('Item')
heat_ax.set_ylabel('Unit')
plt.xticks(rotation=90)
plt.tight_layout()

heatmap_path = FIGURES_DIR / "01_items_heatmap.png"
plt.savefig(heatmap_path, bbox_inches='tight')
print(f"Saved item heatmap to {heatmap_path}")


In [None]:

# Identify interesting patterns (top units by avg placement, top items by avg placement)
# Lower placement is better, so both tables are sorted ascending by average placement.
# NOTE: no minimum-count filter is applied; rarely seen units/items can top the list.
placement_lookup = participants[['match_id','puuid','placement']]

unit_perf = units.merge(placement_lookup, on=['match_id','puuid'], how='left')
unit_perf_stats = (
    unit_perf.groupby('unit_name')['placement']
    .agg(['count','mean'])
    .rename(columns={'mean':'avg_placement'})
    .sort_values('avg_placement')
)
print("Top 10 units by lowest avg placement:")
print(unit_perf_stats.head(10))

# Build the per-item rows from `units` here so the board keys needed for the join to
# `participants` are guaranteed to be present; then keep only items from the reference table.
items_keyed = (
    units.melt(
        id_vars=['match_id', 'puuid', 'unit_name'],
        value_vars=item_cols,
        var_name='slot',
        value_name='item',
    )
    .dropna(subset=['item'])
    .assign(item=lambda df: df['item'].astype(str))
    .loc[lambda df: df['item'].isin(valid_items)]
)
items_perf = items_keyed.merge(placement_lookup, on=['match_id','puuid'], how='left')
item_perf_stats = (
    items_perf.groupby('item')['placement']
    .agg(['count','mean'])
    .rename(columns={'mean':'avg_placement'})
    .sort_values('avg_placement')
)
print("Top 10 items by lowest avg placement:")
print(item_perf_stats.head(10))
