# 01 Exploratory Analysis

EDA on cleaned TFT match data: quick validation, descriptive stats, distributions, correlations, and early performance patterns.

In [None]:

# Setup
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Locate the project root: the nearest directory, starting at the working
# directory and walking up through its parents, that contains any marker entry.
MARKERS = {"requirements.txt", "Projectplan.md", ".git"}
cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if any((candidate / marker).exists() for marker in MARKERS)
    ),
    cwd,  # no marker found anywhere: fall back to the working directory
)

# Input locations, all derived from the project root
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
CLEANED_DIR = DATA_PROCESSED.joinpath("cleaned")
CANONICAL_DIR = DATA_PROCESSED.joinpath("canonical_original")

# Output locations; both are created up front so later savefig calls cannot
# fail on a missing directory
OUTPUTS_DIR = PROJECT_ROOT.joinpath("outputs")
FIGURES_DIR = OUTPUTS_DIR.joinpath("figures")
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Display / plotting defaults for the rest of the notebook
for option_name, option_value in (("display.max_columns", 100), ("display.width", 180)):
    pd.set_option(option_name, option_value)
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams["figure.dpi"] = 120
sns.set_theme(style="whitegrid", palette="muted")

print(f"Project root: {PROJECT_ROOT}")
print(f"Cleaned dir: {CLEANED_DIR}")


In [None]:

# Load cleaned tables
participants = pd.read_csv(CLEANED_DIR.joinpath("participants.csv"))
traits = pd.read_csv(CLEANED_DIR.joinpath("traits.csv"))
units = pd.read_csv(CLEANED_DIR.joinpath("units.csv"))

# Canonical reference tables used below to validate names in the match data
units_ref = pd.read_csv(CANONICAL_DIR.joinpath("units_s16.csv"))
traits_ref = pd.read_csv(CANONICAL_DIR.joinpath("traits_s16.csv"))
items_ref = pd.read_csv(CANONICAL_DIR.joinpath("items_s16.csv"))

# special_items.csv is optional: when it is absent, use an empty frame that
# still exposes the 'item' and 'type' columns
special_items_path = CLEANED_DIR.joinpath("special_items.csv")
if special_items_path.exists():
    special_items = pd.read_csv(special_items_path)
else:
    special_items = pd.DataFrame(columns=['item','type'])

# Sets of known unit / trait names (missing values ignored). Traits prefer the
# 'name_corrected' column and fall back to 'name' when it does not exist.
valid_units = set(units_ref['name'].dropna())
if 'name_corrected' in traits_ref.columns:
    valid_traits = set(traits_ref['name_corrected'].dropna())
else:
    valid_traits = set(traits_ref['name'].dropna())

# Build the set of valid item names and an item -> type lookup from the
# canonical item table. Columns are processed in file order, so when a name
# appears both in 'name' and in a 'comp*' column the later column wins.
valid_items = set()
item_type_map = {}
for col in items_ref.columns:
    if col == 'name':
        # Take name and type from the SAME rows. Dropping missing names first
        # and then zipping against the unfiltered 'type' column would shift
        # every type after the first missing name onto the wrong item.
        named_rows = items_ref.loc[items_ref['name'].notna(), ['name', 'type']]
        names = named_rows['name'].astype(str).str.strip()
        valid_items.update(names)
        item_type_map.update(zip(names, named_rows['type']))
    elif col.startswith('comp'):
        # Every value found in a 'comp*' column is treated as a component item
        comps = items_ref[col].dropna().astype(str).str.strip()
        valid_items.update(comps)
        item_type_map.update(dict.fromkeys(comps, 'Component'))

# Register special items as valid and give each a type.
if not special_items.empty:
    # Work on rows that actually have an item name, so names and types stay
    # aligned and a missing name never becomes the literal key 'nan'.
    special_rows = special_items.dropna(subset=['item'])
    special_names = special_rows['item'].astype(str).str.strip()
    valid_items.update(special_names)
    if 'type' in special_rows.columns:
        # Missing type -> 'Special'
        special_types = special_rows['type'].where(special_rows['type'].notna(), 'Special')
    else:
        # No 'type' column at all: every special item still gets the default
        # (zipping against an empty list would silently map nothing)
        special_types = pd.Series('Special', index=special_rows.index)
    item_type_map.update(zip(special_names, special_types))


In [None]:

# Flatten items with match_id/puuid for per-player aggregation:
# one output row per (match, player, unit, item slot) that holds an item.
id_cols = ['match_id', 'puuid', 'unit_name']
item_cols = [c for c in units.columns if c.startswith('item_')]

# melt + dropna instead of stack(dropna=True): the `dropna` argument is rejected
# by the new DataFrame.stack implementation (future_stack=True). Keeping the
# source index and stable-sorting on it restores stack's ordering
# (unit row first, then slot in column order).
items_long = (
    units[id_cols + item_cols]
    .melt(id_vars=id_cols, value_vars=item_cols, var_name='slot', value_name='item', ignore_index=False)
    .dropna(subset=['item'])
    .sort_index(kind='stable')
    .reset_index(drop=True)
)

# Trim whitespace on string items only; non-string values pass through untouched
items_long['item'] = items_long['item'].map(lambda v: v.strip() if isinstance(v, str) else v)
# Items without a usable entry in item_type_map are labelled 'Unknown'
items_long['item_type'] = items_long['item'].map(item_type_map).fillna('Unknown')


In [None]:

# Quick validation on cleaned data: collect every row whose unit, trait or item
# name is not in the corresponding reference set, then report the row counts.
unit_is_known = units['unit_name'].isin(valid_units)
trait_is_known = traits['trait_id'].isin(valid_traits)
item_is_known = items_long['item'].isin(valid_items)

invalid_units = units.loc[~unit_is_known]
invalid_traits = traits.loc[~trait_is_known]
invalid_items_rows = items_long.loc[~item_is_known]

print(f"Cleaned invalid units: {len(invalid_units)}")
print(f"Cleaned invalid traits: {len(invalid_traits)}")
print(f"Cleaned invalid items: {len(invalid_items_rows)}")


In [None]:

# Descriptive stats for numeric columns.
# Rows are unit rows: each unit carries its owner's placement and level, so the
# placement/level statistics are weighted by how many units a player fielded.
numeric_cols = ['placement', 'level', 'unit_tier']

# One left merge brings in both participant columns. Selecting from the merged
# frame avoids running the same merge twice and avoids assigning by index
# alignment, which silently misaligns if `units` lacks a default index or the
# merge expands rows.
unit_numeric = units[['match_id', 'puuid', 'unit_tier']].merge(
    participants[['match_id', 'puuid', 'placement', 'level']],
    on=['match_id', 'puuid'],
    how='left',
)[['unit_tier', 'placement', 'level']]

summary_stats = unit_numeric[numeric_cols].describe()
summary_stats


In [None]:

# Trait distribution (frequency of active trait appearances)
trait_counts = traits['trait_id'].value_counts().rename_axis('trait_id').reset_index(name='count')

# Use the same reference column the trait validation uses: prefer
# 'name_corrected', fall back to 'name'. Hard-coding 'name_corrected' raises
# KeyError on reference files that only carry 'name'.
trait_name_col = 'name_corrected' if 'name_corrected' in traits_ref.columns else 'name'
trait_counts = trait_counts.merge(
    traits_ref[[trait_name_col, 'rank']],
    left_on='trait_id',
    right_on=trait_name_col,
    how='left',  # keep traits that have no reference row (their rank is NaN)
)
trait_counts.head(10)


In [None]:

# Item distribution: how often each item appears across all flattened item slots
item_counts = (
    items_long['item']
    .value_counts()
    .rename_axis('item')
    .reset_index(name='count')
)
item_counts.head(10)


In [None]:

# Correlations: use placement_score (higher is better) to align sign
placement_score = 9 - participants['placement']  # 1st place -> 8, 8th -> 1
participants = participants.assign(placement_score=placement_score)

# Per-unit cost features. unit_total_cost is cost multiplied by the unit tier.
# NOTE(review): confirm a linear cost * tier is the intended weighting.
unit_cost_map = dict(zip(units_ref['name'], units_ref['cost']))
units_cost_df = units.copy()
units_cost_df['unit_cost'] = units_cost_df['unit_name'].map(unit_cost_map)
units_cost_df['unit_total_cost'] = units_cost_df['unit_cost'] * units_cost_df['unit_tier']

# Aggregate everything to one row per (match, player)
player_keys = ['match_id', 'puuid']
unit_counts = units.groupby(player_keys).size().rename('units_per_board')
items_per_player = items_long.groupby(player_keys).size().rename('num_of_items')
unit_cost_avg = units_cost_df.groupby(player_keys)['unit_cost'].mean().rename('avg_unit_cost')
unit_total_cost_avg = units_cost_df.groupby(player_keys)['unit_total_cost'].mean().rename('avg_unit_total_cost')

avg_trait_tier = traits.groupby(player_keys)['tier_current'].mean().rename('avg_trait_tier')

# Left merges keep every participant, including those with no rows in a feature table
player_df = (
    participants
    .merge(unit_counts, on=player_keys, how='left')
    .merge(items_per_player, on=player_keys, how='left')
    .merge(unit_cost_avg, on=player_keys, how='left')
    .merge(unit_total_cost_avg, on=player_keys, how='left')
    .merge(avg_trait_tier, on=player_keys, how='left')
)

# A participant absent from a count table has zero of that thing, not a missing
# value. Without this fill, item-less players are dropped from every
# correlation involving num_of_items. Averages stay NaN (undefined with no rows).
count_cols = ['units_per_board', 'num_of_items']
player_df[count_cols] = player_df[count_cols].fillna(0).astype(int)

# Pairwise correlations between the per-player features, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale and saved to the figures directory.
corr_cols = ['placement_score','level','num_of_items','avg_trait_tier','avg_unit_cost','avg_unit_total_cost']
corr_matrix = player_df.loc[:, corr_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(7,6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=corr_ax)
corr_ax.set_title('Correlation matrix (higher placement_score = better)')
corr_fig.tight_layout()

fig_corr_path = FIGURES_DIR / "01_corr_matrix.png"
corr_fig.savefig(fig_corr_path, bbox_inches='tight')
print(f"Saved correlation heatmap to {fig_corr_path}")
corr_matrix


In [None]:

# Win/loss counts per player across matches: games played, summed is_win flag
# and mean placement per puuid, plus the derived win rate and placement score.
outcomes_by_player = participants.groupby('puuid')
player_outcomes = pd.DataFrame({
    'games': outcomes_by_player['placement'].count(),
    'wins': outcomes_by_player['is_win'].sum(),
    'avg_placement': outcomes_by_player['placement'].mean(),
}).reset_index()
player_outcomes = player_outcomes.assign(
    win_rate=player_outcomes['wins'] / player_outcomes['games'],
    placement_score=9 - player_outcomes['avg_placement'],  # same 9 - placement scale as the per-match score
)

outcome_fig, outcome_ax = plt.subplots(figsize=(6,4))
sns.scatterplot(data=player_outcomes, x='win_rate', y='placement_score', alpha=0.5, ax=outcome_ax)
outcome_ax.set_title('Win rate vs placement_score (per player)')
outcome_ax.set_xlabel('Win rate (is_win<=4)')
outcome_ax.set_ylabel('Placement score (higher=better)')
outcome_fig.tight_layout()

player_outcomes_path = FIGURES_DIR / "01_player_winrate_vs_score.png"
outcome_fig.savefig(player_outcomes_path, bbox_inches='tight')
print(f"Saved player outcome plot to {player_outcomes_path}")


In [None]:

# Item type totals (aggregate, not per participant): number of equipped item
# slots per item type over the whole data set, shown as a bar chart.
item_type_totals = (
    items_long['item_type']
    .value_counts()
    .rename_axis('item_type')
    .reset_index(name='count')
)

type_fig, type_ax = plt.subplots(figsize=(8,4))
sns.barplot(data=item_type_totals, x='item_type', y='count', color='seagreen', ax=type_ax)
type_ax.set_title('Total item type counts (all games)')
type_ax.set_xlabel('Item type')
type_ax.set_ylabel('Total count')
type_fig.tight_layout()

item_type_bar_path = FIGURES_DIR / "01_item_type_totals.png"
type_fig.savefig(item_type_bar_path, bbox_inches='tight')
print(f"Saved item type totals plot to {item_type_bar_path}")


In [None]:

# Item count dot plots for items_s16 only (exclude specials/components)
def _save_item_stripplot(counts, title, ylabel, out_path):
    """Draw a strip plot of `count` per `item` and save it to `out_path`.

    Parameters
    ----------
    counts : DataFrame with an 'item' and a 'count' column.
    title, ylabel : text for the figure title and the y axis.
    out_path : destination of the saved image.

    Returns
    -------
    The matplotlib Figure that was drawn and saved.
    """
    strip_fig, strip_ax = plt.subplots(figsize=(14, 5))
    sns.stripplot(data=counts, x='item', y='count', jitter=0.25, alpha=0.6, ax=strip_ax)
    strip_ax.set_title(title)
    strip_ax.set_xlabel('Item')
    strip_ax.set_ylabel(ylabel)
    plt.setp(strip_ax.get_xticklabels(), rotation=90)  # item names are long; keep them readable
    strip_fig.tight_layout()
    strip_fig.savefig(out_path, bbox_inches='tight')
    return strip_fig


# Keep only items listed in the 'name' column of the canonical item table
full_items = set(items_ref['name'].dropna().astype(str).str.strip())
items_long_full = items_long[items_long['item'].isin(full_items)].copy()

# How many copies of each item appear within one match
per_game_counts = items_long_full.groupby(['match_id','item']).size().reset_index(name='count')
per_game_path = FIGURES_DIR / "01_items_per_game_strip.png"
_save_item_stripplot(
    per_game_counts,
    'Item counts per game (items_s16 only)',
    'Count within game',
    per_game_path,
)
print(f"Saved per-game item count plot to {per_game_path}")

# How many copies of each item one participant holds in one match.
# The id is built with vectorised string concatenation: a row-wise
# apply(axis=1) is slow and returns a DataFrame (not a Series) on an empty
# frame, which breaks the column assignment.
items_long_full['player_id'] = (
    items_long_full['match_id'].astype(str) + '_' + items_long_full['puuid'].astype(str)
)
per_part_counts = items_long_full.groupby(['player_id','item']).size().reset_index(name='count')
per_part_path = FIGURES_DIR / "01_items_per_participant_strip.png"
_save_item_stripplot(
    per_part_counts,
    'Item counts per participant (items_s16 only)',
    'Count for participant',
    per_part_path,
)
print(f"Saved per-participant item count plot to {per_part_path}")


In [None]:

# Unit cost per board (per participant per match) using cost from units_s16
unit_cost_sum = units_cost_df.groupby(['match_id','puuid'])['unit_cost'].sum().rename('unit_cost_per_board')
unit_total_cost_sum = units_cost_df.groupby(['match_id','puuid'])['unit_total_cost'].sum().rename('unit_total_cost_per_board')

# Drop any board-cost columns left over from a previous run of this cell before
# merging. Otherwise a re-run creates '_x'/'_y' suffixed duplicates and the
# column lookups below raise KeyError.
board_cost_cols = ['unit_cost_per_board', 'unit_total_cost_per_board']
player_df = (
    player_df
    .drop(columns=board_cost_cols, errors='ignore')
    .merge(unit_cost_sum, on=['match_id','puuid'], how='left')
    .merge(unit_total_cost_sum, on=['match_id','puuid'], how='left')
)

# Side-by-side histograms of the two board-level cost totals
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(player_df['unit_cost_per_board'], bins=20, ax=axes[0])
axes[0].set_title('Unit cost per board')
axes[0].set_xlabel('Sum of unit cost')

sns.histplot(player_df['unit_total_cost_per_board'], bins=20, ax=axes[1])
axes[1].set_title('Unit total cost per board (cost * star)')
axes[1].set_xlabel('Sum of total unit cost')

fig.tight_layout()
fig_cost_path = FIGURES_DIR / "01_unit_costs.png"
fig.savefig(fig_cost_path, bbox_inches='tight')
print(f"Saved unit cost plots to {fig_cost_path}")


In [None]:

# Item heatmap: units vs items (top 25 items)
# Rows are units, columns are the 25 most frequent items, cells are how often
# that unit held that item (0 where the pair never occurs).
item_freq = items_long['item'].value_counts()
top_items = item_freq.head(25).index

top_item_rows = items_long.loc[items_long['item'].isin(top_items)]
unit_item_counts = top_item_rows.groupby(['unit_name','item']).size()
heat_df = unit_item_counts.unstack(fill_value=0)

# Grow the figure with the number of units, but never below 6 inches tall
heat_height = max(6, 0.2 * len(heat_df))
heat_fig, heat_ax = plt.subplots(figsize=(14, heat_height))
sns.heatmap(heat_df, cmap='Blues', cbar_kws={'label': 'count'}, ax=heat_ax)
heat_ax.set_title('Item frequency by unit (top 25 items)')
heat_ax.set_xlabel('Item')
heat_ax.set_ylabel('Unit')
plt.setp(heat_ax.get_xticklabels(), rotation=90)
heat_fig.tight_layout()

heatmap_path = FIGURES_DIR / "01_items_heatmap.png"
heat_fig.savefig(heatmap_path, bbox_inches='tight')
print(f"Saved item heatmap to {heatmap_path}")


In [None]:

# Identify interesting patterns (top units/items by lowest avg placement)
def _avg_placement_by(frame, key):
    """Return appearance count and mean placement per `key`, best (lowest) mean first."""
    stats = frame.groupby(key)['placement'].agg(['count','mean'])
    stats = stats.rename(columns={'mean':'avg_placement'})
    return stats.sort_values('avg_placement')


# Placement of each (match, player), attached to every unit row and item row
placement_lookup = participants[['match_id','puuid','placement']]

unit_perf = units.merge(placement_lookup, on=['match_id','puuid'], how='left')
unit_perf_stats = _avg_placement_by(unit_perf, 'unit_name')
print("Top 10 units by lowest avg placement:")
print(unit_perf_stats.head(10))

items_perf = items_long.merge(placement_lookup, on=['match_id','puuid'], how='left')
item_perf_stats = _avg_placement_by(items_perf, 'item')
print("Top 10 items by lowest avg placement:")
print(item_perf_stats.head(10))
