In [16]:
# Smash Bros Ultimate Frame Data Analysis
# ======================================

# This notebook loads and processes frame data for Super Smash Bros Ultimate
# Based on the SakurAI project structure

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

In [18]:
# Plotting defaults for the whole notebook: seaborn's "whitegrid" theme and
# a 12x8 inch default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

In [19]:
# Directory the CSV files are read from. The path is relative, so it is
# resolved against the current working directory of the kernel.
DATA_DIR = "./data"  # Adjust if needed

In [20]:
# Make sure the data directory is present before any file is loaded.
# NOTE: a freshly created directory is empty, so the loads that follow will
# report missing files until the CSVs are copied into it.
if not os.path.exists(DATA_DIR):
    print(f"Warning: Data directory '{DATA_DIR}' not found")
    
    # Create the directory (and any missing parents); exist_ok avoids an
    # error if it appears between the check above and this call.
    os.makedirs(DATA_DIR, exist_ok=True)
    print(f"Created data directory at '{DATA_DIR}'")

In [21]:
# Function to load and display dataframe info
def load_and_describe_dataframe(file_name, data_dir=None):
    """
    Load a CSV file into a pandas DataFrame and print a summary of it.

    The summary consists of the shape, the first five rows, the column
    dtypes and the number of missing values per column. The data itself is
    returned exactly as ``pd.read_csv`` parsed it; no cleaning is applied.

    Args:
        file_name (str): Name of the CSV file inside the data directory.
        data_dir (str | os.PathLike | None): Directory to read from. When
            None (the default) the notebook-level ``DATA_DIR`` is used, so
            existing single-argument calls behave as before.

    Returns:
        pandas.DataFrame | None: The loaded DataFrame, or None (after
        printing an error line) when no regular file of that name exists
        in the directory.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    file_path = os.path.join(data_dir, file_name)

    # isfile rather than exists: a directory that happens to carry the
    # requested name would pass an existence check and then make
    # pd.read_csv raise instead of taking the "not found" path.
    if not os.path.isfile(file_path):
        print(f"Error: File '{file_name}' not found in '{data_dir}'")
        return None

    # Load the CSV file
    df = pd.read_csv(file_path)

    # Print information about the DataFrame
    separator = '=' * 50
    print(f"\n{separator}")
    print(f"File: {file_name}")
    print(f"Shape: {df.shape[0]} rows x {df.shape[1]} columns")
    print(separator)
    print("\nSample data (first 5 rows):")
    print(df.head())
    print("\nColumn data types:")
    print(df.dtypes)
    print("\nMissing values per column:")
    print(df.isna().sum())

    return df

In [22]:
# Load all dataframes: print a banner line before the individual load calls
print("Loading Smash Bros Ultimate frame data...")

Loading Smash Bros Ultimate frame data...


In [23]:
# Characters dataframe: read characters.csv from DATA_DIR and print its summary.
# The result is None when the file is not found.
characters_df = load_and_describe_dataframe("characters.csv")


File: characters.csv
Shape: 89 rows x 94 columns

Sample data (first 5 rows):
     character_id       file_name hitboxes         name number        value  \
0        01_mario        01_mario      NaN        Mario     01        mario   
1  02_donkey-kong  02_donkey-kong      NaN  Donkey Kong     02  donkey-kong   
2         03_link         03_link      NaN         Link     03         link   
3  04e_dark-samus  04e_dark-samus      NaN   Dark Samus    04e   dark-samus   
4        04_samus        04_samus      NaN        Samus     04        samus   

   internal_id attr_completed  attr_id    attr_name  ... param_WallJumpHSpeed  \
0          1.0           True      1.0        Mario  ...                  1.3   
1          2.0           True      2.0  Donkey Kong  ...                  1.3   
2          3.0           True      3.0         Link  ...                  1.3   
3          5.0           True      5.0   Dark Samus  ...                  1.3   
4          4.0           True      4.0   

In [24]:
# Moves dataframe: read moves.csv from DATA_DIR and print its summary.
# The result is None when the file is not found.
moves_df = load_and_describe_dataframe("moves.csv")


File: moves.csv
Shape: 3621 rows x 12 columns

Sample data (first 5 rows):
  character_id  internal_id  move_index complete  faf  frames grabs hurtboxes  \
0     01_mario          1.0           0      NaN   19      19   NaN       NaN   
1     01_mario          1.0           1      NaN   21      21   NaN       NaN   
2     01_mario          1.0           2      NaN   33      33   NaN       NaN   
3     01_mario          1.0           3      NaN   25      25   NaN       NaN   
4     01_mario          1.0           4      NaN   25      25   NaN       NaN   

                name notes type         value  
0              Jab 1   NaN  NaN     MarioJab1  
1              Jab 2   NaN  NaN     MarioJab2  
2              Jab 3   NaN  NaN     MarioJab3  
3       Forward Tilt   NaN  NaN    MarioFTilt  
4  Forward Tilt (Up)   NaN  NaN  MarioFTiltUp  

Column data types:
character_id     object
internal_id     float64
move_index        int64
complete         object
faf               int64
frames   

In [25]:
# Hitboxes dataframe: read hitboxes.csv from DATA_DIR and print its summary.
# The result is None when the file is not found.
hitboxes_df = load_and_describe_dataframe("hitboxes.csv")


File: hitboxes.csv
Shape: 11408 rows x 48 columns

Sample data (first 5 rows):
  character_id  internal_id  move_index  hitbox_index move_name absorbable  \
0     01_mario          1.0           0             0     Jab 1      False   
1     01_mario          1.0           0             1     Jab 1      False   
2     01_mario          1.0           0             2     Jab 1      False   
3     01_mario          1.0           0             3     Jab 1      False   
4     01_mario          1.0           1             0     Jab 2      False   

   angle   bkb       bone          clang_rebound  ... status trip  \
0  361.0  20.0        top  attack_setoff_kind_on  ...    NaN  0.0   
1  361.0  20.0        top  attack_setoff_kind_on  ...    NaN  0.0   
2  180.0  15.0        top  attack_setoff_kind_on  ...    NaN  0.0   
3  361.0  15.0        top  attack_setoff_kind_on  ...    NaN  0.0   
4  361.0  20.0  shoulderr  attack_setoff_kind_on  ...    NaN  0.0   

                  type  unk    x   x

In [26]:
# Throws dataframe: read throws.csv from DATA_DIR and print its summary.
# The result is None when the file is not found.
throws_df = load_and_describe_dataframe("throws.csv")


File: throws.csv
Shape: 392 rows x 43 columns

Sample data (first 5 rows):
     character_id  internal_id  move_index  throw_index      move_name  \
0        01_mario          1.0          24            0  Forward Throw   
1        01_mario          1.0          25            0     Back Throw   
2        01_mario          1.0          26            0       Up Throw   
3        01_mario          1.0          27            0     Down Throw   
4  02_donkey-kong          2.0          20            0     Back Throw   

  absorbable  angle  bkb bone clang_rebound  ...              sfxlevel  \
0        NaN     45   60  NaN           NaN  ...  attack_sound_level_s   
1        NaN     45   70  NaN           NaN  ...  attack_sound_level_s   
2        NaN     90   70  NaN           NaN  ...  attack_sound_level_s   
3        NaN     68   40  NaN           NaN  ...  attack_sound_level_s   
4        NaN     40   60  NaN           NaN  ...  attack_sound_level_s   

                     sfxtype  shie

In [27]:
# Basic data cleaning function
def clean_dataframe(df, name):
    """
    Perform basic cleaning operations on a DataFrame.

    Steps:
      1. Copy the input so the caller's DataFrame is left untouched.
      2. Normalise column names to lowercase with underscores for spaces.
      3. Apply table-specific fixes selected by ``name``:
         - "moves": coerce ``autocancel1`` / ``autocancel2`` to numeric.
         - "hitboxes": coerce ``damage`` to numeric.
         - "characters" / "throws": nothing yet.
         Values that cannot be parsed become NaN.
      4. Print a short report, including every column that still has
         missing values and the percentage of rows affected.

    Args:
        df (pandas.DataFrame | None): DataFrame to clean.
        name (str): Name of the DataFrame; selects the table-specific step
            and is used in the printed report.

    Returns:
        pandas.DataFrame | None: Cleaned copy, or None when ``df`` is None.
    """
    if df is None:
        return None

    print(f"\n\nCleaning {name} DataFrame...")

    # 1. Create a copy to avoid warnings and to keep the input unchanged
    df_clean = df.copy()

    # 2. Convert column names to lowercase and replace spaces with underscores.
    #    str() keeps non-string labels (e.g. integer headers) from crashing.
    df_clean.columns = [str(col).lower().replace(' ', '_') for col in df_clean.columns]

    # 3. Handle specific data types based on DataFrame
    if name == "characters":
        # Character-specific cleaning
        pass

    elif name == "moves":
        # Coerce each auto-cancel column on its own: checking only for
        # 'autocancel1' and then indexing 'autocancel2' raises KeyError
        # when the second column is absent.
        for column in ('autocancel1', 'autocancel2'):
            if column in df_clean.columns:
                df_clean[column] = pd.to_numeric(df_clean[column], errors='coerce')

    elif name == "hitboxes":
        # Convert damage to numeric
        if 'damage' in df_clean.columns:
            df_clean['damage'] = pd.to_numeric(df_clean['damage'], errors='coerce')

        # Fix move types for Incineroar as mentioned in ISSUES.md
        if 'type' in df_clean.columns:
            # TODO: populate this once the exact issue is known
            pass

    elif name == "throws":
        # Throw-specific cleaning
        pass

    # 4. Report the changes
    print("  - Standardized column names")
    print("  - Applied dataframe-specific cleaning")

    # 5. Report missing data after cleaning
    missing_data = df_clean.isna().sum()
    missing_columns = missing_data[missing_data > 0]
    if not missing_columns.empty:
        print("\n  Columns with missing values after cleaning:")
        # A non-empty report implies at least one row, so the division is safe
        total_rows = len(df_clean)
        for col, count in missing_columns.items():
            percent = (count / total_rows) * 100
            print(f"  - {col}: {count} missing values ({percent:.2f}%)")
    else:
        print("\n  No missing values after cleaning")

    return df_clean

# Run the cleaning pass over every loaded table (None stays None)
characters_clean = clean_dataframe(characters_df, "characters")
moves_clean = clean_dataframe(moves_df, "moves")
hitboxes_clean = clean_dataframe(hitboxes_df, "hitboxes")
throws_clean = clean_dataframe(throws_df, "throws")

# Print the roster as a column-major grid, sorted alphabetically
if characters_clean is not None:
    print("\n\nCharacters in the dataset:")

    # Prefer the display name; fall back to the id column; else nothing to list
    label_column = next(
        (candidate for candidate in ('name', 'character_id')
         if candidate in characters_clean.columns),
        None,
    )
    if label_column is None:
        character_list = []
    else:
        character_list = characters_clean[label_column].sort_values().tolist()

    # Grid geometry: 4 columns, rows = ceil(num_chars / num_cols)
    num_cols = 4
    num_chars = len(character_list)
    chars_per_col = -(-num_chars // num_cols)

    for i in range(chars_per_col):
        # Entry (i, j) sits at i + j * chars_per_col; slots past the end of
        # the list are left empty.
        row = [
            character_list[i + j * chars_per_col].ljust(20)
            if i + j * chars_per_col < num_chars
            else ''
            for j in range(num_cols)
        ]
        print(''.join(row))

# Now that we have loaded and cleaned the data, we can begin analysis
print("\n\nData loading and cleaning complete. Ready for analysis!")



Cleaning characters DataFrame...
  - Standardized column names
  - Applied dataframe-specific cleaning

  Columns with missing values after cleaning:
  - hitboxes: 88 missing values (98.88%)
  - internal_id: 5 missing values (5.62%)
  - attr_completed: 5 missing values (5.62%)
  - attr_id: 5 missing values (5.62%)
  - attr_name: 5 missing values (5.62%)
  - attr_number: 5 missing values (5.62%)
  - attr_series: 5 missing values (5.62%)
  - attr_value: 5 missing values (5.62%)
  - attr_version: 5 missing values (5.62%)
  - param_walkspeed: 8 missing values (8.99%)
  - param_walkaddacceleration: 8 missing values (8.99%)
  - param_walkbaseacceleration: 8 missing values (8.99%)
  - param_groundfriction: 8 missing values (8.99%)
  - param_dashinitialspeed: 8 missing values (8.99%)
  - param_runaddacceleration: 8 missing values (8.99%)
  - param_runbaseacceleration: 8 missing values (8.99%)
  - param_runspeed: 8 missing values (8.99%)
  - param_jumpsquat: 8 missing values (8.99%)
  - param