# Feature engineering for the ML model  

## Introduction
This notebook will be used to create all the input features for the ML model, and bring them together in one dataframe table

In [1]:
import requests
import os
import pandas as pd
import numpy
from dotenv import load_dotenv
import logging

logging.basicConfig(level=logging.INFO)

In [2]:
load_dotenv()
API_KEY = os.getenv("RAPIDAPI_KEY")
API_HOST = "v2.nba.api-sports.io"
BASE_URL = "https://v2.nba.api-sports.io"

HEADERS = {
    "X-RapidAPI-Key": API_KEY,
    "X-RapidAPI-Host": API_HOST
}

log = logging.getLogger(__name__)

def make_api_request(endpoint, params=None):
    """Helper function to make API requests with error handling"""
    url = f"{BASE_URL}/{endpoint}"
    try:
        response = requests.get(url, headers=HEADERS, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        log.error(f"Error making request to {endpoint}: {e}")
        return None

log.info("API configuration complete!")
log.info(f"Base URL: {BASE_URL}")

INFO:__main__:API configuration complete!
INFO:__main__:Base URL: https://v2.nba.api-sports.io


In [3]:
seasons_data = make_api_request("seasons/")
log.info("Fetched seasons data: %s", seasons_data)
if seasons_data and 'response' in seasons_data:
    seasons = seasons_data['response']
    seasons = sorted(seasons, reverse=True)
    seasons = seasons[1:]  # Exclude the most recent season
else:
    DEFAULT_SEASONS = [2022, 2021, 2020, 2019, 2018]
    log.info(f"⚠️ Could not fetch seasons. Using default: {DEFAULT_SEASONS}")

INFO:__main__:Fetched seasons data: {'get': 'seasons/', 'parameters': [], 'errors': [], 'results': 11, 'response': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]}


In [4]:
games_by_season = {}

for season in seasons:
    games_data = make_api_request("games", params={
        "season": season,
        "league": "Standard"
    })
    
    if games_data and 'response' in games_data:
        games_list = games_data['response']
        game_ids = [game['id'] for game in games_list if game['scores']['home']['points'] is not None]
        games_by_season[season] = game_ids
        log.info(f"Season {season}: Loaded {len(game_ids)} completed games")
    else:
        log.warning(f"Could not fetch games data for season {season}")
        games_by_season[season] = []

log.info(f"Total seasons processed: {len(games_by_season)}")


INFO:__main__:Season 2024: Loaded 1392 completed games
INFO:__main__:Season 2023: Loaded 1386 completed games
INFO:__main__:Season 2022: Loaded 1391 completed games
INFO:__main__:Season 2021: Loaded 1390 completed games
INFO:__main__:Season 2020: Loaded 1221 completed games
INFO:__main__:Season 2019: Loaded 1257 completed games
INFO:__main__:Season 2018: Loaded 1393 completed games
INFO:__main__:Season 2017: Loaded 1392 completed games
INFO:__main__:Season 2016: Loaded 1413 completed games
INFO:__main__:Season 2015: Loaded 1427 completed games
INFO:__main__:Total seasons processed: 10


In [5]:
# Iterate through all games in all seasons and fetch player statistics
player_stats_all_games = []
season = seasons[0] 
game_ids = [games_by_season[season][0]]  # Process only the first game of the first season for demonstration
log.info(f"Processing {len(game_ids)} games from season {season}")

for game_id in game_ids:
    player_stats_data = make_api_request("players/statistics", params={
        "game": game_id
    })
    
    if player_stats_data and 'response' in player_stats_data:
        player_stats_list = player_stats_data['response']
        
        # Extract detailed statistics
        for stat in player_stats_list:
            record = {
                'game_id': game_id,
                'season': season,
                'player_id': stat['player']['id'],
                'player_name': stat['player']['firstname'] + ' ' + stat['player']['lastname'],
                'team_id': stat['team']['id'],
                'team_name': stat['team']['name'],
                'position': stat.get('pos', 'N/A'),
                'minutes': stat.get('min', '0'),
                'points': stat.get('points', 0),
                'total_rebounds': stat.get('totReb', 0),
                'assists': stat.get('assists', 0),
                'steals': stat.get('steals', 0),
                'blocks': stat.get('blocks', 0),
                'turnovers': stat.get('turnovers', 0),
                'personal_fouls': stat.get('pFouls', 0),
                'fg_made': stat.get('fgm', 0),
                'fg_attempted': stat.get('fga', 0),
                'fg_pct': stat.get('fgp', '0'),
                'three_made': stat.get('tpm', 0),
                'three_attempted': stat.get('tpa', 0),
                'three_pct': stat.get('tpp', '0'),
                'ft_made': stat.get('ftm', 0),
                'ft_attempted': stat.get('fta', 0),
                'ft_pct': stat.get('ftp', '0'),
                'offensive_reb': stat.get('offReb', 0),
                'defensive_reb': stat.get('defReb', 0),
                'plus_minus': stat.get('plusMinus', '0'),
            }
            player_stats_all_games.append(record)
    else:
        log.warning(f"Could not fetch player statistics for game {game_id}")

player_stats_df = pd.DataFrame(player_stats_all_games)
log.info(f"Total player records fetched: {len(player_stats_df)}")

# TODO: Create aggregated features (total points, total steals, etc.) for each player


INFO:__main__:Processing 1 games from season 2024
INFO:__main__:Total player records fetched: 36


# Sanika's Section - Team Aggregations

## Objective
Aggregate player-level statistics to team-level features for each game. This will create the input features for our ML model.

In [6]:
# Step 1: Inspect the player-level data
log.info("=== Data Inspection ===")
log.info(f"Total records: {len(player_stats_df)}")
log.info(f"Columns: {list(player_stats_df.columns)}")
log.info(f"\nData types:\n{player_stats_df.dtypes}")
log.info(f"\nUnique games: {player_stats_df['game_id'].nunique()}")
log.info(f"Unique teams: {player_stats_df['team_id'].nunique()}")

# Display sample data
print("\n=== Sample Player Data ===")
player_stats_df.head()

INFO:__main__:=== Data Inspection ===
INFO:__main__:Total records: 36
INFO:__main__:Columns: ['game_id', 'season', 'player_id', 'player_name', 'team_id', 'team_name', 'position', 'minutes', 'points', 'total_rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'fg_made', 'fg_attempted', 'fg_pct', 'three_made', 'three_attempted', 'three_pct', 'ft_made', 'ft_attempted', 'ft_pct', 'offensive_reb', 'defensive_reb', 'plus_minus']
INFO:__main__:
Data types:
game_id             int64
season              int64
player_id           int64
player_name        object
team_id             int64
team_name          object
position           object
minutes            object
points              int64
total_rebounds      int64
assists             int64
steals              int64
blocks              int64
turnovers           int64
personal_fouls      int64
fg_made             int64
fg_attempted        int64
fg_pct             object
three_made          int64
three_attempted     int64
three


=== Sample Player Data ===


Unnamed: 0,game_id,season,player_id,player_name,team_id,team_name,position,minutes,points,total_rebounds,...,fg_pct,three_made,three_attempted,three_pct,ft_made,ft_attempted,ft_pct,offensive_reb,defensive_reb,plus_minus
0,14045,2024,195,Aaron Gordon,9,Denver Nuggets,PF,15,7,3,...,30.0,1,4,25.0,0,0,0.0,1,2,0
1,14045,2024,1014,Michael Porter Jr.,9,Denver Nuggets,SF,15,12,2,...,62.5,1,3,33.3,1,2,50.0,1,1,2
2,14045,2024,279,Nikola Jokic,9,Denver Nuggets,C,17,14,8,...,71.4,0,0,0.0,4,5,80.0,3,5,4
3,14045,2024,383,Jamal Murray,9,Denver Nuggets,PG,15,3,1,...,25.0,1,3,33.3,0,0,0.0,0,1,0
4,14045,2024,3420,Christian Braun,9,Denver Nuggets,G,13,5,5,...,50.0,0,1,0.0,1,2,50.0,1,4,7


In [7]:
# Step 2: Data Preparation - Convert string percentages to floats
log.info("=== Data Preparation ===")

# Create a copy to avoid modifying original
df_clean = player_stats_df.copy()

# Convert percentage columns from string to float
percentage_cols = ['fg_pct', 'three_pct', 'ft_pct']
for col in percentage_cols:
    # Convert string percentage to float (e.g., "75.5" -> 75.5)
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Convert plus_minus to numeric
df_clean['plus_minus'] = pd.to_numeric(df_clean['plus_minus'], errors='coerce')

# Fill NaN values with 0 for percentage columns (0% if no attempts)
df_clean[percentage_cols] = df_clean[percentage_cols].fillna(0)
df_clean['plus_minus'] = df_clean['plus_minus'].fillna(0)

log.info("Data types after conversion:")
log.info(f"\n{df_clean[['fg_pct', 'three_pct', 'ft_pct', 'plus_minus']].dtypes}")
log.info(f"\nNull values:\n{df_clean.isnull().sum()[df_clean.isnull().sum() > 0]}")

print("✓ Data preparation complete!")

INFO:__main__:=== Data Preparation ===
INFO:__main__:Data types after conversion:
INFO:__main__:
fg_pct        float64
three_pct     float64
ft_pct        float64
plus_minus      int64
dtype: object
INFO:__main__:
Null values:
Series([], dtype: int64)


✓ Data preparation complete!


In [8]:
# Step 3: Define aggregation rules based on requirements
log.info("=== Creating Team-Level Aggregations ===")

# Define aggregation dictionary
# Based on the requirements:
# - ft_pct: mean (cannot sum percentages)
# - fg_made, fg_attempted, three_made, three_attempted, ft_made: sum + mean
# - Additional stats: points, rebounds, assists, steals, blocks, turnovers, fouls

aggregation_dict = {
    # Shooting metrics (from requirements)
    'ft_pct': 'mean',  # Team free throw efficiency
    'fg_made': ['sum', 'mean'],  # Total and distribution of field goals
    'fg_attempted': ['sum', 'mean'],  # Total volume and shot distribution
    'three_made': ['sum', 'mean'],  # Total three-pointers and distribution
    'three_attempted': ['sum', 'mean'],  # Three-point strategy and distribution
    'ft_made': ['sum', 'mean'],  # Total free throw points and distribution
    # Keep identifiers
    'team_name': 'first',
    'season': 'first'
}

log.info(f"Aggregation rules defined for {len(aggregation_dict)} variables")
print("✓ Aggregation rules created!")

INFO:__main__:=== Creating Team-Level Aggregations ===
INFO:__main__:Aggregation rules defined for 8 variables


✓ Aggregation rules created!


In [9]:
# Step 4: Perform aggregation - Group by game_id and team_id
log.info("=== Aggregating Player Stats to Team Level ===")

# Group by game and team to get team-level stats
team_features_df = df_clean.groupby(['game_id', 'team_id']).agg(aggregation_dict).reset_index()

# Flatten multi-level column names
# e.g., ('fg_made', 'sum') becomes 'fg_made_sum'
team_features_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col 
                             for col in team_features_df.columns.values]

# Rename columns for clarity (remove trailing underscores from single-agg columns)
team_features_df.columns = [col.replace('_first', '') for col in team_features_df.columns]

log.info(f"Team features created: {team_features_df.shape[0]} rows, {team_features_df.shape[1]} columns")
log.info(f"Columns: {list(team_features_df.columns)}")

print(f"\n✓ Team-level aggregation complete!")
print(f"   - Games: {team_features_df['game_id'].nunique()}")
print(f"   - Teams: {team_features_df['team_id'].nunique()}")
print(f"   - Total records: {len(team_features_df)}")

INFO:__main__:=== Aggregating Player Stats to Team Level ===
INFO:__main__:Team features created: 2 rows, 15 columns
INFO:__main__:Columns: ['game_id', 'team_id', 'ft_pct_mean', 'fg_made_sum', 'fg_made_mean', 'fg_attempted_sum', 'fg_attempted_mean', 'three_made_sum', 'three_made_mean', 'three_attempted_sum', 'three_attempted_mean', 'ft_made_sum', 'ft_made_mean', 'team_name', 'season']



✓ Team-level aggregation complete!
   - Games: 1
   - Teams: 2
   - Total records: 2


In [10]:
# Step 5: Validation and Display Results
log.info("=== Validation ===")

# Check that we have exactly 2 teams per game
teams_per_game = team_features_df.groupby('game_id').size()
log.info(f"Teams per game: {teams_per_game.values}")

# Display the key metrics from requirements
required_metrics = ['ft_pct_mean', 'fg_made_sum', 'fg_made_mean', 'fg_attempted_sum', 
                   'fg_attempted_mean', 'three_made_sum', 'three_made_mean',
                   'three_attempted_sum', 'three_attempted_mean', 'ft_made_sum', 'ft_made_mean']

log.info("\n=== Required Metrics (from your table) ===")
if all(col in team_features_df.columns for col in required_metrics):
    log.info("✓ All required metrics present!")
else:
    missing = [col for col in required_metrics if col not in team_features_df.columns]
    log.info(f"⚠ Missing metrics: {missing}")

log.info("=== Full Team Features Sample ===")

INFO:__main__:=== Validation ===
INFO:__main__:Teams per game: [2]
INFO:__main__:
=== Required Metrics (from your table) ===
INFO:__main__:✓ All required metrics present!
INFO:__main__:=== Full Team Features Sample ===


In [11]:
# Display all team-level aggregated features
print("=" * 80)
print("SANIKA'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES")
print("=" * 80)

# Display basic info
print(f"\nDataset Overview:")
print(f"  - Total teams analyzed: {len(team_features_df)}")
print(f"  - Total games: {team_features_df['game_id'].nunique()}")
print(f"  - Season: {team_features_df['season'].iloc[0]}")
print(f"  - Total features: {len(team_features_df.columns)}")

# Display each team's aggregates
for idx, row in team_features_df.iterrows():
    print("\n" + "=" * 80)
    print(f"TEAM {idx + 1}: {row['team_name']} (Team ID: {row['team_id']})")
    print("=" * 80)
    print(f"Game ID: {row['game_id']} | Season: {row['season']}")
    
    print("\n--- REQUIRED SHOOTING METRICS ---")
    print(f"  Free Throw Percentage (Mean):     {row['ft_pct_mean']:.2f}%")
    print(f"  Field Goals Made (Sum):            {row['fg_made_sum']}")
    print(f"  Field Goals Made (Mean):           {row['fg_made_mean']:.2f}")
    print(f"  Field Goals Attempted (Sum):       {row['fg_attempted_sum']}")
    print(f"  Field Goals Attempted (Mean):      {row['fg_attempted_mean']:.2f}")
    print(f"  Three-Pointers Made (Sum):         {row['three_made_sum']}")
    print(f"  Three-Pointers Made (Mean):        {row['three_made_mean']:.2f}")
    print(f"  Three-Pointers Attempted (Sum):    {row['three_attempted_sum']}")
    print(f"  Three-Pointers Attempted (Mean):   {row['three_attempted_mean']:.2f}")
    print(f"  Free Throws Made (Sum):            {row['ft_made_sum']}")
    print(f"  Free Throws Made (Mean):           {row['ft_made_mean']:.2f}")

print("\n" + "=" * 80)
print("END OF AGGREGATED ATTRIBUTES")
print("=" * 80)

SANIKA'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES

Dataset Overview:
  - Total teams analyzed: 2
  - Total games: 1
  - Season: 2024
  - Total features: 15

TEAM 1: Boston Celtics (Team ID: 2)
Game ID: 14045 | Season: 2024

--- REQUIRED SHOOTING METRICS ---
  Free Throw Percentage (Mean):     29.17%
  Field Goals Made (Sum):            37
  Field Goals Made (Mean):           2.06
  Field Goals Attempted (Sum):       98
  Field Goals Attempted (Mean):      5.44
  Three-Pointers Made (Sum):         20
  Three-Pointers Made (Mean):        1.11
  Three-Pointers Attempted (Sum):    61
  Three-Pointers Attempted (Mean):   3.39
  Free Throws Made (Sum):            13
  Free Throws Made (Mean):           0.72

TEAM 2: Denver Nuggets (Team ID: 9)
Game ID: 14045 | Season: 2024

--- REQUIRED SHOOTING METRICS ---
  Free Throw Percentage (Mean):     36.39%
  Field Goals Made (Sum):            37
  Field Goals Made (Mean):           2.06
  Field Goals Attempted (Sum):       86
  Field Goals Attemp

In [12]:
# Display summary table of all aggregated attributes
print("\n\n" + "=" * 100)
print("COMPLETE ATTRIBUTE SUMMARY TABLE")
print("=" * 100)

# Show full dataframe with all aggregates
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("\nAll Team-Level Aggregated Features:")
team_features_df



COMPLETE ATTRIBUTE SUMMARY TABLE

All Team-Level Aggregated Features:


Unnamed: 0,game_id,team_id,ft_pct_mean,fg_made_sum,fg_made_mean,fg_attempted_sum,fg_attempted_mean,three_made_sum,three_made_mean,three_attempted_sum,three_attempted_mean,ft_made_sum,ft_made_mean,team_name,season
0,14045,2,29.166667,37,2.055556,98,5.444444,20,1.111111,61,3.388889,13,0.722222,Boston Celtics,2024
1,14045,9,36.388889,37,2.055556,86,4.777778,12,0.666667,32,1.777778,17,0.944444,Denver Nuggets,2024


## Aditya's features

I had to cover the following variables:
- ft_attempted (direct from API response)
- minutes (direct from API response)
- assist_to_turnover_ratio (formula)
- three_point_rate (formula)
- offensive_reb_rate (formula)
- true_shooting_pct (formula)

In [13]:
def convert_minutes_to_numeric(minutes_str):
    """
    Convert minutes from 'MM:SS' format to decimal minutes.
    Example: '25:30' -> 25.5
    """
    if pd.isna(minutes_str) or minutes_str == '0' or minutes_str == 0:
        return 0.0
    
    try:
        if isinstance(minutes_str, str) and ':' in minutes_str:
            parts = minutes_str.split(':')
            mins = float(parts[0])
            secs = float(parts[1]) if len(parts) > 1 else 0
            return mins + (secs / 60.0)
        else:
            return float(minutes_str)
    except:
        return 0.0

In [14]:
# Make a copy of player stats
df = player_stats_df.copy()

# Convert minutes to numeric (handle 'MM:SS' format from API)
df['minutes_numeric'] = df['minutes'].apply(convert_minutes_to_numeric)

# Convert other fields to numeric (in case they're strings)
numeric_cols = ['ft_attempted', 'assists', 'turnovers', 'three_made', 
                'fg_made', 'offensive_reb', 'total_rebounds', 'points', 'fg_attempted']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Filter out players who didn't play (DNP)
df_played = df[df['minutes_numeric'] > 0].copy()

# Group by game and team, compute aggregations
agg_dict_aditya = {
    # Free throw attempts aggregations
    'ft_attempted': ['sum', 'mean'],
    
    # Minutes aggregations
    'minutes_numeric': ['sum', 'mean', 'max'],
    
    # Additional fields needed for engineered features
    # these are for the last 4 variables labeled as 'formula'
    'assists': 'sum',
    'turnovers': 'sum',
    'three_made': 'sum',
    'fg_made': 'sum',
    'offensive_reb': 'sum',
    'total_rebounds': 'sum',
    'points': 'sum',
    'fg_attempted': 'sum'
}

aditya_team_features = df_played.groupby(['game_id', 'team_id']).agg(agg_dict_aditya).reset_index()

# Flatten column names
aditya_team_features.columns = ['_'.join(col).strip('_') if col[1] else col[0] 
                                 for col in aditya_team_features.columns.values]

# Rename minutes columns for clarity
aditya_team_features = aditya_team_features.rename(columns={
    'minutes_numeric_sum': 'minutes_sum',
    'minutes_numeric_mean': 'minutes_mean',
    'minutes_numeric_max': 'minutes_max'
})


In [15]:
aditya_team_features['assist_to_turnover_ratio'] = aditya_team_features.apply(
    lambda row: row['assists_sum'] / row['turnovers_sum'] 
    if row['turnovers_sum'] > 0 else row['assists_sum'],
    axis=1
)

In [16]:
aditya_team_features['three_point_rate'] = aditya_team_features.apply(
    lambda row: row['three_made_sum'] / row['fg_made_sum']
    if row['fg_made_sum'] > 0 else 0.0,
    axis=1
)

In [17]:
aditya_team_features['offensive_reb_rate'] = aditya_team_features.apply(
    lambda row: row['offensive_reb_sum'] / row['total_rebounds_sum']
    if row['total_rebounds_sum'] > 0 else 0.0,
    axis=1
)


In [18]:
aditya_team_features['true_shooting_pct'] = aditya_team_features.apply(
    lambda row: (row['points_sum'] / 
                (2 * (row['fg_attempted_sum'] + 0.44 * row['ft_attempted_sum'])))
    if (row['fg_attempted_sum'] + 0.44 * row['ft_attempted_sum']) > 0 else 0.0,
    axis=1
)

In [19]:
aditya_features_final = aditya_team_features[[
    'game_id',
    'team_id',
    'ft_attempted_sum',
    'ft_attempted_mean',
    'minutes_sum',
    'minutes_mean',
    'minutes_max',
    'assist_to_turnover_ratio',
    'three_point_rate',
    'offensive_reb_rate',
    'true_shooting_pct'
]].copy()

log.info("Aditya's features created successfully!")
log.info(f"Total features: {len(aditya_features_final.columns) - 2}")

INFO:__main__:Aditya's features created successfully!
INFO:__main__:Total features: 9


In [20]:
team_names = player_stats_df.groupby('team_id')['team_name'].first().to_dict()
aditya_features_final['team_name'] = aditya_features_final['team_id'].map(team_names)

print("=" * 80)
print("ADITYA'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES")
print("=" * 80)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("\nAll Team-Level Aggregated Features (Aditya's Section):")
aditya_features_final

ADITYA'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES

All Team-Level Aggregated Features (Aditya's Section):


Unnamed: 0,game_id,team_id,ft_attempted_sum,ft_attempted_mean,minutes_sum,minutes_mean,minutes_max,assist_to_turnover_ratio,three_point_rate,offensive_reb_rate,true_shooting_pct,team_name
0,14045,2,17,1.0,240.0,14.117647,25.0,1.705882,0.540541,0.386364,0.507205,Boston Celtics
1,14045,9,26,1.529412,239.0,14.058824,23.0,1.318182,0.324324,0.244898,0.52853,Denver Nuggets




## Dev's 6 Variables
1. **points** - Total team scoring output (sum, mean, std, max)
2. **assists** - Ball movement and playmaking (sum, mean, std, max)
3. **total_rebounds** - Overall rebounding dominance (sum, mean, std, max)
4. **offensive_reb** - Second-chance opportunities (sum, mean, std, max)
5. **defensive_reb** - Defensive rebounding control (sum, mean, std, max)
6. **steals** - Defensive pressure and disruptions (sum, mean, std, max)

### Aggregation Meanings:
- **Sum**: Total team output
- **Mean**: Distribution per player
- **Std**: Variance (star-dependent vs balanced team)
- **Max**: Best individual performance

In [21]:
# my variables - similar to what Sanika did but for my 6
my_vars = {
    'points': ['sum', 'mean', 'std', 'max'],
    'assists': ['sum', 'mean', 'std', 'max'],
    'total_rebounds': ['sum', 'mean', 'std', 'max'],
    'offensive_reb': ['sum', 'mean'],
    'defensive_reb': ['sum', 'mean'],
    'steals': ['sum', 'mean'],
    'team_name': 'first',
    'season': 'first'
}

print(f"Variables: {len(my_vars) - 2}")  # -2 for team_name and season

Variables: 6


In [22]:
# aggregate my variables
my_features = df_clean.groupby(['game_id', 'team_id']).agg(my_vars).reset_index()

# fix column names
my_features.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col 
                       for col in my_features.columns.values]
my_features.columns = [col.replace('_first', '') for col in my_features.columns]

print(f"Shape: {my_features.shape}")
print(f"Columns: {len(my_features.columns)}")

Shape: (2, 22)
Columns: 22


In [23]:
# check if all variables have the 4 aggregations
vars_to_check = ['points', 'assists', 'total_rebounds', 'offensive_reb', 'defensive_reb', 'steals']

for v in vars_to_check:
    cols = [f'{v}_sum', f'{v}_mean', f'{v}_std', f'{v}_max']
    check = all(c in my_features.columns for c in cols)
    print(f"{v}: {'✓' if check else 'X'}")

print(f"\nTotal features: {len([c for c in my_features.columns if '_sum' in c or '_mean' in c or '_std' in c or '_max' in c])}")

points: ✓
assists: ✓
total_rebounds: ✓
offensive_reb: X
defensive_reb: X
steals: X

Total features: 18


In [24]:
# display results
print("="*80)
print("My aggregated features")
print("="*80)

for i, row in my_features.iterrows():
    print(f"\nTeam: {row['team_name']} | Game: {row['game_id']}")
    print(f"-"*60)
    
    # points, assists, rebounds
    print(f"Points: sum={row['points_sum']:.0f}, mean={row['points_mean']:.2f}, std={row['points_std']:.2f}, max={row['points_max']:.0f}")
    print(f"Assists: sum={row['assists_sum']:.0f}, mean={row['assists_mean']:.2f}, std={row['assists_std']:.2f}, max={row['assists_max']:.0f}")
    print(f"Total Reb: sum={row['total_rebounds_sum']:.0f}, mean={row['total_rebounds_mean']:.2f}, std={row['total_rebounds_std']:.2f}, max={row['total_rebounds_max']:.0f}")
    
    print(f"\nRebounding breakdown:")
    print(f"  Off Reb: sum={row['offensive_reb_sum']:.0f}, mean={row['offensive_reb_mean']:.2f}")
    print(f"  Def Reb: sum={row['defensive_reb_sum']:.0f}, mean={row['defensive_reb_mean']:.2f}")
    
    print(f"\nDefense:")
    print(f"  Steals: sum={row['steals_sum']:.0f}, mean={row['steals_mean']:.2f}")

My aggregated features

Team: Boston Celtics | Game: 14045
------------------------------------------------------------
Points: sum=107, mean=5.94, std=5.43, max=21
Assists: sum=29, mean=1.61, std=1.85, max=6
Total Reb: sum=44, mean=2.44, std=2.71, max=11

Rebounding breakdown:
  Off Reb: sum=17, mean=0.94
  Def Reb: sum=27, mean=1.50

Defense:
  Steals: sum=13, mean=0.72

Team: Denver Nuggets | Game: 14045
------------------------------------------------------------
Points: sum=103, mean=5.72, std=4.43, max=14
Assists: sum=29, mean=1.61, std=1.88, max=8
Total Reb: sum=49, mean=2.72, std=2.24, max=8

Rebounding breakdown:
  Off Reb: sum=12, mean=0.67
  Def Reb: sum=37, mean=2.06

Defense:
  Steals: sum=10, mean=0.56


In [25]:
# save my features
my_features

Unnamed: 0,game_id,team_id,points_sum,points_mean,points_std,points_max,assists_sum,assists_mean,assists_std,assists_max,total_rebounds_sum,total_rebounds_mean,total_rebounds_std,total_rebounds_max,offensive_reb_sum,offensive_reb_mean,defensive_reb_sum,defensive_reb_mean,steals_sum,steals_mean,team_name,season
0,14045,2,107,5.944444,5.428381,21,29,1.611111,1.851514,6,44,2.444444,2.705598,11,17,0.944444,27,1.5,13,0.722222,Boston Celtics,2024
1,14045,9,103,5.722222,4.429919,14,29,1.611111,1.883017,8,49,2.722222,2.244092,8,12,0.666667,37,2.055556,10,0.555556,Denver Nuggets,2024


## Chinmayee's Features


Objective: For the following variables, find the aggregations corresponding to each.

1. blocks: sum, mean
2. turnovers: sum, mean
3. personal_fouls: sum, mean
4. plus_minus: sum, mean
5. fg_pct: mean 
6. three_pct: mean

In [26]:
# creating a list for the above variables
columns_to_aggregate = {'blocks': ['sum', 'mean'], 'turnovers': ['sum', 'mean'], 'personal_fouls': ['sum', 'mean'], 'plus_minus': ['sum', 'mean'], 'fg_pct': ['mean'], 'three_pct': ['mean'],
                        'team_name': 'first',
                        'season': 'first'}
# using the function previosuly defined to get the time in proper format


def convert_minutes_to_numeric(minutes_str):
    """
    Convert minutes from 'MM:SS' format to decimal minutes.
    Example: '25:30' -> 25.5
    """
    if pd.isna(minutes_str) or minutes_str == '0' or minutes_str == 0:
        return 0.0

    try:
        if isinstance(minutes_str, str) and ':' in minutes_str:
            parts = minutes_str.split(':')
            mins = float(parts[0])
            secs = float(parts[1]) if len(parts) > 1 else 0
            return mins + (secs / 60.0)
        else:
            return float(minutes_str)
    except:
        return 0.0


# after making a copy get the data only for those who played
df_clean_copy = df_clean.copy()

df_clean_copy['minutes_numeric'] = df_clean_copy['minutes'].apply(
    convert_minutes_to_numeric)
df_played = df_clean_copy[df_clean_copy['minutes_numeric'] > 0].copy()

# checking for exact column names
print(df_played.columns.tolist())

['game_id', 'season', 'player_id', 'player_name', 'team_id', 'team_name', 'position', 'minutes', 'points', 'total_rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'fg_made', 'fg_attempted', 'fg_pct', 'three_made', 'three_attempted', 'three_pct', 'ft_made', 'ft_attempted', 'ft_pct', 'offensive_reb', 'defensive_reb', 'plus_minus', 'minutes_numeric']


In [27]:
# group by the game id and then by team id
group_df = df_played.groupby(['game_id', 'team_id'])

# make another list to store the results of the aggregations
aggregations_2 = []

# for each of the variables to find the aggregations for each team in a game
for col, aggs in columns_to_aggregate.items():
    temp = group_df[col].agg(aggs)
    # flatten column names if multiple stats
    if isinstance(temp, pd.DataFrame):
        temp.columns = [f"{col}_{a}" for a in aggs] # the column name for the aggregation
    else:  # if a single aggregation returns a Series
        temp = temp.rename(f"{col}_{aggs[0]}")
    aggregations_2.append(temp) # add this to the list
# at the end add that to the df 
second_6_df = pd.concat(aggregations_2, axis=1).reset_index()

second_6_df.head()

Unnamed: 0,game_id,team_id,blocks_sum,blocks_mean,turnovers_sum,turnovers_mean,personal_fouls_sum,personal_fouls_mean,plus_minus_sum,plus_minus_mean,fg_pct_mean,three_pct_mean,team_name_f,season_f
0,14045,2,8,0.470588,17,1.0,24,1.411765,21,1.235294,36.670588,19.776471,Boston Celtics,2024
1,14045,9,2,0.117647,22,1.294118,24,1.411765,-18,-1.058824,41.235294,23.035294,Denver Nuggets,2024


In [28]:
print("=" * 80)
print("CHINMAYEE'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES (6-Variable Version)")
print("=" * 80)

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("\nAll Team-Level Aggregated Features (6 Variables):")
second_6_df

CHINMAYEE'S FINDINGS: TEAM-LEVEL AGGREGATED ATTRIBUTES (6-Variable Version)

All Team-Level Aggregated Features (6 Variables):


Unnamed: 0,game_id,team_id,blocks_sum,blocks_mean,turnovers_sum,turnovers_mean,personal_fouls_sum,personal_fouls_mean,plus_minus_sum,plus_minus_mean,fg_pct_mean,three_pct_mean,team_name_f,season_f
0,14045,2,8,0.470588,17,1.0,24,1.411765,21,1.235294,36.670588,19.776471,Boston Celtics,2024
1,14045,9,2,0.117647,22,1.294118,24,1.411765,-18,-1.058824,41.235294,23.035294,Denver Nuggets,2024
