In [34]:
import os
import sys
# Add the project root directory to sys.path so that `src.analysis` (imported
# below) can be found. The root is computed as the parent of the current
# working directory — presumably the notebook is launched from a subfolder of
# the project; confirm before running from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)

import warnings
# Suppress FutureWarning messages for the rest of the kernel session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# NOTE(review): the star import hides which names come from src.analysis.
# Later cells call load_year, merge_dfs and write_season, which are presumably
# provided here — confirm, and prefer importing them explicitly.
from src.analysis import *

In [71]:
import pandas as pd
import os

def process_csv_files(input_dir, output_dir, output_mapping_file):
    """
    Stack the per-season CSV files found in ``input_dir`` into one numeric dataframe.

    Every ``*.csv`` file in ``input_dir`` is visited in sorted filename order.
    For each file, rows with ``sentiment_avg == 0`` are removed, an integer
    ``mvp`` column is added (1 where ``Rank`` equals ``1`` or ``'1'``, else 0),
    and a fixed list of identifier/bookkeeping columns is dropped. The per-file
    frames are concatenated and given a fresh ``index`` column (0..n-1).

    Two files are written:
    - ``<output_dir>/final_stacked_data.csv``: the stacked frame with the
      player-identifying columns removed and every remaining column coerced to
      numeric (values that cannot be parsed become NaN).
    - ``output_mapping_file``: the ``index``, ``Player`` and ``SEASON_ID``
      columns, so rows of the stacked frame can be traced back to players.

    The two generated files are never read back as inputs, so the function can
    be re-run safely even when ``output_dir`` is the same folder as ``input_dir``.

    Parameters:
    - input_dir (str): Directory containing the input CSV files.
    - output_dir (str): Directory to save the processed dataframe.
    - output_mapping_file (str): Path to save the index-to-player mapping.

    Returns:
    - None

    Raises:
    - ValueError: if ``input_dir`` contains no input CSV files.
    - KeyError: if an input file lacks a column that is filtered on or dropped.
    """
    os.makedirs(output_dir, exist_ok=True)
    mapping_parent = os.path.dirname(output_mapping_file)
    if mapping_parent:
        os.makedirs(mapping_parent, exist_ok=True)

    final_output_file = os.path.join(output_dir, "final_stacked_data.csv")

    # The outputs may live inside input_dir; skip them so a second run does not
    # try to process its own results (they lack 'Rank' / 'sentiment_avg').
    generated_files = {
        os.path.abspath(final_output_file),
        os.path.abspath(output_mapping_file),
    }

    # Columns removed from every per-season frame before stacking.
    per_file_drop = ["Rank", "Age", "Pts_Won", "Pts Max", "Share",
                     'Unnamed: 0', 'LEAGUE_ID', "TEAM_ID", "Rk", "Rk_adv",
                     'TEAM_ABBREVIATION_team', 'MVP_Candidate', 'Arena',
                     'Attend.', 'Attend./G', "GB", 'Rk_trad', 'First',
                     'Team', 'Year', 'TEAM_ABBREVIATION_player']
    # Columns removed from the stacked frame once the mapping has been extracted.
    stacked_drop = ["Player", "Unnamed: 0.1", "PLAYER_ID", 'SEASON_ID', 'PLAYER_FULLNAME']

    dataframes = []

    # Sorted so the row order (and therefore the index-to-player mapping) is
    # reproducible; os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(input_dir, file)
        if os.path.abspath(file_path) in generated_files:
            continue

        df = pd.read_csv(file_path)

        # Remove rows where sentiment_avg == 0. The copy makes the filtered
        # frame independent so the column assignment below is unambiguous.
        df = df[df['sentiment_avg'] != 0].copy()

        # "mvp" is 1 for rows with Rank == 1 and 0 otherwise. Rank may have
        # been parsed as a string or as a number, so both forms are matched.
        is_first = (df['Rank'] == '1') | (df['Rank'] == 1)
        df['mvp'] = is_first.astype(int)

        df = df.drop(columns=per_file_drop)
        dataframes.append(df)

    if not dataframes:
        raise ValueError(f"No input CSV files found in {input_dir}")

    # Concatenate all dataframes; reset_index() materialises the 0..n-1 row
    # number as an "index" column shared by both outputs.
    final_df = pd.concat(dataframes, ignore_index=True).reset_index()

    # Index-to-player lookup, taken before the identifying columns are dropped.
    mapping_df = final_df[["index", "Player", 'SEASON_ID']].copy()

    final_df = final_df.drop(columns=stacked_drop)
    final_df = final_df.apply(pd.to_numeric, errors='coerce')

    # Save the final dataframe and mapping dataframe
    final_df.to_csv(final_output_file, index=False)
    mapping_df.to_csv(output_mapping_file, index=False)

    print(f"Processed data saved to {final_output_file}")
    print(f"Index-to-Player mapping saved to {output_mapping_file}")


# Usage Example
# input_directory holds the per-season CSVs to stack; output_directory receives
# final_stacked_data.csv and output_mapping is the index-to-player lookup file.
# NOTE(review): output_directory is the same folder as input_directory, so both
# generated files are written alongside the per-season input files.
input_directory = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged"
output_directory = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged"
output_mapping = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/player_index_mapping.csv"

process_csv_files(input_directory, output_directory, output_mapping)

Processed data saved to /Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv
Index-to-Player mapping saved to /Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/player_index_mapping.csv


In [64]:
# Reload the stacked dataset from the path process_csv_files reported writing to,
# for manual inspection.
_df = pd.read_csv('/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv')


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,PLAYER_ID,PLAYER_FULLNAME,SEASON_ID,LEAGUE_ID,TEAM_ID,FGM,FGA_player,FG_PCT,...,sentiment_10,sentiment_11,sentiment_12,sentiment_13,sentiment_14,sentiment_15,sentiment_avg,WS,TRB,mvp
0,77,301.0,203081.0,DAMIAN LILLARD,2018.0,0.0,1610613000.0,681.0,1533.0,0.444,...,8.0,3.0,6.0,4.0,7.0,5.0,5.733333,12.1,4.6,0
1,162,17.0,203507.0,GIANNIS ANTETOKOUNMPO,2018.0,0.0,1610613000.0,721.0,1247.0,0.578,...,8.0,8.0,5.0,6.0,9.0,6.0,7.2,14.4,12.5,0
2,201,205.0,201935.0,JAMES HARDEN,2018.0,0.0,1610613000.0,843.0,1909.0,0.442,...,9.0,3.0,6.0,3.0,9.0,6.0,6.8,15.2,6.6,0
3,237,153.0,203954.0,JOEL EMBIID,2018.0,0.0,1610613000.0,580.0,1199.0,0.484,...,8.0,6.0,6.0,5.0,7.0,5.0,5.8,8.7,13.6,0
4,271,298.0,202695.0,KAWHI LEONARD,2018.0,0.0,1610613000.0,560.0,1129.0,0.496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.5,7.3,0


'1'

In [29]:
# NOTE(review): `df` is not assigned by any cell above this one; it is assigned
# in the season-merge loop cell further down, so this cell presumably relies on
# out-of-order execution — confirm it survives Restart & Run All.
list(df.columns)

['Unnamed: 0',
 'PLAYER_ID',
 'PLAYER_FULLNAME',
 'SEASON_ID',
 'LEAGUE_ID',
 'TEAM_ID',
 'FGM',
 'FGA_player',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA_player',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST_player',
 'STL_player',
 'BLK_player',
 'TOV_player',
 'PF_player',
 'PTS_player',
 'MIN_PG',
 'FGM_PG',
 'FGA_PG',
 'FG_PCT_PG',
 'FG3M_PG',
 'FG3A_PG',
 'FG3_PCT_PG',
 'FTM_PG',
 'FTA_PG',
 'FT_PCT_PG',
 'OREB_PG',
 'DREB_PG',
 'REB_PG',
 'AST_PG',
 'STL_PG',
 'BLK_PG',
 'TOV_PG',
 'PF_PG',
 'PTS_PG',
 'TS%_player',
 'eFG%_player',
 'Rk',
 'GP',
 'MIN',
 '3PAr_player',
 'ASTPct',
 'BLKPct',
 'BPM',
 'DBPM',
 'DRBPct',
 'DWS',
 'FTr_player',
 'GS',
 'OBPM',
 'ORBPct',
 'OWS',
 'PER',
 'STLPct',
 'TOVPct',
 'TRBPct',
 'TSPct',
 'USGPct',
 'VORP',
 'WS/48_x',
 'Year',
 'PLAYER_AGE',
 'TEAM_ABBREVIATION_player',
 'WS_x',
 'Rk_trad',
 'Team',
 'G_x',
 'MP_x',
 'FG',
 'FGA_team',
 'FG%_x',
 '3P',
 '3PA',
 '3P%_x',
 '2P',
 '2PA',
 '2P%',
 'FT',
 'FTA_team',
 'FT%_x',
 'ORB',


In [72]:
# Build one merged CSV per season (1980 through 2023) from the player, team and
# MVP-voting data. These files land in the folder that process_csv_files reads,
# so this cell has to run before that one on a fresh checkout.
mvp_path = '/Users/cb/src/nba_mvp_ml/data/processed/by_season/mvp/sentiment'
fully_merged_dir = '/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(fully_merged_dir, exist_ok=True)

for year in range(1980, 2024):
    player_df, team_df, mvp_df = load_year(year, mvp_path=mvp_path)
    df = merge_dfs(player_df, team_df, mvp_df, include_non_mvp=False)

    # The DataFrame index is written too (to_csv default): process_csv_files
    # drops the resulting 'Unnamed: 0' / 'Unnamed: 0.1' columns by name and
    # would raise a KeyError if they were absent.
    df.to_csv(f'{fully_merged_dir}/{write_season(year)}.csv')

- make a new dir for merged csv's
- save a merged csv for every season
- set input_directory = this new dir
- make an output_directory for the cleaned huge df (with columns trimmed, etc.)
- output_directory also the location for the mapping csv
- assess the remaining columns, trim out more if necessary
- check the resulting csv for mvp variable (will be the target variable for ML)
- normalize, train/test split, start experimenting