In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Load the raw tables, one season at a time. Each season has an "advanced"
# table and a "per game" table; the file names are relative to the notebook's
# working directory.
nba_data_2021_2022_advanced, nba_data_2021_2022_per_game = (
    pd.read_csv(csv_name)
    for csv_name in ('2021_22_NBA_Player_Data.csv', '2021_2022_Per_Game_Data.csv')
)
nba_data_2020_2021_advanced, nba_data_2020_2021_per_game = (
    pd.read_csv(csv_name)
    for csv_name in ('2020_2021_Player_Stats.csv', '2020_2021_NBA_Per_Game_Stats.csv')
)

In [2]:
# Columns to discard before merging. errors='ignore' means a name that is absent
# from a given table is skipped silently instead of raising.
# 'G' is removed only from the per-game tables, so the advanced table is the
# sole source of the 'G' column in the merged frames.
advanced_unused_cols = ['MP', 'Rk', 'Column1', '2', 'GS', 'Pos']
per_game_unused_cols = ['MP', 'Rk', 'Column1', '2', 'G', 'GS', 'Pos']

for advanced_frame in (nba_data_2021_2022_advanced, nba_data_2020_2021_advanced):
    advanced_frame.drop(columns=advanced_unused_cols, inplace=True, errors='ignore')

for per_game_frame in (nba_data_2021_2022_per_game, nba_data_2020_2021_per_game):
    per_game_frame.drop(columns=per_game_unused_cols, inplace=True, errors='ignore')

# Show the remaining advanced-table columns for both seasons.
print(nba_data_2020_2021_advanced.columns)
print(nba_data_2021_2022_advanced.columns)


# Join each season's advanced and per-game tables on player name + team
# (default inner join: rows without a match in both tables are left out),
# then replace every missing value in the merged frame with 0.
merge_keys = ['Player', 'Tm']
combined_2021_2022 = nba_data_2021_2022_advanced.merge(
    nba_data_2021_2022_per_game, on=merge_keys
)
combined_2020_2021 = nba_data_2020_2021_advanced.merge(
    nba_data_2020_2021_per_game, on=merge_keys
)

combined_2020_2021 = combined_2020_2021.fillna(0)
combined_2021_2022 = combined_2021_2022.fillna(0)


Index(['Player', 'Tm', 'G', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')
Index(['Player', 'Tm', 'G', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')


In [3]:
def collapse_traded_players(season_stats: pd.DataFrame):
    """Relabel 'TOT' rows with a concrete team and keep one row per player.

    A player who has a row with ``Tm == 'TOT'`` is treated as having several
    rows in the table: the 'TOT' row plus one row per team (presumably the
    'TOT' row is the season total across teams -- confirm against the data
    source). For each such player the 'TOT' label is replaced by the team of
    the per-team row with the largest 'G' value; ties go to the row that
    appears first. If a player has a 'TOT' row but no per-team rows, the
    label is left as 'TOT'.

    Parameters
    ----------
    season_stats : pd.DataFrame
        Must contain the columns 'Player', 'Tm' and 'G'. It is not modified.

    Returns
    -------
    (relabelled, cleaned) : tuple of pd.DataFrame
        ``relabelled`` is a copy of ``season_stats`` with every 'TOT' label
        replaced as described above (all rows retained).
        ``cleaned`` holds one row per distinct 'Player' value: the relabelled
        'TOT' row for players that have one, otherwise the player's first row.
        Original row order is preserved.

    Notes
    -----
    Players are identified by the 'Player' string only, so two different
    players sharing a name would be collapsed into a single row.
    """
    is_total = season_stats['Tm'] == 'TOT'
    traded_names = season_stats.loc[is_total, 'Player'].unique()
    has_total = season_stats['Player'].isin(traded_names)

    # Per-team rows belonging to players that also have a 'TOT' row.
    team_rows = season_stats[has_total & ~is_total]

    # idxmax returns the first index label holding the maximum, which gives
    # the "first row wins" tie-breaking described in the docstring.
    primary_team = {
        player: rows.loc[rows['G'].idxmax(), 'Tm']
        for player, rows in team_rows.groupby('Player', sort=False)
    }

    relabelled = season_stats.copy()
    relabelled.loc[is_total, 'Tm'] = (
        relabelled.loc[is_total, 'Player']
        .map(primary_team)
        .fillna('TOT')  # no per-team rows found: keep the original label
    )

    # Drop the per-team rows of traded players explicitly rather than relying
    # on the 'TOT' row happening to come first in the table. drop_duplicates
    # then only has to handle any remaining repeated names.
    cleaned = relabelled[is_total | ~has_total].drop_duplicates(
        subset=['Player'], keep='first'
    )
    return relabelled, cleaned


# --- 2020-21 season ---
# Snapshot of the 'TOT' rows before relabelling.
tot_players_2020_2021 = combined_2020_2021[combined_2020_2021['Tm'] == 'TOT']
# combined_2020_2021 is rebound to the relabelled frame so that later cells see
# the 'TOT' labels already replaced.
combined_2020_2021, combined_2020_2021_cleaned = collapse_traded_players(combined_2020_2021)

# --- 2021-22 season ---
tot_players_2021_2022 = combined_2021_2022[combined_2021_2022['Tm'] == 'TOT']
combined_2021_2022, combined_2021_2022_cleaned = collapse_traded_players(combined_2021_2022)

In [4]:
# Write the one-row-per-player frame of each season to CSV in the working
# directory, without the DataFrame index column.
for cleaned_frame, csv_name in (
    (combined_2020_2021_cleaned, 'combined_2020_2021.csv'),
    (combined_2021_2022_cleaned, 'combined_2021_2022.csv'),
):
    cleaned_frame.to_csv(csv_name, index=False)