In [1]:
from BRScraper import nba
import numpy
import pandas as pd
import random
import os

In [37]:
# Function to preprocess a single season
def _collapse_multi_team_rows(df):
    """Keep one row per player, preferring the combined ('2TM', '3TM', ...) row.

    Adds a 'Multiple Teams' column right after 'Team' listing the individual
    teams (comma-separated, in file order) of every player who has a combined
    row; all other players get an empty string.
    """
    # Combined rows are recognised by 'TM' appearing in the Team value.
    is_combined = df['Team'].str.contains('TM', na=False)

    if not is_combined.any():
        print("No multi-team players found in the dataset.")
    else:
        print(f"Found {int(is_combined.sum())} multi-team player rows.")

    multi_team_players = df.loc[is_combined, 'Player'].unique()
    is_multi_team_player = df['Player'].isin(multi_team_players)

    # Per-team rows of traded players, grouped once instead of re-scanning the
    # whole frame for every player.
    per_team_rows = df.loc[is_multi_team_player & ~is_combined]
    teams_by_player = {
        player: ', '.join(teams)
        for player, teams in per_team_rows.groupby('Player', sort=False)['Team']
    }

    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of `df` (avoids SettingWithCopyWarning).
    kept = df[is_combined | ~is_multi_team_player].copy()
    kept = kept.drop(columns=['Multiple Teams'], errors='ignore')
    kept.insert(
        kept.columns.get_loc('Team') + 1,
        'Multiple Teams',
        kept['Player'].map(teams_by_player).fillna(''),
    )
    return kept


def _add_shooting_and_efficiency(df):
    """Add the 'TS%' and 'EFF' columns when their input columns are present."""
    if {'PTS', 'FGA', 'FTA'}.issubset(df.columns):
        attempts = 2 * (df['FGA'] + 0.44 * df['FTA'])
        # A zero denominator yields NaN instead of inf.
        df['TS%'] = (df['PTS'] / attempts.where(attempts != 0)).round(2)

    eff_inputs = {'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'G', 'FGA', 'FG', 'FTA', 'FT'}
    if eff_inputs.issubset(df.columns):
        missed_fg = df['FGA'] - df['FG']
        missed_ft = df['FTA'] - df['FT']
        # NOTE(review): dividing by G assumes the stat columns are season
        # totals, not per-game averages — confirm against the scraper output.
        df['EFF'] = (
            (
                df['PTS'] + df['TRB'] + df['AST'] + df['STL'] + df['BLK']
                - missed_fg - missed_ft - df['TOV']
            ) / df['G']
        ).round(2)
    return df


def _best_award_rank(awards_string, rank_by_token):
    """Return the best (lowest) rank among the recognised award tokens, or 0."""
    if pd.isna(awards_string):
        return 0
    tokens = (token.strip() for token in str(awards_string).split(','))
    ranks = [rank_by_token[token] for token in tokens if token in rank_by_token]
    return min(ranks) if ranks else 0


def _add_award_features(df):
    """Encode the comma-separated 'Awards' column into numeric award columns.

    'MVP' and 'DPOY' hold the voting rank 1-10, '6MOY' and 'ROY' the rank 1-5,
    'All-NBA' the team number 1-3 and 'AS' a 0/1 flag. 0 means "no such award".
    """
    award_columns = {
        'MVP': {f'MVP-{i}': i for i in range(1, 11)},
        'DPOY': {f'DPOY-{i}': i for i in range(1, 11)},
        '6MOY': {f'6MOY-{i}': i for i in range(1, 6)},
        'ROY': {f'ROY-{i}': i for i in range(1, 6)},
        'AS': {'AS': 1},
        'All-NBA': {'NBA1': 1, 'NBA2': 2, 'NBA3': 3},
    }

    # A file without an Awards column is treated as "nobody won anything".
    if 'Awards' in df.columns:
        awards = list(df['Awards'])
    else:
        awards = [None] * len(df)

    for column, rank_by_token in award_columns.items():
        df[column] = pd.Series(
            [_best_award_rank(cell, rank_by_token) for cell in awards],
            index=df.index,
            dtype=int,
        )
    return df


def _prior_mvp_wins(mvp_winners_file, year):
    """Return {player: number of MVP wins in seasons starting before `year`}."""
    mvp_df = pd.read_csv(mvp_winners_file)
    # 'Season' looks like '1980-81'; the part before the dash is the start year.
    season_start = mvp_df['Season'].str.split('-').str[0].astype(int)
    return mvp_df.loc[season_start < year, 'Player'].value_counts().to_dict()


def _prior_mvp_nominations(year, history_folder):
    """Return {player: number of earlier seasons with an MVP-1..MVP-10 award}.

    Scans the raw season files from `year - 1` back to the 1980-81 season and
    counts each player at most once per season.
    """
    nominations = {}
    for past_year in range(year - 1, 1979, -1):
        past_season = f"{past_year}-{str(past_year + 1)[-2:]}"
        season_path = os.path.join(history_folder, f"nba_player_stats_{past_season}.csv")
        if not os.path.exists(season_path):
            continue

        season_df = pd.read_csv(season_path)
        if 'Awards' not in season_df.columns:
            continue

        # 'mvp-[1-9]' also matches 'mvp-10', so it covers ranks 1 through 10.
        nominated = season_df['Awards'].astype(str).str.contains(
            r'mvp-[1-9]', case=False, regex=True
        )
        # unique(): a traded player can appear on several rows of one season.
        for player in season_df.loc[nominated, 'Player'].dropna().unique():
            nominations[player] = nominations.get(player, 0) + 1
    return nominations


def preprocess_season(file_name, year,
                      history_folder="untouched_seasonal_data",
                      mvp_winners_file="nba_player_stats_mvp_data.csv"):
    """Load one raw season CSV and return it with engineered features.

    Steps, in order:
      1. Collapse multi-team players to their combined row and add
         'Multiple Teams'.
      2. Drop 'Age', 'Pos', 'GS', '3PA', '2PA', 'PF' (when present).
      3. Add 'TS%' and 'EFF'.
      4. Encode 'Awards' into 'MVP', 'DPOY', '6MOY', 'ROY', 'AS', 'All-NBA'.
      5. Add 'MVP_count' (MVP wins before this season) and 'MVP_nominations'
         (earlier seasons with an MVP-1..MVP-10 award).

    Parameters
    ----------
    file_name : str
        Path of the raw season CSV. Must contain 'Player' and 'Team' columns.
    year : int
        Start year of the season (1980 for '1980-81').
    history_folder : str, optional
        Folder holding the raw 'nba_player_stats_<season>.csv' files of earlier
        seasons, used for 'MVP_nominations'.
    mvp_winners_file : str, optional
        CSV with 'Season' and 'Player' columns listing MVP winners.

    Returns
    -------
    pandas.DataFrame
        The processed season; the original 'Awards' column is kept.
    """
    df = _collapse_multi_team_rows(pd.read_csv(file_name))

    drop_columns = ['Age', 'Pos', 'GS', '3PA', '2PA', 'PF']
    df_cleaned = df.drop(columns=drop_columns, errors='ignore')

    df_cleaned = _add_shooting_and_efficiency(df_cleaned)
    df_cleaned = _add_award_features(df_cleaned)

    mvp_counts = _prior_mvp_wins(mvp_winners_file, year)
    df_cleaned['MVP_count'] = df_cleaned['Player'].map(mvp_counts).fillna(0).astype(int)

    cumulative_nominations = _prior_mvp_nominations(year, history_folder)
    df_cleaned['MVP_nominations'] = (
        df_cleaned['Player'].map(cumulative_nominations).fillna(0).astype(int)
    )

    return df_cleaned




# Where the raw per-season CSVs live and where the processed ones are written
input_folder = "untouched_seasonal_data"
output_folder = "processed_data"

# Make sure the destination folder is there before writing any season
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# One pass per season start year, 1980 through 2024
for year in range(1980, 2025):
    season = f"{year}-{(year + 1) % 100:02d}"  # e.g. '1980-81', '1999-00'
    file_name = f"{input_folder}/nba_player_stats_{season}.csv"
    output_file = f"{output_folder}/nba_player_stats_{season}_processed.csv"

    # Seasons without a raw file are reported and skipped
    if not os.path.exists(file_name):
        print(f"File {file_name} not found. Skipping.")
        continue

    # A failure in one season is reported without aborting the remaining seasons
    try:
        processed_df = preprocess_season(file_name, year)
        processed_df.to_csv(output_file, index=False)
    except Exception as e:
        print(f"Error processing {season}: {e}")
    else:
        print(f"Processed {season} successfully!")


Found 29 multi-team player rows.
Processed 1980-81 successfully!
Found 28 multi-team player rows.
Processed 1981-82 successfully!
Found 36 multi-team player rows.
Processed 1982-83 successfully!
Found 15 multi-team player rows.
Processed 1983-84 successfully!
Found 20 multi-team player rows.
Processed 1984-85 successfully!
Found 25 multi-team player rows.
Processed 1985-86 successfully!
Found 21 multi-team player rows.
Processed 1986-87 successfully!
Found 46 multi-team player rows.
Processed 1987-88 successfully!
Found 42 multi-team player rows.
Processed 1988-89 successfully!
Found 38 multi-team player rows.
Processed 1989-90 successfully!
Found 26 multi-team player rows.
Processed 1990-91 successfully!
Found 33 multi-team player rows.
Processed 1991-92 successfully!
Found 28 multi-team player rows.
Processed 1992-93 successfully!
Found 37 multi-team player rows.
Processed 1993-94 successfully!
Found 22 multi-team player rows.
Processed 1994-95 successfully!
Found 56 multi-team playe

We preprocessed the data to include two new features to help predict the MVP ranking from 1 to 10; all other players will have NaN. From the new preprocessed datasets for each year from 1980 to 2023, we can create models to predict the MVP rank for each player and compare how well each model's predictions match the actual winners for each season.

In [38]:
# Combine the processed season files for 1980 through 2015 into one CSV
# Folder containing the per-season processed CSV files
input_folder = "processed_data"

# One dataframe per season that was read successfully
dataframes = []

# Loop through each season start year from 1980 to 2015
for year in range(1980, 2016):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"

    # Report missing seasons so gaps in the combined dataset are visible
    if not os.path.exists(file_name):
        print(f"File {file_name} not found. Skipping.")
        continue

    try:
        # Read the dataset for the year
        df = pd.read_csv(file_name)
    except Exception as e:
        print(f"Failed to read data for {year} season. Error: {e}")
        continue

    dataframes.append(df)
    print(f"Successfully read data for the {year} season")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the folder that was searched.
if not dataframes:
    raise FileNotFoundError(
        f"No processed season files for 1980-2015 could be read from '{input_folder}'."
    )

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file in the working directory
combined_df.to_csv("nba_combined_1980_2015.csv", index=False)

print("All datasets have been concatenated and saved to 'nba_combined_1980_2015.csv'.")

Successfully read data for the 1980 season
Successfully read data for the 1981 season
Successfully read data for the 1982 season
Successfully read data for the 1983 season
Successfully read data for the 1984 season
Successfully read data for the 1985 season
Successfully read data for the 1986 season
Successfully read data for the 1987 season
Successfully read data for the 1988 season
Successfully read data for the 1989 season
Successfully read data for the 1990 season
Successfully read data for the 1991 season
Successfully read data for the 1992 season
Successfully read data for the 1993 season
Successfully read data for the 1994 season
Successfully read data for the 1995 season
Successfully read data for the 1996 season
Successfully read data for the 1997 season
Successfully read data for the 1998 season
Successfully read data for the 1999 season
Successfully read data for the 2000 season
Successfully read data for the 2001 season
Successfully read data for the 2002 season
Successfull

In [39]:
# Combine the processed season files for 2016 through 2024 into one CSV
# Folder containing the per-season processed CSV files
input_folder = "processed_data"

# One dataframe per season that was read successfully
dataframes = []

# Loop through each season start year from 2016 to 2024
for year in range(2016, 2025):
    season = f"{year}-{str(year+1)[-2:]}"
    file_name = f"{input_folder}/nba_player_stats_{season}_processed.csv"

    # Report missing seasons so gaps in the combined dataset are visible
    if not os.path.exists(file_name):
        print(f"File {file_name} not found. Skipping.")
        continue

    try:
        # Read the dataset for the year
        df = pd.read_csv(file_name)
    except Exception as e:
        print(f"Failed to read data for {year} season. Error: {e}")
        continue

    dataframes.append(df)
    print(f"Successfully read data for the {year} season")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the folder that was searched.
if not dataframes:
    raise FileNotFoundError(
        f"No processed season files for 2016-2024 could be read from '{input_folder}'."
    )

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file in the working directory
combined_df.to_csv("nba_combined_2016_2024.csv", index=False)

print("All datasets have been concatenated and saved to 'nba_combined_2016_2024.csv'.")

Successfully read data for the 2016 season
Successfully read data for the 2017 season
Successfully read data for the 2018 season
Successfully read data for the 2019 season
Successfully read data for the 2020 season
Successfully read data for the 2021 season
Successfully read data for the 2022 season
Successfully read data for the 2023 season
Successfully read data for the 2024 season
All datasets have been concatenated and saved to 'nba_combined_2016_2024.csv'.


In [None]:
# Define the folder for standings data and create it if it does not exist yet
standings_folder = "nba_season_standings"
os.makedirs(standings_folder, exist_ok=True)

# Save one '<season>_standings.csv' file per season start year, 1980 through 2024.
# NOTE(review): `leaguestandings` is not imported by the notebook's import cell
# (which imports `nba` from BRScraper, numpy, pandas, random, os); unless it is
# defined in a cell not shown here, this loop raises NameError on a fresh kernel.
# NOTE(review): `leaguestandings.LeagueStandings` is only referenced, never called,
# and neither `year` nor `season` is passed to it, so every iteration would write
# the same object — confirm the intended call and its season argument.
# NOTE(review): the recorded output under this cell ends with
# "ValueError: 2010 is not a valid season.", presumably from an earlier version
# of this cell — TODO confirm and re-run.
for year in range(1980, 2025):
    season = f"{year}-{str(year + 1)[-2:]}"  # Format season as '1980-81'
    standings = leaguestandings.LeagueStandings
    standings.to_csv(f"{standings_folder}/{season}_standings.csv", index=False)
    print(f"Saved standings for {season}")

Saved standings for 1980-81
Saved standings for 1981-82
Saved standings for 1982-83
Saved standings for 1983-84
Saved standings for 1984-85
Saved standings for 1985-86
Saved standings for 1986-87
Saved standings for 1987-88
Saved standings for 1988-89
Saved standings for 1989-90
Saved standings for 1990-91
Saved standings for 1991-92
Saved standings for 1992-93
Saved standings for 1993-94
Saved standings for 1994-95
Saved standings for 1995-96
Saved standings for 1996-97
Saved standings for 1997-98
Saved standings for 1998-99
Saved standings for 1999-00
Saved standings for 2000-01
Saved standings for 2001-02
Saved standings for 2002-03
Saved standings for 2003-04
Saved standings for 2004-05
Saved standings for 2005-06
Saved standings for 2006-07
Saved standings for 2007-08
Saved standings for 2008-09
Saved standings for 2009-10


  name2 = re.sub('[^a-zA-Z0-9 \n\.]', '', name)


ValueError: 2010 is not a valid season.