In [2]:
import pandas as pd
from unidecode import unidecode
import os
import collections

In [9]:
# Path names, all resolved relative to the parent of the current working directory
folder = os.path.dirname(os.getcwd())
raw_stats = os.path.join(folder, 'data', 'raw', 'stats')
clean_stats = os.path.join(folder, 'data', 'clean', 'stats')
# The salary cell reads from raw_salary and writes to clean_salary; defining them
# here keeps a fresh "Restart & Run All" from failing with a NameError.
# NOTE(review): directory names assumed to mirror the stats layout — confirm.
raw_salary = os.path.join(folder, 'data', 'raw', 'salary')
clean_salary = os.path.join(folder, 'data', 'clean', 'salary')

Normalize player names with foreign accents. E.g. Luka Dončić -> Luka Doncic

In [10]:
# Map each raw CSV file name to its dataframe
dfs = {}

for file in os.listdir(raw_stats):
    # Skip anything in the folder that is not a CSV export
    if not file.endswith('.csv'):
        continue

    csv_path = os.path.join(raw_stats, file)
    df = pd.read_csv(csv_path)

    # Transliterate accented characters in the 'Player' column to plain ASCII
    df['Player'] = df['Player'].apply(unidecode)

    # Keep the frame under its file name
    dfs[file] = df
        

Save all player names for reference

In [11]:
# Collect every distinct player name that appears in any of the loaded dataframes
players = set()

for key in dfs:
    # update() grows the set in place instead of rebuilding it on every pass
    players.update(dfs[key]['Player'])

# Save in file.
# A set of strings has no stable iteration order between interpreter runs, so the
# names are written sorted to make the reference file reproducible and diff-friendly.
with open('players.txt', 'w', encoding='utf-8') as f:
    for player in sorted(players):
        f.write(player + '\n')

Merge each player's performance statistics into a single row if they played for more than one team

In [12]:
# Create function to remove duplicates and sum the values
def remove_duplicates(df):
    """Merge rows that share the same rank and player name into a single row.

    Counting statistics (numeric columns) are summed across the merged rows.
    Descriptive columns -- 'Age' and every non-numeric column -- keep the value
    of the first row instead, because summing them would add ages together and
    concatenate text values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Rk', 'Player', 'FG', 'FGA', '3P', '3PA', '2P', '2PA',
        'FT' and 'FTA'.

    Returns
    -------
    pandas.DataFrame
        One row per ('Rk', 'Player') pair, with 'FG%', '3P%', '2P%', 'eFG%'
        and 'FT%' recomputed from the merged totals and rounded to 3 decimals.
        Columns keep the order they had in ``df`` with the keys first.
    """
    key_cols = ['Rk', 'Player']

    # Decide per column how merged rows are combined, preserving column order
    aggregations = {}
    for col in df.columns:
        if col in key_cols:
            continue
        is_counting_stat = col != 'Age' and pd.api.types.is_numeric_dtype(df[col])
        aggregations[col] = 'sum' if is_counting_stat else 'first'

    # If more than one row has the same rank and player name, merge into one row
    df = df.groupby(key_cols).agg(aggregations).reset_index()

    # Re-calculate percentage based stats: FG%, 3P%, 2P%, eFG%, FT%
    # (summed percentages are meaningless, so they are overwritten here).
    # A player with zero attempts yields NaN rather than raising.
    # Round to 3 decimal places
    df['FG%'] = (df['FG'] / df['FGA']).round(3)
    df['3P%'] = (df['3P'] / df['3PA']).round(3)
    df['2P%'] = (df['2P'] / df['2PA']).round(3)
    df['eFG%'] = ((df['FG'] + 0.5 * df['3P']) / df['FGA']).round(3)
    df['FT%'] = (df['FT'] / df['FTA']).round(3)

    return df

In [13]:
# Make sure the output directory exists; to_csv does not create missing folders
os.makedirs(clean_stats, exist_ok=True)

# Iterate over a snapshot so that reassigning dfs[file] cannot disturb the iteration
for file, df in list(dfs.items()):
    # Remove duplicates of all dataframes, then drop column 'Player-additional'
    # if it exists (errors='ignore' makes the drop a no-op when it is absent)
    dfs[file] = remove_duplicates(df).drop(columns='Player-additional', errors='ignore')

    # Save to clean directory
    dfs[file].to_csv(os.path.join(clean_stats, file), index=False)

Check if every player's salary information is available

In [8]:
# Load salary data
# raw_salary / clean_salary are expected to be defined together with the other path names.
salary = pd.read_csv(os.path.join(raw_salary, 'nba_salaries_2019-2020.csv'))

# Remove accents from the 'Player' column so names match the cleaned stats
salary['Player'] = salary['Player'].apply(unidecode)

# Add salary column to the stats dataframes.
# A left merge keeps every stats row even when no salary is found.
# NOTE(review): if the salary file lists a player more than once, the merge will
# duplicate that player's stats row — confirm names are unique in the salary data.
df = dfs['2019-2020 NBA Player Stats - Playoffs.csv']
df = df.merge(salary, on='Player', how='left')

# Check for missing salaries (rows where the '2019-2020' salary column is empty)
# NOTE(review): several unmatched names carry suffixes or initials ("Jr.", "III",
# "D.J."); presumably the salary file spells them differently — verify.
missing_salaries = df[df['2019-2020'].isnull()]
print(missing_salaries)

# Save to clean directory, creating it first because to_csv does not create
# missing folders
os.makedirs(clean_salary, exist_ok=True)
df.to_csv(os.path.join(clean_salary, 'nba_salaries_2019-2020.csv'), index=False)


      Rk                   Player Pos  Age   Tm   G  GS   MP  FG  FGA  ...  \
4      5          Justin Anderson  SG   26  BRK   3   0   28   5   12  ...   
8      9            D.J. Augustin  PG   32  ORL   5   0  128  18   46  ...   
9     10               J.J. Barea  PG   35  DAL   1   0    5   0    1  ...   
57    58          James Ennis III  SG   29  ORL   5   5  119  12   35  ...   
63    64             Enes Freedom   C   27  BOS  11   0  102  22   42  ...   
78    79         Tim Hardaway Jr.  SG   27  DAL   6   6  204  40   95  ...   
95    96         Danuel House Jr.  SF   26  HOU   9   4  279  37   85  ...   
101  102               Wes Iwundu  SF   25  ORL   5   0   76   6   20  ...   
106  107             B.J. Johnson  SF   24  ORL   1   0    4   1    2  ...   
110  111        Derrick Jones Jr.  SF   22  MIA  15   0   97   8   17  ...   
126  127  Timothe Luwawu-Cabarrot  SF   24  BRK   4   3  131  20   59  ...   
130  131          Frank Mason III  PG   25  MIL   2   0    2   0