In [2]:
import pandas as pd
from unidecode import unidecode
import os
import collections

In [9]:
# Data locations, resolved relative to the parent of the current working directory
folder = os.path.dirname(os.getcwd())
raw_stats, clean_stats = (
    os.path.join(folder, 'data', stage, 'stats') for stage in ('raw', 'clean')
)

Normalize player names by replacing accented characters with their plain ASCII equivalents, e.g. Luka Dončić -> Luka Doncic

In [10]:
# Map each raw CSV file name to the dataframe loaded from it
dfs = {}

csv_names = (name for name in os.listdir(raw_stats) if name.endswith('.csv'))
for csv_name in csv_names:
    # Load the file from the raw stats directory
    frame = pd.read_csv(os.path.join(raw_stats, csv_name))
    # Strip accents from every entry of the 'Player' column
    frame['Player'] = frame['Player'].apply(unidecode)

    # Register the dataframe under its file name
    dfs[csv_name] = frame
        

Save all player names for reference

In [11]:
# Collect every distinct player name found across the loaded dataframes
players = set()

for frame in dfs.values():
    players.update(frame['Player'])

# Save in file, one name per line.
# The names are written in sorted order: iterating the set directly gives an
# order that can change from one run to the next, which makes the reference
# file hard to read and to diff.
with open('players.txt', 'w', encoding='utf-8') as f:
    f.writelines(player + '\n' for player in sorted(players))

Merge each player's performance statistics into a single row when they played for more than one team

In [12]:
# Collapse repeated (Rk, Player) rows into a single row per player
def remove_duplicates(df, non_additive=('Age',)):
    """Merge rows sharing the same rank and player name into one row.

    Numeric columns are summed across the merged rows. Columns that cannot be
    meaningfully added keep the value of the first row of each group instead:

    * every non-numeric column (summing text would concatenate the strings),
    * every numeric column named in ``non_additive``.

    The percentage columns FG%, 3P%, 2P%, eFG% and FT% are then recomputed
    from the merged makes/attempts and rounded to 3 decimal places. A
    percentage whose attempts total 0 comes out as NaN.

    NOTE(review): if the raw files already contain a combined-total row for a
    player in addition to the per-team rows, summing will double count --
    verify against the raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Rk', 'Player', 'FG', 'FGA', '3P', '3PA',
        '2P', '2PA', 'FT' and 'FTA'. It is not modified.
    non_additive : str or iterable of str, optional
        Numeric columns describing the player rather than counting something;
        names not present in ``df`` are ignored. The default ``('Age',)``
        assumes an 'Age' column, when present, is constant per player --
        TODO confirm against the raw schema.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per (Rk, Player) pair, ordered by those keys,
        with 'Rk' and 'Player' first and the remaining columns in their
        original order.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    keys = ['Rk', 'Player']

    # Accept a single column name as well as a collection of names
    if isinstance(non_additive, str):
        non_additive = (non_additive,)
    keep_first = set(non_additive)

    # Choose one aggregation per column: 'first' for text and for the
    # explicitly non-additive columns, 'sum' for everything else
    how = {}
    for col in df.columns:
        if col in keys:
            continue
        if col in keep_first or not pd.api.types.is_numeric_dtype(df[col]):
            how[col] = 'first'
        else:
            how[col] = 'sum'

    merged = df.groupby(keys, as_index=False).agg(how)

    # Re-calculate percentage based stats: FG%, 3P%, 2P%, eFG%, FT%
    # (the summed percentages of the individual rows are meaningless)
    # Round to 3 decimal places
    merged['FG%'] = (merged['FG'] / merged['FGA']).round(3)
    merged['3P%'] = (merged['3P'] / merged['3PA']).round(3)
    merged['2P%'] = (merged['2P'] / merged['2PA']).round(3)
    merged['eFG%'] = ((merged['FG'] + 0.5 * merged['3P']) / merged['FGA']).round(3)
    merged['FT%'] = (merged['FT'] / merged['FTA']).round(3)

    return merged

In [13]:
# Make sure the output directory exists; to_csv does not create missing folders
os.makedirs(clean_stats, exist_ok=True)

for file, df in dfs.items():
    # Merge duplicated player rows, then drop column 'Player-additional' if it exists
    cleaned = remove_duplicates(df).drop(columns='Player-additional', errors='ignore')

    # Keep the cleaned frame under the same key (keys are only reassigned,
    # never added or removed, so updating during iteration is safe)
    dfs[file] = cleaned

    # Save to clean directory under the original file name
    cleaned.to_csv(os.path.join(clean_stats, file), index=False)