In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
%matplotlib inline

## Generate & Clean Dataset

In [2]:
def format_dataframe(df, season):
    result = df.drop(columns=['Rk'])
    result.insert(loc=1, column='Season', value=season)
    result['eFG%'] = (df['FG'] + 0.5*df['3P']) / df['FGA']
    result = result.rename(columns={'3P': 'FG3', '3PA': 'FG3A', '3P%': 'FG3%', '2P': 'FG2', '2PA': 'FG2A', '2P%': 'FG2%'})
    return result

def merge_dataframes():
    file_path = '.data/'
    files = os.listdir(file_path)
    all_df_list = []
    for file in files:
        season = file.replace('.csv', '')
        print(f'Getting season {season}')
        df = pd.read_csv(f'{file_path}{file}', header=0)
        formatted_df = format_dataframe(df, season)
        all_df_list.append(formatted_df)

    total_df = pd.concat(all_df_list)

    return total_df


In [3]:
def aggregate_player_in_same_season(df):
    return df.groupby(['Season', 'Player', 'Player-additional', 'Pos', 'Age']).agg({
        'G': 'sum',
        'GS': 'sum',
        'MP': 'mean',
        'PTS': 'mean',
        'FG': 'mean',
        'FGA': 'mean',
        'FG%': 'mean',
        'FG3': 'mean',
        'FG3A': 'mean',
        'FG3%': 'mean',
        'FG2': 'mean',
        'FG2A': 'mean',
        'FG2%': 'mean',
        'FT': 'mean',
        'FTA': 'mean',
        'FT%': 'mean',
        'eFG%': 'mean',
        'ORB': 'mean',
        'DRB': 'mean',
        'AST': 'mean',
        'STL': 'mean',
        'TOV': 'mean',
        'BLK': 'mean'
    }).reset_index()

def remove_non_important_rows(df):
    min_gp = 10
    return total_df[total_df['G'] > min_gp]

total_df = merge_dataframes()

Getting season 1987_88
Getting season 2009_10
Getting season 1997_98
Getting season 1990_91
Getting season 2015_16
Getting season 2005_06
Getting season 1992_93
Getting season 2011_12
Getting season 2022_23
Getting season 1999_00
Getting season 2001_02
Getting season 1984_85
Getting season 2013_14
Getting season 2003_04
Getting season 1994_95
Getting season 2008_09
Getting season 2020_21
Getting season 2018_19
Getting season 1996_97
Getting season 1986_87
Getting season 1991_92
Getting season 1993_94
Getting season 2004_05
Getting season 2014_15
Getting season 1988_89
Getting season 1998_99
Getting season 2016_17
Getting season 2006_07
Getting season 1989_90
Getting season 2019_20
Getting season 2000_01
Getting season 2007_08
Getting season 2017_18
Getting season 2010_11
Getting season 1995_96
Getting season 2012_13
Getting season 2021_22
Getting season 2002_03
Getting season 2023_24


In [4]:
total_df = aggregate_player_in_same_season(total_df)

## Calculate Distance

In [5]:

# DISTANCE FUNCTION
def euclidean_distance(u, v):
    dist = np.sqrt(np.sum((u-v)**2))
    return dist

def calculate_distance(player1_stats, player2_stats):
    vectorized_distance = np.vectorize(euclidean_distance)
    distance_vect = vectorized_distance(player1_stats, player2_stats)
    return np.sum(np.abs(distance_vect)) / len(distance_vect)

## Normalize

In [6]:
#def normalize(col):
#    return (col - col.mean()) / (col.std())

# min-max
def normalize(col):
    return (col - col.min()) / (col.max() - col.min())

cols_to_norm = [
    'PTS',
    'MP',
    'FG',
    'FGA',
    'FG3',
    'FG3A',
    'FG2',
    'FG2A',
    'FT',
    'FTA',
    'ORB',
    'DRB',
    'AST',
    'STL',
    'TOV',
    'BLK'
]

def vorp(df):
    for col_name in cols_to_norm:
        df['{}_norm'.format(col_name)] = normalize(df[col_name])
    return df

df_norm = total_df.groupby(['Season']).apply(vorp)

## Calculating Player Similarity

In [7]:
# Player to compare
season = '2023_24'
player = 'Chet Holmgren'

In [8]:
# Player stats
player_stats_vector = np.array([
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'PTS_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'MP_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FG_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FGA_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FG3_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FG3A_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FG2_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FG2A_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FT_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'FTA_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'ORB_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'DRB_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'AST_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'STL_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'TOV_norm']).item(),
    (df_norm.loc[(df_norm['Player'] == player) & (df_norm['Season'] == season), 'BLK_norm']).item(),
])

player_distance = []

for row in df_norm.itertuples():
    compared_player_vector = np.array([
        row.PTS_norm,
        row.MP_norm,
        row.FG_norm,
        row.FGA_norm,
        row.FG3_norm,
        row.FG3A_norm,
        row.FG2_norm,
        row.FG2A_norm,
        row.FT_norm,
        row.FTA_norm,
        row.ORB_norm,
        row.DRB_norm,
        row.AST_norm,
        row.STL_norm,
        row.TOV_norm,
        row.BLK_norm
    ])

    distance = calculate_distance(player_stats_vector, compared_player_vector)
    player_distance.append(distance)

df_norm['distance'] = player_distance

In [13]:
ranked_df = df_norm.sort_values('distance')
ranked_df[ranked_df['Age'] < 23].head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Season,Player,Player-additional,Pos,Age,G,GS,MP,PTS,FG,...,FG2A_norm,FT_norm,FTA_norm,ORB_norm,DRB_norm,AST_norm,STL_norm,TOV_norm,BLK_norm,distance
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2023_24,17546,2023_24,Chet Holmgren,holmgch01,C,21,32,32,30.0,17.6,6.5,...,0.418478,0.281553,0.301724,0.301887,0.652174,0.212598,0.269231,0.418605,0.870968,0.0
2017_18,14456,2017_18,Myles Turner,turnemy01,C,21,65,62,28.2,12.7,4.7,...,0.427746,0.287356,0.316832,0.27451,0.458716,0.126214,0.25,0.3,0.692308,0.077622
2021_22,16554,2021_22,Jaren Jackson Jr.,jacksja02,PF,22,78,78,27.3,16.3,5.5,...,0.442623,0.375,0.372881,0.326087,0.390909,0.101852,0.391304,0.377778,0.821429,0.079994
2018_19,15002,2018_19,Myles Turner,turnemy01,C,22,74,74,28.6,13.3,5.1,...,0.478788,0.206186,0.245455,0.259259,0.522523,0.149533,0.333333,0.28,1.0,0.088978
2021_22,16899,2021_22,Wendell Carter Jr.,cartewe01,PF,22,62,61,29.9,15.0,5.8,...,0.415301,0.239583,0.279661,0.478261,0.745455,0.259259,0.26087,0.422222,0.25,0.090093
2008_09,10162,2008_09,Wilson Chandler,chandwi01,SF,21,82,70,33.4,14.4,5.5,...,0.463918,0.233333,0.242991,0.255814,0.438776,0.190909,0.321429,0.435897,0.310345,0.091019
2015_16,13379,2015_16,Kristaps Porziņģis,porzikr01,PF,20,72,72,28.4,14.3,5.2,...,0.514451,0.318182,0.323529,0.367347,0.533981,0.111111,0.333333,0.369565,0.513514,0.091937
2016_17,13857,2016_17,Kristaps Porziņģis,porzikr01,PF,21,66,65,32.8,18.1,6.7,...,0.53125,0.326087,0.348624,0.395349,0.533981,0.133929,0.35,0.315789,0.666667,0.092447
2020_21,15831,2020_21,Darius Bazley,bazleda01,PF,20,55,55,31.2,13.7,5.0,...,0.434524,0.25,0.299065,0.191489,0.623762,0.153846,0.238095,0.44,0.147059,0.092874
2019_20,15406,2019_20,Jaren Jackson Jr.,jacksja02,C,20,57,57,28.5,17.4,6.2,...,0.363636,0.235294,0.279661,0.232558,0.315789,0.137255,0.333333,0.354167,0.551724,0.100322
