In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sportsipy.ncaab.teams import Teams
from sklearn.cluster import KMeans

In [2]:
# Disabled: collects the Big Ten teams by iterating sportsipy's Teams() and
# keeping those whose conference is 'big-ten'. Only needed by the (also disabled)
# scraping code in the next cell; the notebook reads a saved CSV instead.
# big_ten_teams = []
# for team in Teams():
#     if (team.conference == 'big-ten'):
#         big_ten_teams.append(team)

In [3]:
# Disabled scraping path: walks each team's roster and stacks every player's
# '2021-22' stats row into one DataFrame; players that raise are collected in
# `issue_players`. Kept for reference -- the CSV loaded at the bottom of this
# cell is used instead.
# NOTE(review) before re-enabling:
#   - the loop iterates the global `big_ten_teams`, so the `teams` argument is ignored
#   - `issue_players = []` is a mutable default that is shared between calls
#   - the bare `except:` hides why a player could not be added
#   - calling `pd.concat` inside the loop re-copies the frame on every iteration
# def get_player_stats(teams: list, issue_players = []) -> pd.DataFrame:
#     df = pd.DataFrame()
#     for team in big_ten_teams:
#         print(f"Adding {team.name} players...")
#         for player in team.roster.players:
#             try:
#                 player_stats = player.dataframe.loc['2021-22']
#                 player_stats.insert(0, 'name', player.name)
#                 df = pd.concat([df, player_stats])
#             except:
#                 issue_players.append(player)
#     return df

# issue_players = []
# df = get_player_stats(big_ten_teams, issue_players)
# print(f"{len(issue_players)} players unable to be added: {issue_players}")

# Use this code to import directly from the dataset (faster)
# The column pandas labels 'Unnamed: 0' (a column with no header in the file) is
# used as the index -- presumably the index written when the frame was saved.
df = pd.read_csv('data/player_stats.csv', index_col='Unnamed: 0')

In [4]:
# Eligibility thresholds: players below either one are excluded from clustering
MIN_GAMES_PLAYED = 10
MIN_MINUTES_PLAYED = 50

# Irrelevant and completely empty (non-null count = 0) columns
UNUSED_COLUMNS = ['conference', 'height', 'player_id', 'team_abbreviation', 'weight']

# Work on a copy with a fresh 0..n-1 index so the raw `df` stays untouched
player_df = df.reset_index(drop=True).copy()

# Filter players by at least 10 games and 50 minutes played, then drop unused columns
is_eligible = (
    (player_df['games_played'] >= MIN_GAMES_PLAYED)
    & (player_df['minutes_played'] >= MIN_MINUTES_PLAYED)
)
player_df = player_df[is_eligible].drop(columns=UNUSED_COLUMNS)

# If a player has a missing 3FG%, it means they have 0 3FGA, so fill with 0
player_df['three_point_percentage'] = player_df['three_point_percentage'].fillna(0)

# One-hot encode the 'position' column.
# `.str.lower()` leaves a missing position as NaN (which get_dummies encodes as
# all zeros) instead of raising TypeError the way `apply(str.lower)` does.
position = player_df['position'].str.lower()
# dtype=int keeps the indicator columns as 0/1 on every pandas version
# (pandas >= 2.0 returns booleans by default).
ohe = pd.get_dummies(position, prefix='is', dtype=int)
player_df = player_df.drop(columns='position').join(ohe)

In [5]:
# Sanity check after cleaning: row count, dtype and non-null count of every column
player_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 195
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   name                             154 non-null    object 
 1   assist_percentage                154 non-null    float64
 2   assists                          154 non-null    int64  
 3   block_percentage                 154 non-null    float64
 4   blocks                           154 non-null    int64  
 5   box_plus_minus                   154 non-null    float64
 6   defensive_box_plus_minus         154 non-null    float64
 7   defensive_rebound_percentage     154 non-null    float64
 8   defensive_rebounds               154 non-null    int64  
 9   defensive_win_shares             154 non-null    float64
 10  effective_field_goal_percentage  154 non-null    float64
 11  field_goal_attempts              154 non-null    int64  
 12  field_goal_percentage 

In [6]:
# Preview the cleaned frame, including the one-hot is_<position> columns
player_df.head()

Unnamed: 0,name,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard
0,Kofi Cockburn,6.6,22,3.0,27,9.3,2.0,25.6,201,1.8,...,65,381,0.593,226,32.0,5.3,0.247,1,0,0
1,Alfonso Plummer,7.1,35,0.0,0,4.7,0.4,7.7,70,1.0,...,43,123,0.455,56,22.8,3.7,0.15,0,0,1
2,Trent Frazier,21.8,126,0.4,4,5.6,3.0,7.1,70,1.4,...,55,113,0.513,58,18.4,3.6,0.135,0,0,1
3,Jacob Grandison,17.9,69,0.6,5,7.0,2.0,12.0,82,0.9,...,37,90,0.522,47,18.7,3.0,0.159,0,0,1
4,Coleman Hawkins,14.8,50,2.6,17,6.1,4.2,16.4,94,1.2,...,45,89,0.551,49,18.4,2.1,0.133,0,1,0


In [7]:
# Disabled: annotated correlation heatmap over the columns of player_df.
# NOTE(review): player_df still contains the string 'name' column at this point;
# pandas >= 2.0 no longer skips non-numeric columns in corr() by default, so pass
# numeric_only=True (or drop 'name') before re-enabling.
# plt.figure(figsize=(35, 25))
# sns.heatmap(data=player_df.corr(), annot=True)

In [8]:
# Feature matrix for clustering: everything except the player identifier.
# 'group' is dropped too (when present) so that re-running this cell after the
# labels were attached to player_df does not feed old cluster ids back in as a feature.
model_data = player_df.drop(columns=['name', 'group'], errors='ignore')
num_clusters = 12

# n_init is pinned to 10 (the historical default) because scikit-learn 1.4 changed
# the default to 'auto', which would otherwise alter the clustering between versions.
# random_state is fixed so the labels are reproducible from run to run.
# NOTE(review): features are on very different scales (season totals, percentages,
# 0/1 position flags), so Euclidean distance is dominated by the large-magnitude
# columns. Standardising first is worth evaluating, but it changes the cluster
# assignments and therefore any labeling of group numbers done further down.
model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(model_data)
labels = model.labels_

In [9]:
# Pair every player name with the cluster id assigned to that row
player_groups = pd.DataFrame(data={'name': player_df['name'], 'group': labels})

# Roster of each cluster, keyed by a 1-based display label ("Group 1" is cluster id 0)
groups = {
    f"Group {cluster_id + 1}": player_groups.loc[player_groups['group'] == cluster_id, 'name'].tolist()
    for cluster_id in range(num_clusters)
}

# Print each roster under its label, separated by blank lines
for title, roster in groups.items():
    print(f"{title}:\n")
    print(roster, "\n\n\n")

Group 1:

['Jacob Grandison', "Da'Monte Williams", 'Parker Stewart', 'Miller Kopp', 'Filip Rebraca', 'Joey Hauser', 'C.J. Wilcher', 'Lat Mayen', 'Ty Berry', 'Robbie Beran', 'Jamari Wheeler', 'Myles Dread', 'Eric Hunter Jr.', 'Mason Gillis', 'Caleb McConnell'] 



Group 2:

['Andre Curbelo', 'Jordan Geronimo', 'Tamar Bates', 'Rob Phinisee', 'Trey Galloway', 'Payton Sandfort', 'Ahron Ulis', 'Ian Martinez', 'Terrance Williams II', 'Brandon Johns Jr', 'Frankie Collins', 'Trey McGowens', 'Eduardo Andre', 'Elyjah Williams', 'Cedric Russell', 'Meechie Johnson Jr.', 'Eugene Brown III', 'Dallion Johnson', 'Greg Lee', 'Jaheam Cornwall', 'Caleb Furst', 'Brandon Newman', 'Aundre Hyatt', 'Mawot Mag', 'Chris Vogt'] 



Group 3:

['Alfonso Plummer', 'Fatts Russell', 'Eric Ayala', 'Donta Scott', 'Gabe Brown', 'Eli Brooks', 'Jamison Battle', 'Payton Willis', 'Bryce McGowens', 'Boo Buie', 'Jalen Pickett', 'Ron Harper Jr.', 'Brad Davison'] 



Group 4:

['Patrick McCaffery', 'Malik Hall', 'Tyson Walker',

In [10]:
# Attach each player's cluster id as a 'group' column.
# `labels` is in the same row order as player_df (it was fitted on player_df minus
# 'name'), so it is assigned by position. The previous implicit merge on 'name'
# would silently duplicate rows whenever two players share a name.
# The index is reset to 0..n-1, matching what the merge used to produce; a length
# mismatch between `labels` and player_df raises instead of misaligning rows.
player_df = player_df.reset_index(drop=True).assign(group=labels)
player_df.head()

Unnamed: 0,name,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard,group
0,Kofi Cockburn,6.6,22,3.0,27,9.3,2.0,25.6,201,1.8,...,381,0.593,226,32.0,5.3,0.247,1,0,0,8
1,Alfonso Plummer,7.1,35,0.0,0,4.7,0.4,7.7,70,1.0,...,123,0.455,56,22.8,3.7,0.15,0,0,1,2
2,Trent Frazier,21.8,126,0.4,4,5.6,3.0,7.1,70,1.4,...,113,0.513,58,18.4,3.6,0.135,0,0,1,11
3,Jacob Grandison,17.9,69,0.6,5,7.0,2.0,12.0,82,0.9,...,90,0.522,47,18.7,3.0,0.159,0,0,1,0
4,Coleman Hawkins,14.8,50,2.6,17,6.1,4.2,16.4,94,1.2,...,89,0.551,49,18.4,2.1,0.133,0,1,0,6


In [11]:
# Counting stats that are easier to compare and label as per-game averages
per_game_stats = [
    'points',
    'assists',
    'total_rebounds',
    'steals',
    'blocks',
    'field_goals',
    'three_pointers',
]

# Divide each season total by that player's games played (row-wise), to 2 decimals
per_game_values = player_df[per_game_stats].div(player_df['games_played'], axis='index').round(2)

# Same frame as player_df, with the columns above replaced by their per-game values
player_df_per_game = player_df.copy()
player_df_per_game[per_game_stats] = per_game_values
player_df_per_game.head()

Unnamed: 0,name,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard,group
0,Kofi Cockburn,6.6,0.79,3.0,0.96,9.3,2.0,25.6,201,1.8,...,381,0.593,226,32.0,5.3,0.247,1,0,0,8
1,Alfonso Plummer,7.1,1.06,0.0,0.0,4.7,0.4,7.7,70,1.0,...,123,0.455,56,22.8,3.7,0.15,0,0,1,2
2,Trent Frazier,21.8,4.06,0.4,0.13,5.6,3.0,7.1,70,1.4,...,113,0.513,58,18.4,3.6,0.135,0,0,1,11
3,Jacob Grandison,17.9,2.3,0.6,0.17,7.0,2.0,12.0,82,0.9,...,90,0.522,47,18.7,3.0,0.159,0,0,1,0
4,Coleman Hawkins,14.8,1.52,2.6,0.52,6.1,4.2,16.4,94,1.2,...,89,0.551,49,18.4,2.1,0.133,0,1,0,6


In [12]:
# Columns averaged per cluster for data analysis and labeling
stats = [
    'points',
    'assists',
    'total_rebounds',
    'steals',
    'blocks',
    'field_goals',
    'field_goal_percentage',
    'three_pointers',
    'three_point_percentage',
    'free_throw_percentage',
    'usage_percentage',
    'player_efficiency_rating',
    'offensive_win_shares',
    'defensive_win_shares',
]

# Filter the data set by group number and average the selected stats for that group
def group_stats(group_number: int, df: pd.DataFrame = player_df_per_game, stats: list[str] = stats) -> pd.DataFrame:
    """Return a one-row summary of a cluster's average stats.

    Args:
        group_number: 1-based group number; rows whose 'group' column equals
            ``group_number - 1`` are selected.
        df: Frame holding a 'group' column and the stat columns. The default is
            the ``player_df_per_game`` object that existed when this function
            was defined.
        stats: Names of the columns to average. The default is the module-level
            ``stats`` list that existed when this function was defined.

    Returns:
        A DataFrame with a single row indexed by ``group_number`` and one column
        per entry in ``stats``, holding that column's mean rounded to 2 decimals.

    Raises:
        ValueError: If ``group_number`` is below 1 or above ``num_clusters``.
    """
    out_of_range = group_number < 1 or group_number > num_clusters
    if out_of_range:
        raise ValueError(f"Expected group_number to be between 1 and {num_clusters}, received {group_number}")

    # Stored cluster ids are 0-based, the public group numbers are 1-based
    members = df.loc[df['group'] == group_number - 1]
    averages = {column: [round(members[column].mean(), 2)] for column in stats}
    return pd.DataFrame(averages, index=[group_number])

# Stack the one-row summaries of groups 1..num_clusters into a single table
# indexed by group number
clusters = pd.concat([group_stats(cluster) for cluster in range(1, num_clusters + 1)])

In [13]:
# Show the per-group averages used to characterise each cluster
clusters

Unnamed: 0,points,assists,total_rebounds,steals,blocks,field_goals,field_goal_percentage,three_pointers,three_point_percentage,free_throw_percentage,usage_percentage,player_efficiency_rating,offensive_win_shares,defensive_win_shares
1,6.63,1.46,3.68,0.71,0.25,2.32,0.43,1.15,0.36,0.72,14.57,12.31,1.11,0.96
2,4.15,0.91,2.3,0.43,0.21,1.49,0.4,0.45,0.27,0.68,17.98,10.68,0.24,0.45
3,14.53,2.42,4.45,0.88,0.31,4.96,0.42,2.01,0.36,0.79,23.49,17.6,2.27,1.05
4,9.34,2.14,4.61,0.61,0.4,3.61,0.51,0.55,0.27,0.72,20.91,18.43,1.81,0.88
5,1.45,0.34,0.85,0.13,0.14,0.51,0.44,0.14,0.22,0.66,16.86,9.76,0.06,0.09
6,19.46,2.23,7.73,0.84,1.61,7.01,0.51,1.16,0.29,0.75,29.27,28.97,4.33,1.78
7,7.82,1.01,4.55,0.68,0.68,2.95,0.49,0.37,0.21,0.71,21.77,18.97,1.15,0.88
8,2.19,0.43,1.5,0.26,0.21,0.82,0.41,0.2,0.19,0.61,14.98,10.27,0.13,0.3
9,15.76,1.67,8.57,0.67,0.94,6.22,0.6,0.05,0.12,0.63,32.97,34.97,3.57,1.57
10,12.45,2.89,5.51,0.94,0.58,4.71,0.5,0.71,0.31,0.74,23.56,20.65,2.14,1.35
