In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sportsipy.ncaab.teams import Teams
from sklearn.cluster import KMeans

In [3]:
# big_ten_teams = []
# for team in Teams():
#     if (team.conference == 'big-ten'):
#         big_ten_teams.append(team)

In [4]:
# def get_player_stats(teams: list, issue_players = []) -> pd.DataFrame:
#     df = pd.DataFrame()
#     for team in big_ten_teams:
#         print(f"Adding {team.name} players...")
#         for player in team.roster.players:
#             try:
#                 player_stats = player.dataframe.loc['2021-22']
#                 player_stats.insert(0, 'name', player.name)
#                 df = pd.concat([df, player_stats])
#             except:
#                 issue_players.append(player)
#     return df

# issue_players = []
# df = get_player_stats(big_ten_teams, issue_players)
# print(f"{len(issue_players)} players unable to be added: {issue_players}")

# Load the player stats from the saved CSV instead of rebuilding them with the
# (commented-out) sportsipy loop above; reading the file is faster.
df = pd.read_csv(
    'data/player_stats.csv',
    index_col='Unnamed: 0',
)

In [5]:
# Work on a copy with a fresh 0-based index; `df` itself is left untouched.
player_df = df.reset_index(drop=True).copy()

# Filter players by at least 10 games and 50 minutes played
has_enough_playing_time = (player_df['games_played'] >= 10) & (player_df['minutes_played'] >= 50)
player_df = player_df[has_enough_playing_time]

# Drop irrelevant and completely empty (non-null count = 0) columns
player_df = player_df.drop(columns=['conference', 'height', 'player_id', 'team_abbreviation', 'weight'])

# If a player has a missing 3FG%, it means they have 0 3FGA, so fill with 0
player_df['three_point_percentage'] = player_df['three_point_percentage'].fillna(0)

# One-hot encode the 'position' column into is_<position> indicator columns.
# The vectorised `.str.lower()` accessor is used instead of `.apply(str.lower)`:
# it normalises case the same way for strings but passes a missing position
# through as NaN instead of raising a TypeError.
ohe = pd.get_dummies(player_df['position'].str.lower(), prefix='is')
player_df = player_df.drop(columns='position').join(ohe)

In [6]:
# Check column dtypes and non-null counts of the cleaned frame before clustering
player_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 195
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   name                             154 non-null    object 
 1   assist_percentage                154 non-null    float64
 2   assists                          154 non-null    int64  
 3   block_percentage                 154 non-null    float64
 4   blocks                           154 non-null    int64  
 5   box_plus_minus                   154 non-null    float64
 6   defensive_box_plus_minus         154 non-null    float64
 7   defensive_rebound_percentage     154 non-null    float64
 8   defensive_rebounds               154 non-null    int64  
 9   defensive_win_shares             154 non-null    float64
 10  effective_field_goal_percentage  154 non-null    float64
 11  field_goal_attempts              154 non-null    int64  
 12  field_goal_percentage 

In [7]:
# Preview the first rows of the cleaned, one-hot-encoded player data
player_df.head()

Unnamed: 0,name,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard
0,Kofi Cockburn,6.6,22,3.0,27,9.3,2.0,25.6,201,1.8,...,65,381,0.593,226,32.0,5.3,0.247,1,0,0
1,Alfonso Plummer,7.1,35,0.0,0,4.7,0.4,7.7,70,1.0,...,43,123,0.455,56,22.8,3.7,0.15,0,0,1
2,Trent Frazier,21.8,126,0.4,4,5.6,3.0,7.1,70,1.4,...,55,113,0.513,58,18.4,3.6,0.135,0,0,1
3,Jacob Grandison,17.9,69,0.6,5,7.0,2.0,12.0,82,0.9,...,37,90,0.522,47,18.7,3.0,0.159,0,0,1
4,Coleman Hawkins,14.8,50,2.6,17,6.1,4.2,16.4,94,1.2,...,45,89,0.551,49,18.4,2.1,0.133,0,1,0


In [8]:
# plt.figure(figsize=(35, 25))
# sns.heatmap(data=player_df.corr(), annot=True)

In [9]:
# Cluster on every remaining column; 'name' is only an identifier, so it is left out.
model_data = player_df.drop(columns='name')
num_clusters = 8

# random_state is fixed so repeated runs produce the same cluster assignments.
# NOTE(review): the features are passed to KMeans unscaled, so columns with large
# magnitudes will weigh more in the distance computation than percentages and the
# 0/1 position flags -- confirm this is intended or consider standardising first.
model = KMeans(n_clusters=num_clusters, random_state=0)
model.fit(model_data)

# KMeans labels start at 0; shift them so groups are numbered 1..num_clusters.
labels = model.labels_ + 1

In [10]:
# Pair every player's name with the 1-based cluster label computed for them.
player_groups = pd.DataFrame({'name': player_df['name'], 'group': labels})

# Map "Group N" -> list of the player names assigned to cluster N.
groups = {
    f"Group {group_id}": player_groups.loc[player_groups['group'] == group_id, 'name'].tolist()
    for group_id in range(1, num_clusters + 1)
}

# Print each group's roster, separated by blank lines for readability.
for group, members in groups.items():
    print(f"{group}:\n")
    print(members, "\n\n\n")

Group 1:

['RJ Melendez', 'Omar Payne', 'Luke Goode', 'Benjamin Bosmans-Verdonk', 'Brandin Podziemski', 'Michael Durr', 'Khristian Lander', 'Anthony Leal', 'Austin Ash', 'Josh Ogundele', 'Riley Mulvey', 'Simon Wright', 'Mady Sissoko', 'Pierre Brooks', 'Jace Howard', 'Jaron Faulds', 'Treyton Thompson', 'Wilhelm Breidenbach', 'Keon Edwards', 'Matthew Nicholson', 'Brooks Barnhizer', 'Joey Brunk', 'Jimmy Sotos', 'Jevonnie Scott', 'Caleb Dorsey', 'Dean Reiber', 'Jaden Jones', 'Ralph Agee', 'Oskar Palmquist', 'Jalen Miller', 'Lorne Bowman II', 'Jahcobi Neath', 'Jordan Davis', 'Carter Gilmore'] 



Group 2:

['Xavier Johnson', 'Race Thompson', 'DeVante Jones', 'Alonzo Verge Jr.', 'Pete Nance', 'Malaki Branham', 'Sam Sessoms', 'John Harrar', 'Zach Edey', 'Trevion Williams', 'Clifford Omoruyi', 'Tyler Wahl'] 



Group 3:

['Coleman Hawkins', 'Joe Toussaint', 'Connor McCaffery', 'Julian Reese', 'Xavier Green', 'Julius Marble', 'Jaden Akins', 'Sean Sutherlin', 'Kobe Webster', 'Lat Mayen', 'Keisei

In [11]:
# Attach each player's cluster label as a 'group' column.
# `labels` comes from fitting KMeans on `player_df` minus 'name', so it is already
# in the same row order as `player_df`. Assigning it positionally avoids joining
# `player_groups` back on the shared 'name' column, where any two players with the
# same name would be cross-matched (duplicated rows, possibly the wrong group).
# reset_index(drop=True) keeps the fresh 0..n-1 index that the merge used to produce.
player_df = player_df.assign(group=labels).reset_index(drop=True)
player_df.head()

Unnamed: 0,name,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard,group
0,Kofi Cockburn,6.6,22,3.0,27,9.3,2.0,25.6,201,1.8,...,381,0.593,226,32.0,5.3,0.247,1,0,0,8
1,Alfonso Plummer,7.1,35,0.0,0,4.7,0.4,7.7,70,1.0,...,123,0.455,56,22.8,3.7,0.15,0,0,1,6
2,Trent Frazier,21.8,126,0.4,4,5.6,3.0,7.1,70,1.4,...,113,0.513,58,18.4,3.6,0.135,0,0,1,6
3,Jacob Grandison,17.9,69,0.6,5,7.0,2.0,12.0,82,0.9,...,90,0.522,47,18.7,3.0,0.159,0,0,1,5
4,Coleman Hawkins,14.8,50,2.6,17,6.1,4.2,16.4,94,1.2,...,89,0.551,49,18.4,2.1,0.133,0,1,0,3


In [12]:
# Express the counting stats below as per-game averages to make data analysis and labeling easier
per_game_stats = [
    'points', 'assists', 'total_rebounds', 'steals', 'blocks', 'field_goals', 'three_pointers',
]

player_df_per_game = player_df.copy()

# Divide each total row-wise by that player's games played, rounded to two decimals.
player_df_per_game[per_game_stats] = (
    player_df_per_game[per_game_stats]
    .div(player_df_per_game['games_played'], axis='index')
    .round(2)
)

# Suffix the converted columns with "_per_game" so they are not mistaken for totals.
player_df_per_game = player_df_per_game.rename(
    columns={total_name: f"{total_name}_per_game" for total_name in per_game_stats}
)
player_df_per_game.head()

Unnamed: 0,name,assist_percentage,assists_per_game,block_percentage,blocks_per_game,box_plus_minus,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,is_center,is_forward,is_guard,group
0,Kofi Cockburn,6.6,0.79,3.0,0.96,9.3,2.0,25.6,201,1.8,...,381,0.593,226,32.0,5.3,0.247,1,0,0,8
1,Alfonso Plummer,7.1,1.06,0.0,0.0,4.7,0.4,7.7,70,1.0,...,123,0.455,56,22.8,3.7,0.15,0,0,1,6
2,Trent Frazier,21.8,4.06,0.4,0.13,5.6,3.0,7.1,70,1.4,...,113,0.513,58,18.4,3.6,0.135,0,0,1,6
3,Jacob Grandison,17.9,2.3,0.6,0.17,7.0,2.0,12.0,82,0.9,...,90,0.522,47,18.7,3.0,0.159,0,0,1,5
4,Coleman Hawkins,14.8,1.52,2.6,0.52,6.1,4.2,16.4,94,1.2,...,89,0.551,49,18.4,2.1,0.133,0,1,0,3


In [13]:
# Columns summarised per cluster for data analysis and labeling:
# the per-game stats first, then shooting/efficiency metrics, then the position flags.
stats = [f"{stat}_per_game" for stat in per_game_stats]
stats += [
    'field_goal_percentage', 'three_point_percentage', 'free_throw_percentage',
    'effective_field_goal_percentage', 'usage_percentage', 'player_efficiency_rating',
    'offensive_win_shares', 'defensive_win_shares',
    'is_center', 'is_forward', 'is_guard',
]

def group_stats(group_number: int, df: pd.DataFrame = player_df_per_game, stats: list[str] = stats) -> pd.DataFrame:
    """Summarise one cluster as a single row of per-stat averages.

    Selects the rows of ``df`` whose 'group' column equals ``group_number`` and
    averages each column named in ``stats``, rounded to two decimals.

    Note: the defaults for ``df`` and ``stats`` are bound when this ``def`` runs,
    so later reassignments of those notebook variables are not picked up unless
    this cell is re-executed or the arguments are passed explicitly.

    Args:
        group_number: 1-based cluster id, between 1 and ``num_clusters`` inclusive.
        df: Player-level frame containing a 'group' column and every column in ``stats``.
        stats: Names of the columns to average.

    Returns:
        A one-row DataFrame indexed by ``group_number`` with one column per stat.

    Raises:
        ValueError: If ``group_number`` is outside 1..``num_clusters``.
    """
    if group_number > num_clusters or group_number < 1:
        raise ValueError(f"Expected group_number to be between 1 and {num_clusters}, received {group_number}")

    members = df.loc[df['group'] == group_number]
    # One single-element list per stat so the result is exactly one row.
    averages = {stat: [round(members[stat].mean(), 2)] for stat in stats}
    return pd.DataFrame(averages, index=[group_number])

# Stack the one-row summaries of every group (1..num_clusters) into a single table.
cluster_ids = range(1, num_clusters + 1)
clusters = pd.concat([group_stats(cluster_id) for cluster_id in cluster_ids])

In [14]:
# Display the per-group averages (one row per group, indexed by group number)
clusters

Unnamed: 0,points_per_game,assists_per_game,total_rebounds_per_game,steals_per_game,blocks_per_game,field_goals_per_game,three_pointers_per_game,field_goal_percentage,three_point_percentage,free_throw_percentage,effective_field_goal_percentage,usage_percentage,player_efficiency_rating,offensive_win_shares,defensive_win_shares,is_center,is_forward,is_guard
1,1.78,0.39,1.09,0.18,0.17,0.64,0.18,0.43,0.22,0.64,0.49,16.08,10.15,0.1,0.17,0.12,0.41,0.47
2,12.34,2.6,6.05,0.83,0.57,4.73,0.57,0.52,0.27,0.72,0.55,25.16,23.39,2.38,1.32,0.17,0.42,0.42
3,5.09,1.14,2.79,0.6,0.27,1.78,0.69,0.42,0.33,0.74,0.51,15.07,12.55,0.76,0.66,0.0,0.37,0.63
4,3.82,0.82,2.22,0.41,0.21,1.39,0.39,0.41,0.24,0.67,0.46,17.43,10.42,0.21,0.44,0.07,0.33,0.6
5,8.37,1.42,4.73,0.61,0.6,3.19,0.56,0.5,0.25,0.71,0.54,20.49,18.58,1.51,0.91,0.06,0.65,0.29
6,13.29,2.39,3.95,0.86,0.31,4.49,1.99,0.41,0.35,0.81,0.51,22.0,16.24,2.1,1.01,0.0,0.2,0.8
7,7.53,2.19,3.08,0.93,0.18,2.69,1.12,0.42,0.37,0.74,0.51,15.42,12.36,1.17,0.99,0.0,0.13,0.87
8,19.67,2.02,8.14,0.84,1.52,7.16,0.99,0.53,0.25,0.74,0.56,29.66,29.41,4.21,1.79,0.29,0.43,0.29


In [15]:
# clusters.to_csv('clusters.csv')

## Same analysis, but with KMeans trained on per-game stats instead of total stats

In [16]:
# Re-cluster using only the columns summarised in `clusters`
# (per-game stats, efficiency metrics and position flags).
per_game_model_data = player_df_per_game.loc[:, clusters.columns]

# Same cluster count and seed as the first model.
per_game_model = KMeans(n_clusters=num_clusters, random_state=0)
per_game_model.fit(per_game_model_data)

# Shift the 0-based KMeans labels to 1-based group numbers.
per_game_labels = per_game_model.labels_ + 1

In [17]:
# Pair every player's name with the 1-based label from the per-game model.
per_game_player_groups = pd.DataFrame({'name': player_df['name'], 'group': per_game_labels})

# Map "Group N" -> list of the player names assigned to per-game cluster N.
per_game_groups = {
    f"Group {group_id}": per_game_player_groups.loc[per_game_player_groups['group'] == group_id, 'name'].tolist()
    for group_id in range(1, num_clusters + 1)
}

# Print each group's roster, separated by blank lines for readability.
for group, members in per_game_groups.items():
    print(f"{group}:\n")
    print(members, "\n\n\n")

Group 1:

["Da'Monte Williams", 'Omar Payne', 'Luke Goode', 'Benjamin Bosmans-Verdonk', 'Anthony Leal', 'Filip Rebraca', 'Joe Toussaint', 'Ahron Ulis', 'Connor McCaffery', 'Jaden Akins', 'Mady Sissoko', 'Terrance Williams II', 'Treyton Thompson', 'Eduardo Andre', 'Ryan Greer', 'Julian Roper II', 'Eugene Brown III', 'Joey Brunk', 'Jimmy Sotos', 'Jalanni White', 'Mason Gillis', 'Isaiah Thompson', 'Caleb Furst', 'Ethan Morton', 'Oskar Palmquist', 'Chris Vogt'] 



Group 2:

['RJ Melendez', 'Race Thompson', 'Kris Murray', 'Austin Ash', 'Marcus Bingham', 'Ryan Young', 'Zed Key', 'John Harrar', 'Clifford Omoruyi'] 



Group 3:

['Brandin Podziemski', 'Tamar Bates', 'Rob Phinisee', 'Khristian Lander', 'Josh Ogundele', 'Ian Martinez', 'Pierre Brooks', 'Brandon Johns Jr', 'Frankie Collins', 'Kobe Bufkin', 'Jace Howard', 'Jaron Faulds', 'Wilhelm Breidenbach', 'Elyjah Williams', 'Casey Simmons', 'Brooks Barnhizer', 'Cedric Russell', 'Meechie Johnson Jr.', 'Dallion Johnson', 'Jevonnie Scott', 'Bra

In [18]:
# Put the model features side by side with each player's name and per-game group,
# aligning rows on the shared index.
per_game_clusters = pd.concat(
    [per_game_model_data, per_game_player_groups],
    axis='columns',
)
per_game_clusters.head()

Unnamed: 0,points_per_game,assists_per_game,total_rebounds_per_game,steals_per_game,blocks_per_game,field_goals_per_game,three_pointers_per_game,field_goal_percentage,three_point_percentage,free_throw_percentage,effective_field_goal_percentage,usage_percentage,player_efficiency_rating,offensive_win_shares,defensive_win_shares,is_center,is_forward,is_guard,name,group
0,20.89,0.79,10.57,0.82,0.96,8.07,0.0,0.593,0.0,0.655,0.593,32.0,32.1,3.5,1.8,1,0,0,Kofi Cockburn,4
1,14.61,1.06,2.45,0.3,0.0,4.67,2.97,0.424,0.408,0.874,0.559,22.8,16.2,2.8,1.0,0,0,1,Alfonso Plummer,6
2,11.61,4.06,2.81,1.32,0.13,3.9,2.03,0.397,0.328,0.833,0.5,18.4,14.7,2.2,1.4,0,0,1,Trent Frazier,5
3,9.57,2.3,3.83,0.33,0.17,3.4,1.83,0.455,0.41,0.824,0.578,18.7,17.1,2.1,0.9,0,0,1,Jacob Grandison,5
4,5.88,1.52,4.27,0.79,0.52,2.06,0.58,0.442,0.292,0.65,0.503,18.4,16.1,0.9,1.2,0,1,0,Coleman Hawkins,5


In [19]:
# Average every stat within each per-game cluster (one row per group).
# The player-level frame is rebuilt here rather than read from `per_game_clusters`.
# That name was used for both the player-level input and this aggregated output, so
# re-running the cell passed the aggregated table (which has no 'group' column)
# back into group_stats and raised a KeyError.
per_game_player_stats = pd.concat([per_game_model_data, per_game_player_groups], axis=1)
per_game_clusters = pd.concat(
    [group_stats(cluster, per_game_player_stats) for cluster in range(1, num_clusters + 1)]
)

In [20]:
# Display the per-group averages for the per-game model (one row per group)
per_game_clusters

Unnamed: 0,points_per_game,assists_per_game,total_rebounds_per_game,steals_per_game,blocks_per_game,field_goals_per_game,three_pointers_per_game,field_goal_percentage,three_point_percentage,free_throw_percentage,effective_field_goal_percentage,usage_percentage,player_efficiency_rating,offensive_win_shares,defensive_win_shares,is_center,is_forward,is_guard
1,3.1,0.95,2.31,0.38,0.25,1.11,0.35,0.46,0.27,0.65,0.53,13.13,12.4,0.62,0.53,0.15,0.35,0.5
2,8.45,0.79,5.36,0.52,0.73,3.29,0.34,0.55,0.26,0.68,0.59,23.14,23.99,1.73,1.03,0.11,0.56,0.33
3,2.86,0.57,1.4,0.33,0.15,1.03,0.36,0.38,0.22,0.63,0.44,18.64,8.88,0.04,0.27,0.0,0.35,0.65
4,18.34,1.91,8.4,0.77,1.49,6.88,0.67,0.55,0.22,0.71,0.57,30.73,32.02,4.12,1.75,0.38,0.5,0.12
5,8.3,1.84,3.72,0.66,0.37,3.02,0.81,0.48,0.29,0.76,0.54,19.58,16.98,1.55,0.84,0.06,0.48,0.45
6,13.81,2.76,4.33,0.94,0.36,4.8,1.66,0.42,0.35,0.78,0.49,25.27,17.65,1.89,1.05,0.0,0.19,0.81
7,1.27,0.35,1.15,0.18,0.14,0.44,0.13,0.35,0.2,0.71,0.4,11.71,5.22,-0.04,0.22,0.08,0.46,0.46
8,6.85,1.39,2.95,0.78,0.19,2.39,1.13,0.4,0.35,0.73,0.5,15.76,10.94,0.74,0.76,0.0,0.2,0.8


In [23]:
# Rank the clusters by one column (highest first) for data analysis and labeling;
# change the column name to compare groups on a different stat.
per_game_clusters.sort_values('three_point_percentage', ascending=False)

Unnamed: 0,points_per_game,assists_per_game,total_rebounds_per_game,steals_per_game,blocks_per_game,field_goals_per_game,three_pointers_per_game,field_goal_percentage,three_point_percentage,free_throw_percentage,effective_field_goal_percentage,usage_percentage,player_efficiency_rating,offensive_win_shares,defensive_win_shares,is_center,is_forward,is_guard
6,13.81,2.76,4.33,0.94,0.36,4.8,1.66,0.42,0.35,0.78,0.49,25.27,17.65,1.89,1.05,0.0,0.19,0.81
8,6.85,1.39,2.95,0.78,0.19,2.39,1.13,0.4,0.35,0.73,0.5,15.76,10.94,0.74,0.76,0.0,0.2,0.8
5,8.3,1.84,3.72,0.66,0.37,3.02,0.81,0.48,0.29,0.76,0.54,19.58,16.98,1.55,0.84,0.06,0.48,0.45
1,3.1,0.95,2.31,0.38,0.25,1.11,0.35,0.46,0.27,0.65,0.53,13.13,12.4,0.62,0.53,0.15,0.35,0.5
2,8.45,0.79,5.36,0.52,0.73,3.29,0.34,0.55,0.26,0.68,0.59,23.14,23.99,1.73,1.03,0.11,0.56,0.33
3,2.86,0.57,1.4,0.33,0.15,1.03,0.36,0.38,0.22,0.63,0.44,18.64,8.88,0.04,0.27,0.0,0.35,0.65
4,18.34,1.91,8.4,0.77,1.49,6.88,0.67,0.55,0.22,0.71,0.57,30.73,32.02,4.12,1.75,0.38,0.5,0.12
7,1.27,0.35,1.15,0.18,0.14,0.44,0.13,0.35,0.2,0.71,0.4,11.71,5.22,-0.04,0.22,0.08,0.46,0.46


**Evaluated Groups:**
* Group 1: Low-Usage, yet somewhat Efficient Players (2nd Lowest in USG%, 4th Lowest PER)

* Group 2: Efficient Interior Players (2nd in RPG, BPG; T-1st FG%; 1st in EFG%)

* Group 3: 3rd String Players (2nd Lowest PPG, APG, RPG, SPG, etc.)

* Group 4: Elite Big Men (1st in PPG, RPG, BPG)

* Group 5: Average Role-Players

* Group 6: Play-Making Wings (2nd in PPG; 1st in APG, SPG, 3PPG)

* Group 7: Garbage-Time Substitutions (Lowest PPG, APG, RPG, SPG, etc.)

* Group 8: Miscellaneous Archetypes 
