In [242]:
import pandas as pd
import csv
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import networkx as nx

In [243]:
nba = pd.read_csv("all_seasons.csv")

In [244]:
# Clean year to only have rookie data
# Extract the first four characters of the 'season' column, convert to integer
nba['season_year'] = nba['season'].str[:4].astype(int)

# Replace "undrafted" with 0 in the 'draft_year' column
nba['draft_year'] = nba['draft_year'].replace('Undrafted', 0)
nba['draft_year'] = nba['draft_year'].astype(int)

In [245]:
# Filter the DataFrame
rookies = nba[nba['draft_year'] == nba['season_year']]
rookies

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season,season_year
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,1996
38,38,Erick Dampier,IND,21.0,210.82,120.201880,Mississippi State,USA,1996,1,...,4.1,0.6,-2.0,0.107,0.216,0.218,0.451,0.074,1996-97,1996
50,50,Jerome Williams,DET,24.0,205.74,93.439952,Georgetown,USA,1996,1,...,1.5,0.2,3.0,0.144,0.182,0.181,0.419,0.071,1996-97,1996
67,67,John Wallace,NYK,23.0,205.74,102.058200,Syracuse,USA,1996,1,...,2.3,0.5,2.7,0.080,0.148,0.204,0.571,0.081,1996-97,1996
72,72,Jermaine O'Neal,POR,18.0,210.82,102.511792,,USA,1996,1,...,2.8,0.2,1.3,0.099,0.198,0.199,0.494,0.030,1996-97,1996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12793,12793,Jake LaRavia,MEM,21.0,200.66,106.594120,Wake Forest,USA,2022,1,...,1.8,0.6,-7.6,0.044,0.100,0.117,0.531,0.068,2022-23,2022
12814,12814,Josh Minott,MIN,20.0,203.20,92.986360,Memphis,USA,2022,2,...,1.7,0.3,10.4,0.054,0.194,0.176,0.582,0.065,2022-23,2022
12819,12819,Johnny Davis,WAS,21.0,193.04,88.450440,Wisconsin,USA,2022,1,...,2.3,1.0,-6.2,0.020,0.125,0.192,0.446,0.100,2022-23,2022
12824,12824,Jaylin Williams,OKC,21.0,205.74,108.862080,Arkansas,USA,2022,2,...,4.9,1.6,-4.6,0.045,0.230,0.133,0.559,0.109,2022-23,2022


In [246]:
rookies = rookies.dropna()
rookies.isna().sum()

Unnamed: 0           0
player_name          0
team_abbreviation    0
age                  0
player_height        0
player_weight        0
college              0
country              0
draft_year           0
draft_round          0
draft_number         0
gp                   0
pts                  0
reb                  0
ast                  0
net_rating           0
oreb_pct             0
dreb_pct             0
usg_pct              0
ts_pct               0
ast_pct              0
season               0
season_year          0
dtype: int64

In [247]:
def create_network_with_players_and_points(data):
    # Create a dictionary to store the adjacency list
    adjacency_list = defaultdict(list)

    # Iterate over each row in the data
    for index, row in data.iterrows():
        college = row['college']
        team = row['team_abbreviation']
        player_name = row['player_name']
        draft_year = row['draft_year']
        score = row["composite_performance_score"]

        # Append the tuple (team, player_name, player_points) to the college's list
        adjacency_list[college].append((team, player_name, draft_year, score))

    return adjacency_list

In [248]:
# Replace negative net_rating values with 0
rookies['net_rating'] = rookies['net_rating'].apply(lambda x: max(x, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rookies['net_rating'] = rookies['net_rating'].apply(lambda x: max(x, 0))


In [249]:
rookies.isnull().sum()

Unnamed: 0           0
player_name          0
team_abbreviation    0
age                  0
player_height        0
player_weight        0
college              0
country              0
draft_year           0
draft_round          0
draft_number         0
gp                   0
pts                  0
reb                  0
ast                  0
net_rating           0
oreb_pct             0
dreb_pct             0
usg_pct              0
ts_pct               0
ast_pct              0
season               0
season_year          0
dtype: int64

In [250]:
# Ensure all values are numeric, filling NaNs with zero or an appropriate value if necessary
rookies.fillna(0, inplace=True)  # Or choose a strategy like rookies.fillna(rookies.mean(), inplace=True)

# Select only the columns to normalize
numeric_cols = ["pts", "reb", "ast", "net_rating", "ts_pct", "usg_pct"]
scaler = MinMaxScaler()

# Normalize the selected columns
normalized_vals = scaler.fit_transform(rookies[numeric_cols])
normalized_df = pd.DataFrame(normalized_vals, columns=numeric_cols)

# Ensure indices align between rookies and normalized_df
normalized_df.index = rookies.index

# Create a composite score by averaging the normalized values
rookies['composite_performance_score'] = normalized_df.mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rookies.fillna(0, inplace=True)  # Or choose a strategy like rookies.fillna(rookies.mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rookies['composite_performance_score'] = normalized_df.mean(axis=1)


In [251]:
rookies.to_csv("rookies.csv")

In [252]:
adj_list = create_network_with_players_and_points(rookies)

In [253]:
with open('adj_list.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in adj_list.items():
       writer.writerow([key, value])

In [254]:
def get_all_nodes_with_connections(adjacency_list):
    # Create a dictionary to count connections for colleges only
    college_connection_counts = defaultdict(int)

    # Iterate over each college in the adjacency list
    for college, connections in adjacency_list.items():
        # Count each player connection for the college
        college_connection_counts[college] += len(connections)

    # Sort the colleges by the number of connections in descending order
    sorted_college_connections = sorted(college_connection_counts.items(), key=lambda x: x[1], reverse=True)

    return sorted_college_connections

In [255]:
# Get the list of all nodes with their connections
sorted_nodes_with_connections = get_all_nodes_with_connections(adj_list)
for node, count in sorted_nodes_with_connections:
    print(f"'{node}': {count} connections")

'Kentucky': 54 connections
'Duke': 47 connections
'Arizona': 34 connections
'UCLA': 33 connections
'Kansas': 32 connections
'North Carolina': 31 connections
'Syracuse': 22 connections
'Connecticut': 21 connections
'Michigan State': 19 connections
'Michigan': 19 connections
'Maryland': 19 connections
'Villanova': 18 connections
'Washington': 18 connections
'Florida State': 17 connections
'Texas': 17 connections
'Louisville': 16 connections
'Florida': 16 connections
'Louisiana State': 15 connections
'Memphis': 15 connections
'Stanford': 15 connections
'Indiana': 14 connections
'Gonzaga': 14 connections
'Georgia Tech': 13 connections
'Southern California': 13 connections
'Oregon': 13 connections
'Tennessee': 12 connections
'Iowa State': 12 connections
'Georgetown': 11 connections
'Baylor': 11 connections
'Alabama': 10 connections
'Marquette': 10 connections
'Ohio State': 10 connections
'Vanderbilt': 10 connections
'Virginia': 10 connections
'California': 9 connections
'Cincinnati': 9 conn

In [256]:
def calculate_sorted_average_composite_score(adjacency_list):
    # Dictionary to store the average composite score for each college
    college_averages = {}

    # Iterate through the adjacency list
    for college, players in adjacency_list.items():
        # Initialize a sum of composite scores and player count
        composite_score_sum = 0
        player_count = 0

        # Iterate over each player's data in the list for the college
        for _, _, _, composite_score in players:
            composite_score_sum += composite_score
            player_count += 1
        
        # Calculate the average composite score for the college
        if player_count > 0:
            average_score = composite_score_sum / player_count
        else:
            average_score = float('nan')  # Handle division by zero if no players

        # Store the calculated average in the dictionary
        college_averages[college] = average_score
    
    # Sort the dictionary by average composite score in descending order
    sorted_college_averages = sorted(college_averages.items(), key=lambda x: x[1], reverse=True)

    return sorted_college_averages

In [257]:
# Calculate and sort the average composite score for each college
sorted_average_scores = calculate_sorted_average_composite_score(adj_list)
sorted_average_scores

[('Davidson', 0.48798689709702625),
 ('Rhode Island', 0.45111757420779286),
 ('Morehead State', 0.3721018631055677),
 ('Rider', 0.37087344219936824),
 ('Weber State', 0.3668408307165245),
 ('Miami (OH)', 0.36641991083154163),
 ('Wake Forest', 0.34068621406572963),
 ('Pacific', 0.3386376404605629),
 ('Saint Louis', 0.3304748366629162),
 ('Utah', 0.3297691592955176),
 ('Eastern Washington', 0.3254857078853061),
 ('Santa Clara', 0.3186986931269049),
 ('Murray State', 0.3160915915034928),
 ('Louisiana Tech', 0.3159563086927864),
 ('Ball State', 0.3110478420783173),
 ('Louisiana-Lafayette', 0.3078819291542773),
 ('TCU', 0.30426851110701403),
 ('Memphis', 0.3023473662131766),
 ('Connecticut', 0.3009791799976441),
 ('Duke', 0.30095457762660555),
 ('South Dakota State', 0.29941422870762),
 ('St. Bonaventure', 0.296922994979252),
 ('Kentucky', 0.2948739786213563),
 ('North Dakota', 0.2940468763828574),
 ('Oklahoma', 0.2925701653461593),
 ('Texas', 0.2898211171063898),
 ('University of Texas at 

In [258]:
def top_three_colleges_by_team(adjacency_list, target_team):
    # Dictionary to store the count of players drafted from each college by the target team
    college_counts = defaultdict(int)

    # Iterate through the adjacency list
    for college, players in adjacency_list.items():
        # Iterate through each player's data
        for team, player, year, composite_score in players:
            # If the team matches the target_team, increment the count for this college
            if team == target_team:
                college_counts[college] += 1

    # Sort the colleges by the number of players drafted by the target team in descending order
    sorted_colleges = sorted(college_counts.items(), key=lambda x: x[1], reverse=True)

    # Return the top three colleges or fewer if less than three exist
    top_three_colleges = sorted_colleges

    return top_three_colleges

In [259]:
target_team = input("Enter NBA three letter team code: ")
top_colleges = top_three_colleges_by_team(adj_list, target_team)

# Print out the results
for college, count in top_colleges:
    print(f"{college}: {count}")


Georgetown: 3
Kentucky: 2
Villanova: 2
Kansas: 2
UCLA: 2
Purdue: 2
Syracuse: 1
Michigan State: 1
Georgia: 1
Louisville: 1
Tennessee: 1
Connecticut: 1
Memphis: 1
Arizona: 1
Marquette: 1
Michigan: 1
Iowa State: 1
Cincinnati: 1
Colorado: 1
Duke: 1
Iowa: 1
Florida: 1
Washington: 1
Creighton: 1
Missouri: 1
Oklahoma State: 1
Vanderbilt: 1
North Carolina-Charlotte: 1
Pepperdine: 1
Gonzaga: 1
Texas A&M: 1
Eastern Washington: 1
Alabama-Birmingham: 1
North Texas: 1
Miami: 1
