In [1]:
import re
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Functions

In [2]:
def getDraftURL(year):
    """
    Build the tankathon past-draft page URL for the given draft year
    """
    draft_url = f"https://www.tankathon.com/past_drafts/{year}"
    return draft_url

def getSoupFromURL(url, timeout = 30):
    """
    Download a page and parse it into a BeautifulSoup tree

    url: address of the page to fetch
    timeout: seconds to wait for the server before giving up (requests waits
             indefinitely when no timeout is given)

    Returns the parsed BeautifulSoup object.
    Raises requests.exceptions.HTTPError for 4xx/5xx responses and
    requests.exceptions.Timeout when the server does not answer in time.
    """
    response = requests.get(url, timeout = timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later when an expected element is missing
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def getDraftRows(soup):
    """
    Collect every 'mock-row' element found inside the 'mock-rows' containers of a draft page soup
    """
    containers = soup.find_all(attrs = {'class' : 'mock-rows'})
    return [
        row
        for container in containers
        for row in container.find_all(attrs = {'class' : 'mock-row'})
    ]

def getPick(draft_row):
    """
    Return the text of the pick-number cell of a draft row

    If the cell contains a nested <div>, that div is removed from the row first
    (the row is modified in place), so its text is not part of the result.
    """
    pick_cell = draft_row.find(attrs = {'class' : 'mock-row-pick-number'})
    nested_div = pick_cell.find('div')
    if nested_div is not None:
        nested_div.extract()
    return pick_cell.text
    
def getDraftTeam(draft_row):
    """
    Return the drafting team, read from the alt text of the image inside the row's 'mock-row-logo' div
    """
    logo_cell = draft_row.find('div', attrs = {'class' : 'mock-row-logo'})
    logo_img = logo_cell.find('img')
    return logo_img['alt']
    
def getReference(draft_row):
    """
    Get the absolute URL of the player page linked from a given draft row

    Returns None when the row has no 'mock-row-player' cell or that cell has no
    link, so callers can skip the row.
    """
    player_cell = draft_row.find(attrs = {'class' : 'mock-row-player'})
    if player_cell is None:
        return None
    link = player_cell.find('a')
    if link is None:
        return None
    MAIN = "https://www.tankathon.com/"
    # urljoin copes with hrefs with or without a leading slash (plain
    # concatenation produced "//players/..." for the former)
    return urljoin(MAIN, link['href'])

def getName(player_soup):
    """
    Return the player's name, taken from the text of the first <h1> on the player page
    """
    heading = player_soup.find('h1')
    return heading.text

def getBlockInfo(block):
    """
    Split a data block into the text of its 'label' div and the text of its 'data' div
    """
    def _div_text(css_class):
        return block.find('div', attrs = {'class' : css_class}).text

    label = _div_text('label')
    value = _div_text('data')
    return label, value

def getPlayerInfo(player_soup):
    """
    Read the raw biographical label -> value pairs from the 'player-info' section of a player page

    A 'Wingspan' entry is added alongside 'Height': it holds the text of the
    <span> inside the height block, or '' when that block has no <span>.
    """
    info_section = player_soup.find('div', attrs = {'class' : 'player-info'})
    info = {}
    for block in info_section.find_all('div', attrs = {'class' : 'data-block'}):
        label, value = getBlockInfo(block)
        info[label] = value
        if label != "Height":
            continue
        span = block.find('span')
        info['Wingspan'] = span.text if span is not None else ''
    return info

def filterStatHeaders(stat_headers_raw):
    """
    Keep only the headers whose text contains none of the excluded phrases
    """
    remove_if_see = ['STRENGTHS', 'TOP ', 'GAME LOG', "NBA COMBINE", " DRAFT"]
    return [
        header
        for header in stat_headers_raw
        if not any(phrase in header.text for phrase in remove_if_see)
    ]

def getPlayerStats(player_soup):
    """
    Collect raw stat label -> text value pairs from a player page

    "MP" is taken from the "PER GAME AVERAGES" section; every other label is
    taken from the remaining (non per-game) sections. The first 'game-log' and
    'stats combine' divs are removed from player_soup before the stat sections
    are read, so the soup is modified in place.
    """
    header_divs = filterStatHeaders(
        player_soup.find_all('div', attrs = {'class' : 'stats-header'})
    )
    # Remove sections whose headers were filtered out above -- presumably so the
    # remaining 'stats' divs pair up one-to-one with the kept headers
    for class_type in ['game-log', 'stats combine']:
        unwanted = player_soup.find('div', attrs = {'class' : class_type})
        if unwanted is not None:
            unwanted.extract()
    stat_divs = player_soup.find_all('div', attrs = {'class' : 'stats'})
    assert len(header_divs) == len(stat_divs)
    player_data = {}
    if not header_divs:
        return player_data
    for header, stats in zip(header_divs, stat_divs):
        is_per_game = "PER GAME AVERAGES" in header.text
        stat_row = stats.find('div', attrs = {'class' : 'stat-row'})
        for cell in stat_row.contents:
            parts = cell.contents
            label = parts[0].text
            val = parts[1].text
            # keep MP only from the per-game section, everything else only from the others
            if is_per_game == (label == "MP"):
                player_data[label] = val
    return player_data

def _feetInchesToInches(measurement):
    """
    Convert the first feet'inches token in a string (e.g. 6'10.5) to total inches

    Returns np.nan when the input is not a string or contains no such token.
    """
    if not isinstance(measurement, str):
        return np.nan
    match = re.search(r'(\d+)\'(\d+(?:\.\d+)?)', measurement)
    if match is None:
        return np.nan
    return 12 * int(match.group(1)) + float(match.group(2))

def cleanHeight(ht):
    """
    Use regular expressions to extract height in inches from string

    Returns np.nan (instead of raising) when no height can be parsed.
    """
    return _feetInchesToInches(ht)

def cleanWingspan(ws):
    """
    Use regular expressions to extract wingspan in inches from string

    Returns np.nan for an empty or otherwise unparseable string.
    """
    return _feetInchesToInches(ws)

def cleanPlayerInfo(info):
    """
    Make all biographical information neater (convert strings to numbers when necessary, clean up stuff, etc)

    info: dict of raw label -> text as produced by getPlayerInfo
    Returns a new dict where
      - 'Height' / 'Wingspan' are inches,
      - 'Weight' is the first integer found in the text,
      - 'Draft Age' is the first decimal number found in the text,
      - 'Draft' is replaced by 'Draft Team' (last ' | '-separated field),
      - 'ESPN 100', 'High School' and 'Hometown' are dropped,
      - every other value is whitespace-stripped.
    Weight / Draft Age become np.nan when the text contains no number.
    """
    new_info = {}
    for key, value in info.items():
        if key == 'Height':
            new_info[key] = cleanHeight(value)
        elif key == 'Wingspan':
            new_info[key] = cleanWingspan(value)
        elif key == 'Weight':
            # Extract the digits rather than slicing a fixed width, which
            # mis-read padded values (' 240 lbs' -> 24)
            match = re.search(r'\d+', value)
            new_info[key] = int(match.group()) if match else np.nan
        elif key == 'Draft Age':
            # Same idea: do not depend on the exact length of the unit suffix
            match = re.search(r'\d+(?:\.\d+)?', value)
            new_info[key] = float(match.group()) if match else np.nan
        elif key == 'Draft':
            new_info['Draft Team'] = value.split(' | ')[-1].strip()
        elif key in ['ESPN 100', 'High School', 'Hometown']:
            continue
        else:
            new_info[key] = value.strip()
    return new_info

def cleanStatName(s):
    """
    Map a raw stat label to its column name; labels without a mapping are returned unchanged
    """
    renames = {'True Shooting %TS%' : 'TS%',
               'Effective FG%EFG%' : 'eFG%',
               '3PA Rate3PAR' : '3PAr',
               'FTA RateFTAR' : 'FTAr'}
    return renames.get(s, s)

def cleanPlayerStats(stats):
    """
    Clean player stats from the player page

    stats: dict of raw stat label -> text value as produced by getPlayerStats
    Returns a new dict of column name -> float where
      - 'ORTG', 'DRTG' and the projected NBA 3P% are dropped,
      - hyphenated pairs such as 'FGM-FGA': '4.4-12.0' are split into two columns,
      - blank win-share / BPM values become np.nan,
      - any other blank value becomes 0.0.
    """
    exclude = ['ORTG', 'DRTG', 'Proj NBA 3P%NBA 3P%']
    fill_with_na = ['OWS/40', 'DWS/40', 'WS/40', 'OBPM', 'DBPM', 'BPM']
    new_stats = {}
    for stat, value in stats.items():
        if stat in exclude:
            continue
        if '-' in stat:
            names = stat.split('-')
            # A completely blank pair has no '-' to split on; treat every part as blank
            parts = [''] * len(names) if value == '' else value.split('-')
            for n, v in zip(names, parts):
                # blank parts follow the same 0.0 rule as other blank counting stats
                new_stats[n] = float(v) if v != '' else 0.0
            continue
        # Always store under the cleaned column name, including for blank values
        name = cleanStatName(stat)
        if value == '':
            new_stats[name] = np.nan if stat in fill_with_na else 0.0
        else:
            new_stats[name] = float(value)
    return new_stats

def getPlayerAll(player_soup):
    """
    Build one dictionary with the cleaned biographical info, the cleaned stats and the 'Name' of a player page
    """
    name = getName(player_soup)
    player = cleanPlayerInfo(getPlayerInfo(player_soup))
    # stats are merged on top of the bio fields, so a stat key wins on a clash
    player.update(cleanPlayerStats(getPlayerStats(player_soup)))
    player['Name'] = name
    return player

def mergeDicts(dict_list):
    """
    Given a list of dictionaries mapping strings to lists, return a new dictionary mapping strings to
    lists that combine all the lists for that key within the input dict_list

    An empty dict_list yields an empty dictionary. Keys keep the order in which
    they are first seen; a key that only appears in a later dictionary is added
    rather than raising KeyError. The input lists are not modified.
    """
    master = {}
    for dct in dict_list:
        for key, val_list in dct.items():
            master.setdefault(key, []).extend(val_list)
    return master

def getAllCols():
    """
    Return the output column names in their final order (a new list on every call)
    """
    identity = ['Name', 'Year', 'Draft Team', 'Pick', 'Team', 'Position']
    bio = ['Height', 'Wingspan', 'Weight', 'Draft Age', 'Birthdate', 'Nation']
    box_score = ['G', 'MP', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
                 'REB', 'AST', 'BLK', 'STL', 'TO', 'PF', 'PTS']
    advanced = ['TS%', 'eFG%', '3PAr', 'FTAr', 'USG%', 'AST/USG', 'AST/TO', 'PER',
                'OWS/40', 'DWS/40', 'WS/40', 'OBPM', 'DBPM', 'BPM']
    return identity + bio + box_score + advanced

def getYearStats(year):
    """
    Gather draft player data for one season only (returns a dict mapping each column to a list of values)

    Rows with an empty pick number (forfeited picks) and rows without a player
    link are skipped. 'Pick', 'Draft Team' and 'Year' come from the draft page
    and overwrite any same-named value read from the player page. Columns a
    player has no value for are filled with np.nan.
    """
    draft_rows = getDraftRows(getSoupFromURL(getDraftURL(year)))
    data = {col : [] for col in getAllCols()}
    for row in tqdm(draft_rows):
        pick_text = getPick(row)
        # an empty pick cell marks a forfeited pick
        if pick_text == '':
            continue
        pick = int(pick_text)
        draft_team = getDraftTeam(row)
        ref = getReference(row)
        if ref is None:
            continue
        # pause between player-page requests
        time.sleep(2)
        player_info = getPlayerAll(getSoupFromURL(ref))
        player_info.update({'Pick' : pick, 'Draft Team' : draft_team, 'Year' : year})
        for stat, values in data.items():
            values.append(player_info.get(stat, np.nan))
    return data

def getAllDraftHistory(start_year, end_year):
    """
    Scrape every draft from start_year through end_year (inclusive) and merge the
    per-year results into one dict mapping each column to a list of values
    """
    def _scrapeYear(yr):
        print(f"Retreiving Data for {yr}")
        return getYearStats(yr)

    per_year = [_scrapeYear(yr) for yr in range(start_year, end_year + 1)]
    return mergeDicts(per_year)

def getDraftBoard(year):
    """
    Scrape the players listed in the 'big-board' section of the tankathon big board page

    year is only used to fill the 'Year' column; the page URL does not depend on it.
    'Pick' is the row's number on the board. Rows without a player link are skipped.
    Returns a dict mapping each column to a list of values (np.nan where a player
    has no value for that column).
    """
    page_soup = getSoupFromURL("https://www.tankathon.com/big_board")
    board_soup = page_soup.find('div', attrs = {'id' : 'big-board'})
    data = {col : [] for col in getAllCols()}
    for row in tqdm(getDraftRows(board_soup)):
        pick = int(getPick(row))
        ref = getReference(row)
        if ref is None:
            continue
        # pause between player-page requests
        time.sleep(2)
        player_info = getPlayerAll(getSoupFromURL(ref))
        player_info['Pick'] = pick
        player_info['Year'] = year
        for stat, values in data.items():
            values.append(player_info.get(stat, np.nan))
    return data

# Scrape

## All

In [3]:
# d_soup = getSoupFromURL(getDraftURL(2023))
# d_rows = getDraftRows(d_soup)
# ref = getReference(d_rows[0])
# time.sleep(1)
# p_soup = getSoupFromURL(ref)

In [4]:
# info = getPlayerInfo(p_soup)

In [5]:
# test = getYearStats(2004)

In [6]:
# Scrape every past draft class (2004-2023) and the current big board, then load both into DataFrames.
# NOTE: slow -- in the recorded run below each draft year took roughly two and a half minutes.
past_data = getAllDraftHistory(2004, 2023)
current_board = getDraftBoard(2024)
draft_players = pd.DataFrame(past_data)
draft_players24 = pd.DataFrame(current_board)
draft_players.head()

Retreiving Data for 2004


100%|███████████████████████████████████████████| 60/60 [02:22<00:00,  2.38s/it]


Retreiving Data for 2005


100%|███████████████████████████████████████████| 60/60 [02:24<00:00,  2.41s/it]


Retreiving Data for 2006


100%|███████████████████████████████████████████| 60/60 [02:52<00:00,  2.87s/it]


Retreiving Data for 2007


100%|███████████████████████████████████████████| 60/60 [02:25<00:00,  2.42s/it]


Retreiving Data for 2008


100%|███████████████████████████████████████████| 60/60 [02:24<00:00,  2.41s/it]


Retreiving Data for 2009


100%|███████████████████████████████████████████| 60/60 [02:26<00:00,  2.45s/it]


Retreiving Data for 2010


100%|███████████████████████████████████████████| 60/60 [02:24<00:00,  2.42s/it]


Retreiving Data for 2011


100%|███████████████████████████████████████████| 60/60 [02:25<00:00,  2.42s/it]


Retreiving Data for 2012


100%|███████████████████████████████████████████| 60/60 [02:29<00:00,  2.49s/it]


Retreiving Data for 2013


100%|███████████████████████████████████████████| 63/63 [02:31<00:00,  2.41s/it]


Retreiving Data for 2014


100%|███████████████████████████████████████████| 60/60 [02:35<00:00,  2.60s/it]


Retreiving Data for 2015


100%|███████████████████████████████████████████| 60/60 [02:24<00:00,  2.41s/it]


Retreiving Data for 2016


100%|███████████████████████████████████████████| 69/69 [02:26<00:00,  2.12s/it]


Retreiving Data for 2017


100%|███████████████████████████████████████████| 64/64 [02:26<00:00,  2.28s/it]


Retreiving Data for 2018


100%|███████████████████████████████████████████| 60/60 [02:24<00:00,  2.41s/it]


Retreiving Data for 2019


100%|███████████████████████████████████████████| 60/60 [02:26<00:00,  2.45s/it]


Retreiving Data for 2020


100%|███████████████████████████████████████████| 77/77 [02:24<00:00,  1.88s/it]


Retreiving Data for 2021


100%|███████████████████████████████████████████| 74/74 [02:25<00:00,  1.96s/it]


Retreiving Data for 2022


100%|███████████████████████████████████████████| 79/79 [02:21<00:00,  1.79s/it]


Retreiving Data for 2023


100%|███████████████████████████████████████████| 74/74 [02:20<00:00,  1.89s/it]
100%|█████████████████████████████████████████| 140/140 [05:35<00:00,  2.40s/it]


Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Wingspan,Weight,Draft Age,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
0,Dwight Howard,2004,ORL,1,SACA,C,82.25,88.5,240,18.52,...,,,,,,,,,,
1,Emeka Okafor,2004,CHA,2,UConn,C,82.0,88.0,257,21.71,...,25.1,0.24,0.43,0.0,0.127,0.213,0.34,,,
2,Ben Gordon,2004,CHI,3,UConn,SG,74.25,80.5,192,21.2,...,25.8,0.91,1.63,0.0,0.116,0.092,0.208,,,
3,Shaun Livingston,2004,LAC,4,Peoria High School,PG,79.5,83.0,186,18.76,...,,,,,,,,,,
4,Devin Harris,2004,DAL,5,Wisconsin,PG,75.0,79.5,170,21.3,...,27.6,1.04,2.14,0.0,0.165,0.103,0.269,,,


In [7]:
# Preview the first rows scraped from the 2024 big board
draft_players24.head()

Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Wingspan,Weight,Draft Age,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
0,Nikola Topić,2024,,1,Red Star (Serbia),PG,78.0,,201.0,18.86,...,23.92,1.44,2.4,19.85,,,,,,
1,Zaccharie Risacher,2024,,2,JL Bourg (France),SF,81.0,,204.0,19.2,...,22.41,0.31,0.58,14.83,,,,,,
2,Alexandre Sarr,2024,,3,Perth (NBL),PF/C,85.0,88.25,224.0,19.15,...,21.9,0.43,1.0,20.96,,,,,,
3,Reed Sheppard,2024,,4,Kentucky,PG/SG,75.0,75.25,182.0,19.99,...,18.0,1.34,2.24,23.6,0.126,0.059,0.188,6.4,5.0,11.4
4,Donovan Clingan,2024,,5,UConn,C,87.0,90.75,282.0,20.32,...,25.1,0.54,1.89,34.8,0.193,0.112,0.305,8.8,6.2,15.0


In [8]:
# (rows, columns) of the past-drafts frame
draft_players.shape

(1195, 44)

In [9]:
# (rows, columns) of the 2024 big-board frame
draft_players24.shape

(140, 44)

In [10]:
# Summary statistics of the parsed draft ages (the recorded minimum of 9.5 is implausible)
draft_players['Draft Age'].describe()

count    1195.000000
mean       21.077941
std         1.499129
min         9.500000
25%        19.880000
50%        21.150000
75%        22.205000
max        27.360000
Name: Draft Age, dtype: float64

In [13]:
# Show the rows whose parsed draft age is below 17
draft_players[draft_players['Draft Age'] < 17]

Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Wingspan,Weight,Draft Age,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
263,Nicolas Batum,2008,POR,25,Le Mans (France),SF,80.0,,200,9.5,...,,,1.46,,,,,,,
403,Paulao Prestes,2010,MIN,45,CB Murcia (Spain),C,83.0,,260,12.33,...,,,0.34,,,,,,,


In [14]:
def fixAge(age, threshold = 16, offset = 10):
    """
    Shift an implausibly low draft age up by a fixed number of years

    age: parsed draft age in years
    threshold: ages strictly below this value are treated as wrong
    offset: number of years added to such ages

    Returns age + offset when age < threshold, otherwise age unchanged
    (NaN compares False and is therefore returned as-is).
    """
    if age < threshold:
        return offset + age
    return age

# Apply the age correction to both frames, overwriting their 'Draft Age' columns
draft_players['Draft Age'] = draft_players['Draft Age'].apply(fixAge)
draft_players24['Draft Age'] = draft_players24['Draft Age'].apply(fixAge)

In [16]:
# Write both frames to CSV without the index column.
# The data/ directory must already exist; to_csv does not create it.
draft_players.to_csv('data/draft_players.csv', index = False)
draft_players24.to_csv('data/draft_players24.csv', index = False)

## Only 2024

In [3]:
# Scrape only the current big board (no past drafts) and load it into a DataFrame
current_board = getDraftBoard(2024)
current_data = pd.DataFrame(current_board)

100%|███████████████████████████████████████████| 70/70 [02:49<00:00,  2.42s/it]


In [4]:
def fixAge(age, threshold = 16, offset = 10):
    """
    Shift an implausibly low draft age up by a fixed number of years

    age: parsed draft age in years
    threshold: ages strictly below this value are treated as wrong
    offset: number of years added to such ages

    Returns age + offset when age < threshold, otherwise age unchanged
    (NaN compares False and is therefore returned as-is).
    """
    if age < threshold:
        return offset + age
    return age

# Apply the age correction to the 2024 board, overwriting its 'Draft Age' column
current_data['Draft Age'] = current_data['Draft Age'].apply(fixAge)

In [5]:
# Preview the first rows of the 2024 board
current_data.head()

Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Wingspan,Weight,Draft Age,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
0,Nikola Topić,2024,,1,Red Star (Serbia),PG,78.0,,201.0,18.86,...,23.92,1.44,2.4,19.85,,,,,,
1,Zaccharie Risacher,2024,,2,JL Bourg (France),SF,81.0,,204.0,19.2,...,22.4,0.31,0.57,15.19,,,,,,
2,Alexandre Sarr,2024,,3,Perth (NBL),PF/C,85.0,88.25,224.0,19.15,...,21.9,0.43,1.0,20.96,,,,,,
3,Reed Sheppard,2024,,4,Kentucky,PG/SG,75.0,75.25,182.0,19.99,...,18.0,1.34,2.24,23.6,0.126,0.059,0.188,6.4,5.0,11.4
4,Donovan Clingan,2024,,5,UConn,C,87.0,90.75,282.0,20.32,...,25.1,0.54,1.89,34.8,0.193,0.112,0.305,8.8,6.2,15.0


In [6]:
# (rows, columns) of the 2024 board before withdrawn players are removed
current_data.shape

(70, 44)

In [11]:
# Players to drop from the 2024 board
withdrew = ["Hunter Sallis", "Payton Sandfort", "Trevon Brazile", "Ryan Kalkbrenner", "Baba Miller"]
# .copy() makes the filtered frame independent of the original, so the column
# assignment below no longer triggers pandas' SettingWithCopyWarning
current_data = current_data[~current_data["Name"].isin(withdrew)].copy()
# Renumber the remaining players 1..N in their current order
current_data['Pick'] = list(range(1, len(current_data) + 1))
current_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_data['Pick'] = list(range(1, len(current_data) + 1))


Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Wingspan,Weight,Draft Age,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
0,Nikola Topić,2024,,1,Red Star (Serbia),PG,78.00,,201.0,18.86,...,23.92,1.44,2.40,19.85,,,,,,
1,Zaccharie Risacher,2024,,2,JL Bourg (France),SF,81.00,,204.0,19.20,...,22.40,0.31,0.57,15.19,,,,,,
2,Alexandre Sarr,2024,,3,Perth (NBL),PF/C,85.00,88.25,224.0,19.15,...,21.90,0.43,1.00,20.96,,,,,,
3,Reed Sheppard,2024,,4,Kentucky,PG/SG,75.00,75.25,182.0,19.99,...,18.00,1.34,2.24,23.60,0.126,0.059,0.188,6.4,5.0,11.4
4,Donovan Clingan,2024,,5,UConn,C,87.00,90.75,282.0,20.32,...,25.10,0.54,1.89,34.80,0.193,0.112,0.305,8.8,6.2,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Trentyn Flowers,2024,,61,Adelaide (NBL),SG,79.75,80.25,201.0,19.28,...,19.87,0.35,0.42,8.36,,,,,,
65,Reece Beekman,2024,,62,Virginia,PG,74.50,78.50,196.0,22.70,...,27.70,1.61,3.03,23.40,0.090,0.090,0.176,5.0,5.4,10.5
66,Judah Mintz,2024,,63,Syracuse,PG,76.00,75.50,176.0,20.95,...,29.70,0.86,1.51,20.50,0.081,0.048,0.129,2.7,1.5,4.2
67,Jaylen Wells,2024,,64,Washington State,SF,79.75,79.25,206.0,20.82,...,20.10,0.39,1.83,18.60,0.125,0.056,0.181,5.7,1.6,7.3


In [12]:
# Write the filtered 2024 board; this is the same path the "All" section writes draft_players24 to, so that file is overwritten
current_data.to_csv('data/draft_players24.csv', index = False)

# Testing

In [317]:
# All 'mock-row' elements on the big board page (not restricted to the 'big-board' div as getDraftBoard does)
board_rows = getDraftRows(getSoupFromURL("https://www.tankathon.com/big_board"))

In [318]:
# Fetch the player page linked from the fourth board row
ref1 = getReference(board_rows[3])
ps1 = getSoupFromURL(ref1)

In [319]:
# Raw header and stat sections of that page, before the filtering done inside getPlayerStats
h1 = ps1.find_all('div', attrs = {'class' : 'stats-header'})
val1 = ps1.find_all('div', attrs = {'class' : 'stats'})

In [341]:
# Full big-board scrape (one request per player, so this takes minutes)
board = getDraftBoard(2024)

100%|███████████████████████████████████████████| 74/74 [02:58<00:00,  2.42s/it]


In [344]:
# Load the board into a DataFrame (the name df is reassigned by the next code cell)
df = pd.DataFrame(board)

In [46]:
# Scrape the full draft history into a DataFrame.
# NOTE(review): the recorded output below lists only 2021-2023, so it appears to come from a
# different call than the one shown here -- re-run before relying on it.
d = getAllDraftHistory(2004, 2023)
df = pd.DataFrame(d)

Retreiving Data for 2021


100%|███████████████████████████████████████████| 60/60 [03:21<00:00,  3.36s/it]


Retreiving Data for 2022


100%|███████████████████████████████████████████| 60/60 [03:11<00:00,  3.20s/it]


Retreiving Data for 2023


100%|███████████████████████████████████████████| 60/60 [03:18<00:00,  3.30s/it]


In [46]:
def getToggleOptions(df):
    """
    Split the columns of df into identifier columns and numeric "toggle" columns

    A column is a toggle when its dtype is int64 or float64 and it is not
    'Year' or 'id'; every other column is an identifier.
    Returns (identifiers, toggles), each in the frame's column order.
    """
    not_toggles = ['Year', 'id']
    identifiers, toggles = [], []
    for col in df.columns:
        dtype = df[col].dtype
        is_numeric = dtype == np.int64 or dtype == np.float64
        target = toggles if (is_numeric and col not in not_toggles) else identifiers
        target.append(col)
    return identifiers, toggles

def filterData(df, toggle_list):
    """
    Return a copy of df keeping only the rows that have a non-null value in every column of toggle_list
    """
    kept = df.copy()
    for column in toggle_list:
        has_value = kept[column].notnull()
        kept = kept[has_value]
    return kept

def standardize(df, toggle_list):
    """
    Z-score each column of toggle_list (subtract its mean, divide by its standard deviation)

    df is modified in place and also returned.
    """
    for col in toggle_list:
        column = df[col]
        centered = column - column.mean()
        df[col] = centered / column.std()
    return df

def getVectorFromRow(row, toggle_list):
    """
    Return the values of the toggle_list entries of a row as an array, in toggle_list order
    """
    selected = row[toggle_list]
    return selected.values

def dotProduct(ar1, ar2):
    """
    Similarity measure: the dot product of the two vectors
    """
    product = np.dot(ar1, ar2)
    return product

def cosineSimilarity(ar1, ar2):
    """
    Similarity measure: the dot product of the two vectors divided by the product of their norms
    """
    numerator = np.dot(ar1, ar2)
    denominator = np.linalg.norm(ar1) * np.linalg.norm(ar2)
    return numerator / denominator

def invEuclideanDistance(ar1, ar2):
    """
    Similarity measure: 1 / (1 + Euclidean distance), so identical vectors score 1
    """
    distance = np.linalg.norm(ar1 - ar2)
    return 1 / (1 + distance)

def calculateSimilarity(df, toggle_list, target_id, sim_function):
    """
    Score every row of df against the row labelled target_id

    Each row's toggle_list values are compared with the target row's values using
    sim_function(target_vector, row_vector); the scores are stored in a new
    'similarity' column. df is modified in place and also returned.
    Raises ValueError when target_id is not an index label of df.
    """
    if target_id not in df.index:
        raise ValueError("Target ID not valid for these stats")
    target_vector = getVectorFromRow(df.loc[target_id], toggle_list)

    def _score(row):
        return sim_function(target_vector, getVectorFromRow(row, toggle_list))

    df['similarity'] = df.apply(_score, axis = 1)
    return df

In [48]:
# a = identifier columns, b = numeric toggle columns.
# NOTE(review): all_data_df is not defined in any cell visible in this notebook -- confirm where it is loaded.
a, b = getToggleOptions(all_data_df)

In [133]:
# First ten 2024 rows of the combined frame (their index labels are what calculateSimilarity takes as target_id)
all_data_df[all_data_df['Year'] == 2024].head(10)

Unnamed: 0,Name,Year,Draft Team,Pick,Team,Position,Height,Weight,Draft Age,Birthdate,...,USG%,AST/USG,AST/TO,PER,OWS/40,DWS/40,WS/40,OBPM,DBPM,BPM
1135,Nikola Topić,2024,,1,Red Star (Serbia),PG,78.0,201.0,18.86,"Aug 10, 2005",...,24.45,1.42,2.27,19.84,,,,,,
1136,Alexandre Sarr,2024,,2,Perth (NBL),PF/C,85.0,217.0,19.15,"Apr 26, 2005",...,22.2,0.43,0.97,20.98,,,,,,
1137,Zaccharie Risacher,2024,,3,JL Bourg (France),SF,81.0,204.0,19.2,"Apr 8, 2005",...,21.99,0.36,0.69,16.83,,,,,,
1138,Reed Sheppard,2024,,4,Kentucky,PG/SG,75.0,187.0,19.99,"Jun 24, 2004",...,17.8,1.26,2.24,23.5,0.12,0.073,0.193,5.8,6.0,11.9
1139,Cody Williams,2024,,5,Colorado,SF,80.0,185.0,19.58,"Nov 20, 2004",...,20.6,0.54,0.82,17.5,0.087,0.051,0.138,3.6,1.6,5.2
1140,Rob Dillingham,2024,,6,Kentucky,PG,75.0,176.0,19.46,"Jan 4, 2005",...,29.7,0.99,2.02,24.0,0.139,0.046,0.185,5.8,1.3,7.2
1141,Ja'Kobe Walter,2024,,7,Baylor,SG,77.0,180.0,19.79,"Sep 4, 2004",...,23.9,0.38,1.24,19.0,0.124,0.048,0.172,5.4,1.4,6.8
1142,Ron Holland,2024,,8,G League,SF/PF,80.0,200.0,18.95,"Jul 7, 2005",...,28.05,0.58,0.9,15.79,,,,,,
1143,Matas Buzelis,2024,,9,G League,SF,83.0,195.0,19.68,"Oct 13, 2004",...,20.5,0.45,0.76,13.13,,,,,,
1144,Stephon Castle,2024,,10,UConn,PG/SG,78.0,190.0,19.63,"Nov 1, 2004",...,21.7,0.88,1.94,19.1,0.115,0.074,0.183,3.3,2.4,5.7


In [157]:
# Columns that make up the similarity feature space
toggles = ['Height', 'Weight', 'Draft Age', 'Pick', 'AST', 'REB', 'STL', 'BLK', 'TO', '3PM', '3PA', 
           'FGM', 'FGA', 'FTM', 'FTA', 'TS%']
# Keep only players with a value for every toggle, then z-score those columns
df2 = filterData(all_data_df, toggles)
standardize(df2, toggles)  # modifies df2 in place, so the return value is not needed
# 1177 is the index label of the target player (Justin Edwards in the recorded output)
calculateSimilarity(df2, toggles, 1177, cosineSimilarity)
# Index labels of the ten highest-scoring rows (the target itself scores 1.0)
ids = df2.sort_values('similarity', ascending = False).head(10).index
# Attach the scores to an unstandardized copy so the displayed stats are the original values
df3 = all_data_df.copy()
df3['similarity'] = df2['similarity']
df3[a + toggles + ['similarity']].loc[ids]

Unnamed: 0,Name,Year,Draft Team,Team,Position,Birthdate,Nation,Height,Weight,Draft Age,...,BLK,TO,3PM,3PA,FGM,FGA,FTM,FTA,TS%,similarity
1177,Justin Edwards,2024,,Kentucky,SF,"Dec 16, 2003",USA,79.0,180.0,20.51,...,0.4,1.5,1.5,4.1,5.5,11.3,2.0,2.6,0.578,1.0
1184,Pacome Dadiet,2024,,Ratiopharm Ulm (Germany),SG,"Jul 27, 2005",France,80.0,187.0,18.9,...,0.3,2.0,1.8,5.4,5.2,10.8,2.5,3.3,0.597,0.852756
638,Glenn Robinson III,2014,MIN,Michigan,SF,"Jan 8, 1994",,78.75,211.0,20.45,...,0.3,1.4,1.0,3.3,5.5,11.2,2.6,3.5,0.566,0.828987
1123,Mojave King,2023,IND,G League,SG,"Jul 11, 2002",New Zealand,76.75,201.0,20.94,...,0.5,1.3,1.4,4.3,4.4,11.0,1.4,1.8,0.536,0.814348
866,Jordan Poole,2019,GS,Michigan,SG,"Jun 19, 1999",USA,77.5,191.0,20.0,...,0.2,1.7,2.2,6.0,4.7,10.9,2.2,2.6,0.573,0.813874
570,Alex Abrines,2013,OKC,Barcelona (Spain),SG/SF,"Aug 1, 1993",Spain,78.0,190.0,19.88,...,0.6,1.9,2.0,6.4,4.6,11.0,2.1,2.6,0.538,0.806214
935,Vít Krejčí,2020,OKC,Zaragoza (Spain),PG,"Jun 19, 2000",Czech Rep.,80.0,195.0,20.0,...,0.4,2.2,1.3,3.7,4.9,8.8,1.4,2.5,0.62,0.777091
815,Gary Trent Jr.,2018,POR,Duke,SG,"Jan 18, 1999",,77.75,204.0,19.42,...,0.1,1.1,2.8,6.9,5.1,12.3,2.4,2.8,0.567,0.776672
833,Arnoldas Kulboka,2018,CHA,Brose Bamberg (Germany),SF,"Jan 4, 1998",Lithuania,81.0,200.0,20.46,...,0.2,1.9,2.2,6.1,3.6,9.8,2.1,2.6,0.522,0.762586
611,Zach LaVine,2014,MIN,UCLA,SG,"Mar 10, 1995",,77.75,181.0,19.28,...,0.2,1.7,1.9,5.1,5.1,11.5,1.9,2.7,0.545,0.761752


# Scratch Work

In [3]:
# Fetch the 2023 draft page and count its player rows (test is first the soup, then the list of rows)
test = getSoupFromURL(getDraftURL(2023))
test = getDraftRows(test)
len(test)

60

In [9]:
# For every raw stat label seen on the 2023 player pages, count how many players have an empty value for it
all_keys = {}
for row in tqdm(test):
    ref = getReference(row)
    if ref is None:
        continue
    # pause between player-page requests
    time.sleep(2)
    player_soup = getSoupFromURL(ref)
    stats = getPlayerStats(player_soup)
    for stat, value in stats.items():
        # register the label even when it is never empty, so it appears with a count of 0
        if stat not in all_keys:
            all_keys[stat] = 0
        if value == '':
            all_keys[stat] += 1
all_keys

100%|███████████████████████████████████████████| 60/60 [03:14<00:00,  3.24s/it]


{'MP': 0,
 'G': 0,
 'FGM-FGA': 0,
 'FG%': 0,
 '3PM-3PA': 0,
 '3P%': 2,
 'FTM-FTA': 0,
 'FT%': 0,
 'REB': 0,
 'AST': 0,
 'BLK': 0,
 'STL': 0,
 'TO': 0,
 'PF': 0,
 'PTS': 0,
 'True Shooting %TS%': 0,
 'Effective FG%EFG%': 0,
 '3PA Rate3PAR': 0,
 'FTA RateFTAR': 0,
 'Proj NBA 3P%NBA 3P%': 0,
 'USG%': 0,
 'AST/USG': 0,
 'AST/TO': 0,
 'PER': 0,
 'OWS/40': 10,
 'DWS/40': 10,
 'WS/40': 10,
 'ORTG': 46,
 'DRTG': 46,
 'OBPM': 10,
 'DBPM': 10,
 'BPM': 10}

In [21]:
# Run the full per-player pipeline on one 2023 row (test_player is first the URL, then the soup).
# NOTE(review): the recorded output below has no 'Name' key and shows unstripped values, so it
# looks like it was produced by an earlier version of the functions -- re-run to refresh.
test_player = getReference(test[42])
test_player = getSoupFromURL(test_player)
getPlayerAll(test_player)

{'Team': 'NZ Breakers (NBL)',
 'Year': 'International',
 'Position': 'SG',
 'Height': 79.0,
 'Weight': 193,
 'Draft Age': 19.05,
 'Birthdate': 'May 31, 2004',
 'Nation': ' France',
 'Draft Team': ' POR',
 'MP': 18.1,
 'G': 31.0,
 'FGM': 4.4,
 'FGA': 12.0,
 'FG%': 0.369,
 '3PM': 1.6,
 '3PA': 5.1,
 '3P%': 0.312,
 'FTM': 3.1,
 'FTA': 4.2,
 'FT%': 0.738,
 'REB': 4.8,
 'AST': 1.6,
 'BLK': 0.3,
 'STL': 1.5,
 'TO': 2.2,
 'PF': 3.9,
 'PTS': 13.6,
 'TS%': 0.484,
 'eFG%': 0.436,
 '3PAr': 0.428,
 'FTAr': 0.348,
 'USG%': 20.15,
 'AST/USG': 0.36,
 'AST/TO': 0.74,
 'PER': 9.96,
 'OWS/40': nan,
 'DWS/40': nan,
 'WS/40': nan,
 'OBPM': nan,
 'DBPM': nan,
 'BPM': nan}

In [19]:
# Same player, stats only: raw stats first, then the cleaned version.
# Note this rebinds the name a, which the Testing section uses for the identifier columns.
test_player = getReference(test[42])
test_player = getSoupFromURL(test_player)
a = getPlayerStats(test_player)
cleanPlayerStats(a)

{'MP': 18.1,
 'G': 31.0,
 'FGM': 4.4,
 'FGA': 12.0,
 'FG%': 0.369,
 '3PM': 1.6,
 '3PA': 5.1,
 '3P%': 0.312,
 'FTM': 3.1,
 'FTA': 4.2,
 'FT%': 0.738,
 'REB': 4.8,
 'AST': 1.6,
 'BLK': 0.3,
 'STL': 1.5,
 'TO': 2.2,
 'PF': 3.9,
 'PTS': 13.6,
 'TS%': 0.484,
 'eFG%': 0.436,
 '3PAr': 0.428,
 'FTAr': 0.348,
 'USG%': 20.15,
 'AST/USG': 0.36,
 'AST/TO': 0.74,
 'PER': 9.96,
 'OWS/40': nan,
 'DWS/40': nan,
 'WS/40': nan,
 'OBPM': nan,
 'DBPM': nan,
 'BPM': nan}