# Web Scraping

In [ ]:
import pandas as pd
import os
from io import StringIO

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time

In [15]:
# Remove tuples without hrefs (with None instead in all rows)

def remove_tuples(df):
    """Unwrap columns whose cells are all ``(text, href)`` tuples with no href.

    A column is rewritten only when every cell is a tuple and the second
    element of every one of those tuples is ``None``; in that case each cell
    is replaced by the tuple's first element. Every other column is left
    untouched. The input frame is not modified; a copy is returned.
    """
    result = df.copy()
    for column in result.columns:
        cells = list(result[column])
        # Skip columns that hold anything other than tuples.
        if not all(isinstance(cell, tuple) for cell in cells):
            continue
        # Skip columns where at least one cell carries a real link.
        if any(cell[1] is not None for cell in cells):
            continue
        result[column] = [cell[0] for cell in cells]
    return result

In [17]:
# Convert numeric strings into actual numbers (float). N.B.: Wt can be empty

def _looks_non_numeric(value):
    """Return True when a cell rules its column out of float conversion."""
    if isinstance(value, tuple):
        # Tuples are (text, href) cells; they are never plain numbers.
        return True
    if not isinstance(value, str):
        # Already a number (or NaN): nothing textual to inspect.
        return False
    # Letters or a dash (e.g. a '6-10' height) mark the cell as text.
    return '-' in value or any(ch.isalpha() for ch in value)


def int_numbers(df):
    """Return a copy of ``df`` with numeric-looking columns converted to float.

    A column is converted only when none of its cells is a tuple and none of
    its string cells contains a letter or a ``'-'``. If the conversion fails
    with ``ValueError`` (the case known from the data is an empty ``Wt``
    cell), the empty strings of *that* column are replaced by ``'0'`` and the
    conversion is retried. A failure on the retry propagates to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the convertible columns stored as float.
    """
    n_df = df.copy()
    for attr in n_df.columns:
        if any(_looks_non_numeric(d) for d in n_df[attr]):
            continue
        try:
            n_df[attr] = n_df[attr].astype(float)
        except ValueError:
            # Patch the column that actually failed rather than assuming it
            # is always 'Wt' (which may not even exist in the table).
            print(f"Empty values in column {attr!r} are converted to 0")
            n_df.loc[n_df[attr] == '', attr] = '0'
            n_df[attr] = n_df[attr].astype(float)
    return n_df


In [18]:
# Convert numeric strings into actual numbers (float). N.B.: Wt can be empty

def int_numbers1(df):  # only non-tuple cells (they have no href) may be numbers
    """Return a copy of ``df`` with every tuple-free column converted to float.

    Unlike ``int_numbers`` this variant does no text screening: any column
    without tuple cells is passed to ``astype(float)``, so a column holding
    non-numeric text raises ``ValueError``. Each converted column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the tuple-free columns stored as float.
    """
    n_df = df.copy()
    for attr in n_df.columns:
        if any(isinstance(s, tuple) for s in n_df[attr]):
            continue
        # Write the converted column back into the frame; binding it to a
        # loop-local name would discard the conversion.
        n_df[attr] = n_df[attr].astype(float)
        print(n_df[attr])
    return n_df

In [19]:
# Transformation table

def zoom_table(df, r):
    """Reduce a player's statistics table to its 'Career' row plus player info.

    Parameters
    ----------
    df : pandas.DataFrame
        Statistics table with at least the columns 'Season', 'Tm' and 'Lg'.
    r : pandas.Series
        Row of the player index. ``r['Player']`` is a ``(name, href)`` tuple
        and ``r['From']`` / ``r['To']`` are numeric.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'Season' is 'Career', without the 'Season',
        'Tm' and 'Lg' columns, and with the added columns 'From', 'To',
        'Hof' and 'Player'. ``df`` itself is not modified.
    """
    # Boolean-mask selection and drop both return new objects, so the
    # caller's frame is never touched.
    n_df = df.loc[df['Season'] == 'Career'].drop(columns=['Season', 'Tm', 'Lg'])

    name = r['Player'][0]
    # int() accepts NumPy scalars as well as plain Python numbers.
    n_df['From'] = int(r['From'])
    n_df['To'] = int(r['To'])
    # A trailing '*' on the name is the Hall-of-Fame marker; test the name
    # itself, not each element of the (name, href) tuple.
    n_df['Hof'] = name.endswith('*')
    n_df['Player'] = name.replace('*', '')

    return n_df
    

In [20]:
def create_none_advanced_dataframe(row1):
    """Build a one-row placeholder for a player without an 'advanced' table.

    Every statistic is ``None``; only the identifying columns are filled from
    ``row1``, following the same conventions as ``zoom_table``: 'From' and
    'To' as integers, 'Player' as the name without the '*' marker, and 'Hof'
    derived from that marker.

    Parameters
    ----------
    row1 : pandas.Series
        Row of the player index. ``row1['Player']`` is either a
        ``(name, href)`` tuple or the plain name.

    Returns
    -------
    pandas.DataFrame
        A single-row frame.
    """
    player = row1['Player']
    # Passing the (name, href) tuple straight to DataFrame would be read as a
    # two-element column and yield two rows; keep only the name.
    name = player[0] if isinstance(player, tuple) else player

    record = {
        'Age': None, 'Pos': None, 'G': None, 'MP': None, 'PER': None, 'TS%': None, '3PAr': None, 'FTr': None,
        'ORB%': None, 'DRB%': None, 'TRB%': None, 'AST%': None, 'STL%': None, 'BLK%': None, 'TOV%': None, 'USG%': None,
        'Unnamed: 19': None, 'OWS': None, 'DWS': None, 'WS': None, 'WS/48': None, 'Boh2': None, 'OBPM': None,
        'DBPM': None, 'BPM': None, 'VORP': None, 'From': int(row1['From']), 'To': int(row1['To']),
        'Hof': name.endswith('*'), 'Player': name.replace('*', ''), 'Unnamed: 16': None, 'Unnamed: 21': None,
        'Unnamed: 18': None, 'Unnamed: 23': None,
    }
    # A list holding one dict gives exactly one row, even though every value
    # is a scalar.
    return pd.DataFrame([record])

In [21]:
# Collect the per-100-possessions statistics and the advanced statistics of each player

def _read_player_table(driver1, table_id):
    """Parse the ``<table>`` with the given id on the driver's current page.

    Raises ``NoSuchElementException`` when the page has no such table.
    """
    element = driver1.find_element(By.XPATH, f'//table[@id="{table_id}"]')
    return pd.read_html(StringIO(element.get_attribute('outerHTML')))[0]


def per_100_and_advanced_statistics(absolute_path, p_table, driver1, delay=7):
    """Scrape the career per-100-possessions and advanced rows of each player.

    Parameters
    ----------
    absolute_path : str
        Site root that each player's href is appended to.
    p_table : pandas.DataFrame
        Player index; every row needs 'Player' as a ``(name, href)`` tuple
        plus the 'From' and 'To' columns used by ``zoom_table``.
    driver1 : selenium WebDriver
        Browser session used to load every player page.
    delay : float, optional
        Seconds to wait after each player page (default 7) so that requests
        stay throttled.

    Returns
    -------
    tuple of (list, list)
        ``(per_100_frames, advanced_frames)``, one DataFrame per player in
        each list, in the order of ``p_table``.

    Raises
    ------
    NoSuchElementException
        When a page has neither a 'per_poss' nor a 'per_minute' table.
    """
    df_per_100 = []
    df_ad = []

    for _, row in p_table.iterrows():  # one iteration per player
        player, href = row['Player']
        print(f"Let's go with player {player} from {row['From']}")
        driver1.get(absolute_path + href)

        ################# Per_100_poss #################
        try:
            dfp = _read_player_table(driver1, 'per_poss')
        except NoSuchElementException:
            # Fall back to the per-minute table when per-possession is absent.
            dfp = _read_player_table(driver1, 'per_minute')  # or "per_game"
        df_per_100.append(zoom_table(dfp, row))

        ################# Advanced #################
        try:
            dfa = zoom_table(_read_player_table(driver1, 'advanced'), row)
        except NoSuchElementException:
            # No advanced table on the page: keep a placeholder row instead.
            dfa = create_none_advanced_dataframe(row)
        df_ad.append(dfa)

        time.sleep(delay)  # throttle the requests, otherwise the site blocks us
    return df_per_100, df_ad

In [None]:
# One letter at time

# Scrape the players of a single index letter and export the two resulting
# datasets (per 100 possessions and advanced) as JSON files named after it.
dir_path = ".\\Datasets"
alphabet = 'abcdefghijklmnopqrstuvwxyz'
bf_players_path = "https://www.basketball-reference.com/players/"
bf_absolute_path = "https://www.basketball-reference.com"
list_p_100_p = []
list_a = []
driver = webdriver.Firefox()
letter = None

# try/finally guarantees the browser session is shut down even when a page
# fails to load or parse halfway through the scrape.
try:
    for letter in alphabet[25:26]:  # z
        letter_path = bf_players_path + letter
        driver.get(letter_path)
        players_character_table = pd.read_html(letter_path, extract_links='body')[0]

        players_l_table = remove_tuples(players_character_table)
        players_l_table = int_numbers(players_l_table)
        size_b = players_l_table.shape[0]
        # Keep only the players whose first season lies in [1973, 2021).
        players_l_table = players_l_table.loc[
            (players_l_table['From'] >= 1973) & (players_l_table['From'] < 2021)].reset_index(drop=True)
        size_a = players_l_table.shape[0]
        print("%d players got eliminated: " % (size_b - size_a))
        df_p_100_p, df_a = per_100_and_advanced_statistics(bf_absolute_path, players_l_table, driver)
        list_p_100_p.extend(df_p_100_p)  # one frame per player of this letter
        list_a.extend(df_a)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    driver.quit()
print(letter)

################# Per_100_poss #################
dataset_per_100_poss = pd.concat(list_p_100_p, ignore_index=True)
# dataset_per_100_poss.to_csv(dir_path + "\\Per_100_poss_csv\\" + letter + "_p_100_p.csv")
# dataset_per_100_poss.to_excel(dir_path + "\\Per_100_poss_excel\\" + letter + "_p_100_p.xlsx")
dataset_per_100_poss.to_json(dir_path + "\\Per_100_poss_json\\" + letter + "_p_100_p.json", orient='records', indent=3)

################# Advanced #################
dataset_advanced = pd.concat(list_a, ignore_index=True)
# dataset_advanced.to_csv(dir_path + "\\Advanced_csv\\" + letter + "_advanced.csv")
# dataset_advanced.to_excel(dir_path + "\\Advanced_excel\\" + letter + "_advanced.xlsx")
dataset_advanced.to_json(dir_path + "\\Advanced_json\\" + letter + "_advanced.json", orient='records', indent=3)

In [ ]:
# Join single letter files

# Concatenate the per-letter JSON files of each dataset into one JSON file.
dir_path = ".\\Datasets"

################# Per_100_poss #################
# The per-letter files are written below dir_path, so the folder must be
# resolved relative to it (as the Advanced section already does).
folder_path_per_100_poss = dir_path + "\\Per_100_poss_json"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined file reproducible between runs.
per_100_poss_list = [
    pd.read_json(os.path.join(folder_path_per_100_poss, filename), orient='records')
    for filename in sorted(os.listdir(folder_path_per_100_poss))
]
df_all_per_100_poss_json = pd.concat(per_100_poss_list, ignore_index=True)
# df_all_per_100_poss_json.to_csv(dir_path + "\\training_per_100_poss_players.csv")
# df_all_per_100_poss_json.to_excel(dir_path + "\\training_per_100_poss_players.xlsx")
df_all_per_100_poss_json.to_json(dir_path + "\\training_per_100_poss_players.json", orient='records', indent=3)

################# Advanced #################
folder_path_advanced = dir_path + "\\Advanced_json"
advanced_list = [
    pd.read_json(os.path.join(folder_path_advanced, filename), orient='records')
    for filename in sorted(os.listdir(folder_path_advanced))
]
df_all_advanced_json = pd.concat(advanced_list, ignore_index=True)
# df_all_advanced_json.to_csv(dir_path + "\\training_advanced_players.csv")
# df_all_advanced_json.to_excel(dir_path + "\\training_advanced_players.xlsx")
df_all_advanced_json.to_json(dir_path + "\\training_advanced_players.json", orient='records', indent=3)

In [None]:
# All together

from selenium import webdriver
import pandas as pd

# Scrape the players of every index letter (a-z) and export the two combined
# datasets (per 100 possessions and advanced) as the training JSON files.
dir_path = ".\\Datasets"
alphabet = 'abcdefghijklmnopqrstuvwxyz'
bf_players_path = "https://www.basketball-reference.com/players/"
bf_absolute_path = "https://www.basketball-reference.com"
list_p_100_p = []
list_a = []
driver = webdriver.Firefox()
letter = None

# try/finally guarantees the browser session is shut down even when a page
# fails to load or parse halfway through the scrape.
try:
    for letter in alphabet:  # a-z
        letter_path = bf_players_path + letter
        driver.get(letter_path)
        players_character_table = pd.read_html(letter_path, extract_links='body')[0]

        players_l_table = remove_tuples(players_character_table)
        players_l_table = int_numbers(players_l_table)
        size_b = players_l_table.shape[0]
        # Keep only the players whose first season lies in [1973, 2021).
        players_l_table = players_l_table.loc[
            (players_l_table['From'] >= 1973) & (players_l_table['From'] < 2021)].reset_index(drop=True)
        size_a = players_l_table.shape[0]
        print("%d players got eliminated: " % (size_b - size_a))
        df_p_100_p, df_a = per_100_and_advanced_statistics(bf_absolute_path, players_l_table, driver)
        list_p_100_p.extend(df_p_100_p)  # one frame per player of this letter
        list_a.extend(df_a)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    driver.quit()

################# Per_100_poss #################
dataset_per_100_poss = pd.concat(list_p_100_p, ignore_index=True)
# dataset_per_100_poss.to_csv(dir_path + "\\training_per_100_poss_players.csv")
# dataset_per_100_poss.to_excel(dir_path + "\\training_per_100_poss_players.xlsx")
dataset_per_100_poss.to_json(dir_path + "\\training_per_100_poss_players.json", orient='records', indent=3)

################# Advanced #################
dataset_advanced = pd.concat(list_a, ignore_index=True)
# dataset_advanced.to_csv(dir_path + "\\training_advanced_players.csv")
# dataset_advanced.to_excel(dir_path + "\\training_advanced_players.xlsx")
dataset_advanced.to_json(dir_path + "\\training_advanced_players.json", orient='records', indent=3)

# Training and test set

In [ ]:
# Difference between sets (df1 - df2) 

def df_difference1(df1, df2, on='Player'):
    """Split ``df1`` by whether its key also appears in ``df2`` (left merge).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain the key column.
    on : str, optional
        Name of the key column. Defaults to 'Player', the key used by the
        datasets in this notebook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(only_in_df1, in_both)``. Both come from the left merge of ``df1``
        with ``df2``, so they also carry the columns of ``df2`` (all missing
        in the first frame, filled in the second).
    """
    merged = pd.merge(df1, df2, on=on, how='left', indicator=True)
    # The indicator column tells whether a row found a partner in df2.
    unmatched = merged['_merge'] == 'left_only'
    df_filtered = merged[unmatched].drop(columns='_merge')
    df_not_filtered = merged[~unmatched].drop(columns='_merge')

    return df_filtered, df_not_filtered

In [ ]:
# Difference between sets (df1 - df2) 

def df_difference(df1, df2):
    """Split the rows of ``df1`` by their 'Player' value.

    Returns a pair of frames taken from ``df1``: first the rows whose
    'Player' does not occur in ``df2['Player']``, then the rows whose
    'Player' does occur there.
    """
    players_in_df2 = set(df2['Player'])
    players_only_in_df1 = set(df1['Player']).difference(players_in_df2)

    only_mask = df1['Player'].isin(list(players_only_in_df1))
    shared_mask = df1['Player'].isin(list(players_in_df2))
    return df1[only_mask], df1[shared_mask]

In [ ]:
def making_test_datasets(path_file, test_path, hof_cand_pl):
    """Move the Hall-of-Fame candidates out of a training file into a test file.

    The players of ``path_file`` that also appear in ``hof_cand_pl`` are
    removed from it and written to ``test_path`` together with the candidate
    columns; their 'Hof' label is recomputed as ``HoF Prob > 0.5``.

    Parameters
    ----------
    path_file : str
        JSON file (``orient='records'``) holding the full player dataset. It
        is overwritten with the players that are not candidates.
    test_path : str
        Destination JSON file for the candidate players.
    hof_cand_pl : pandas.DataFrame
        Candidate table with at least the columns 'Player' and 'HoF Prob'.
    """
    pl_df = pd.read_json(path_file, orient='records')
    is_candidate = pl_df['Player'].isin(hof_cand_pl['Player'])

    # Training set: everybody who is not a Hall-of-Fame candidate.
    pl_df[~is_candidate].to_json(path_file, orient='records', indent=3)

    # Test set: the candidates' statistics enriched with the candidate table.
    # Merging the candidate rows (not the remaining training rows) is what
    # keeps 'HoF Prob' populated.
    hof_cand_players_df = pd.merge(pl_df[is_candidate], hof_cand_pl, how='left', on='Player')
    hof_cand_players_df['Hof'] = hof_cand_players_df['HoF Prob'].astype(float) > 0.5
    hof_cand_players_df.to_json(test_path, orient='records', indent=3)

In [None]:
# Move the HoF-candidate players out of the training JSON files and into separate JSON files that form the test set.
# Candidates (in both the advanced and the per-100-possessions datasets) with HoF prob. > 0.5 are labelled 'effectively' HoF

# Download the Hall-of-Fame probability table and use it to split each
# training file into a training part and a test part.
dir_path = ".\\Datasets"

hof_test_players_path = "https://www.basketball-reference.com/leaders/hof_prob.html"
# The second table parsed from the page is used; its 'Rank' column is dropped.
hof_candidate_players = pd.read_html(hof_test_players_path)[1].drop(columns=['Rank'])

all_per_100_poss_file_path = dir_path + "\\training_per_100_poss_players.json"
hof_per_100_poss_file_path = dir_path + "\\test_per_100_poss_players.json"
all_advanced_file_path = dir_path + "\\training_advanced_players.json"
hof_advanced_file_path = dir_path + "\\test_advanced_players.json"

# Per-100-possessions first, then advanced.
for training_file, test_file in (
    (all_per_100_poss_file_path, hof_per_100_poss_file_path),
    (all_advanced_file_path, hof_advanced_file_path),
):
    making_test_datasets(training_file, test_file, hof_candidate_players)