### Before scraping Sports Reference, read its bot-traffic policy: https://www.sports-reference.com/bot-traffic.html

In [130]:
import csv
import io
import os
import random
import time
import warnings

from bs4 import BeautifulSoup, Comment
import html5lib
import pandas as pd
import requests

In [132]:
# Install a global "ignore" filter so no warnings are shown by the cells below.
warnings.filterwarnings("ignore")

In [3]:
# Lowercase letters 'a' through 'z', one per player index page.
letters = list(map(chr, range(ord('a'), ord('z') + 1)))

In [4]:
def save_html(url, folder_path, file_name):
    """Download ``url`` and store the response body as ``folder_path/file_name``.

    Sleeps 5 seconds before every request to throttle traffic to the site.
    On a non-200 response nothing is written and the status code is printed.

    Args:
        url: Address of the page to fetch.
        folder_path: Directory to save into; created if it does not exist.
        file_name: Name of the file to write inside ``folder_path``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within the timeout.
    """
    time.sleep(5)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to retrieve HTML. Status code:", response.status_code)
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    with open(os.path.join(folder_path, file_name), 'w', encoding='utf-8') as f:
        f.write(response.text)

In [42]:
# Download the player index page of every letter into one shared folder.
save_folder_path = "data/player_names"
for letter in letters:
    file_name = f"{letter}_names.html"
    url = f"https://www.basketball-reference.com/players/{letter}/"
    save_html(url=url, folder_path=save_folder_path, file_name=file_name)
    

In [5]:
def parse_html_file(file_path):
    """Collect the first link of every row in a saved page's "players" table.

    Reads the HTML file at ``file_path`` and looks for ``<table id="players">``.
    Returns a list holding the ``href`` of the first ``<a>`` found in each
    ``<tr>``; rows without such an anchor are skipped. When the table is
    absent, a notice is printed and an empty list is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    table = BeautifulSoup(markup, 'html.parser').find('table', id='players')

    # Guard clause: nothing to extract when the page has no players table.
    if not table:
        print("No table with id='players' found in:", file_path)
        return []

    first_anchors = (row.find('a') for row in table.find_all('tr'))
    return [a['href'] for a in first_anchors if a and a.has_attr('href')]

In [44]:
# Extract the player-page hrefs from each saved index page and store them,
# one per line, in data/player_href/<letter>_names_href.txt.
# The output folder is created up front: open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs("data/player_href", exist_ok=True)

for letter in letters:
    file_path = f"data/player_names/{letter}_names.html"
    href_values = parse_html_file(file_path)

    href_file_path = os.path.join("data/player_href", f"{letter}_names_href.txt")
    with open(href_file_path, 'w', encoding='utf-8') as href_file:
        href_file.write('\n'.join(href_values))

In [6]:
def check_file_exists(file_path):
    """Report whether ``file_path`` refers to an existing filesystem entry.

    Thin wrapper over ``os.path.exists``, so directories count as existing too.
    """
    exists = os.path.exists(file_path)
    return exists

In [7]:
# Function to retrieve HTML for a single href and store it in player_html folder
def save_player_html(href, folder_path):
    """Download one player page and cache it inside ``folder_path``.

    The file is named after the last ``/``-separated segment of ``href``.
    When that file already exists the function returns without sleeping or
    making a request, so an interrupted crawl can be resumed cheaply.
    Otherwise it sleeps 3.1 seconds, prints the URL and fetches it; on a
    non-200 response nothing is written and the status code is printed.

    Args:
        href: Site-relative link to the player page; surrounding whitespace
            (such as a trailing newline) is stripped.
        folder_path: Directory to save into; created if it does not exist.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within the timeout.
    """
    href = href.strip()
    if not href:
        # A blank line in an href list names no player to fetch.
        return

    # The last path segment of the href doubles as the cache file name.
    player_id = href.split('/')[-1]
    file_path = os.path.join(folder_path, player_id)

    if check_file_exists(file_path):
        return

    time.sleep(3.1)
    url = f"https://www.basketball-reference.com{href}"
    print(url)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve HTML for player {url}. Status code:", response.status_code)
        return

    # Without this, a missing folder makes open() fail after the request
    # has already been spent.
    os.makedirs(folder_path, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)


In [8]:
# Walk every per-letter href list and fetch each player page it names.
for letter in letters:
    file_path = os.path.join("data/player_href/", f"{letter}_names_href.txt")
    with open(file_path, 'r', encoding='utf-8') as href_file:
        # Each line holds one href; save_player_html strips the newline itself.
        for href in href_file:
            save_player_html(href=href, folder_path="data/player_html")

In [123]:
def parse_player_salary_html(file_path):
    """Extract the salary table from a saved player page.

    The table with ``id="all_salaries"`` is looked up inside the page's HTML
    comments: each comment is parsed as HTML in turn until one contains it.

    Args:
        file_path: Path of the saved player HTML file.

    Returns:
        A DataFrame of the salary table without the row whose ``Season`` is
        ``'Career'``, or an empty DataFrame when no such table is found.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialise before the loop: a page without any HTML comment would
    # otherwise leave the name unbound and raise UnboundLocalError below.
    salaries_table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        data = BeautifulSoup(comment, "html.parser")
        salaries_table = data.find('table', id='all_salaries')
        if salaries_table is not None:
            break

    if salaries_table is None:
        return pd.DataFrame()

    # Wrap the markup in a file-like object; passing literal HTML strings to
    # read_html is deprecated in recent pandas versions.
    df = pd.read_html(io.StringIO(str(salaries_table)))[0]
    return df[df['Season'] != 'Career']

In [124]:
def parse_player_stats_html(file_path):
    """Extract the per-game stats table and player name from a saved page.

    Every ``<tfoot>`` element is removed before parsing so footer rows do not
    end up in the result. The player name is the text of the ``<h1>`` inside
    ``<div id="info">`` and is added to every row as a ``Name`` column.

    Args:
        file_path: Path of the saved player HTML file.

    Returns:
        A DataFrame built from ``<table id="per_game">`` plus a ``Name`` column.

    Raises:
        ValueError: If the name heading or the per-game table is missing
            from the page; the message names the offending file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    for footer in soup.find_all('tfoot'):
        footer.decompose()

    info_div = soup.find('div', id='info')
    heading = info_div.find('h1') if info_div is not None else None
    if heading is None:
        # Fail with the file name instead of an opaque AttributeError on None.
        raise ValueError(f"No player name heading found in: {file_path}")
    name = heading.get_text(strip=True)

    per_game_table = soup.find('table', id='per_game')
    if per_game_table is None:
        # Otherwise read_html would parse the string "None" and report only
        # that no tables were found, without saying which file is at fault.
        raise ValueError(f"No table with id='per_game' found in: {file_path}")

    # Wrap the markup in a file-like object; passing literal HTML strings to
    # read_html is deprecated in recent pandas versions.
    df = pd.read_html(io.StringIO(str(per_game_table)))[0]
    df['Name'] = name
    return df

In [125]:
def create_df(html):
    """Combine a player's per-game stats with their salary history.

    Args:
        html: Path of the saved player HTML file.

    Returns:
        The stats rows left-joined with the salary rows on ``Season``,
        keeping only seasons that have a ``Salary`` value. When the page has
        no salary table, the empty salary DataFrame is returned as is.
    """
    # Check salaries first: players without salary data are skipped anyway,
    # so there is no point parsing (or failing on) their stats table.
    df_salary = parse_player_salary_html(html)
    if df_salary.empty:
        return df_salary

    df_stats = parse_player_stats_html(html)

    merged_df = pd.merge(df_stats, df_salary, on='Season', how='left')
    return merged_df.dropna(subset=['Salary'])

In [136]:
# Column layout of data/player_data.csv. Every appended row is forced into
# this exact order so the data always lines up with the header.
columns = ['Season', 'Age', 'Tm', 'Lg_x', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Awards', 'Name', 'Team', 'Lg_y', 'Salary']

csv_file = 'data/player_data.csv'

# Start a fresh file containing only the header row.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(columns)


# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for html in sorted(os.listdir('data/player_html/')):
    file_path = os.path.join("data/player_html/", html)

    # Skip sub-directories and hidden files that are not saved player pages.
    if html.startswith('.') or not os.path.isfile(file_path):
        continue

    df = create_df(file_path)

    if not df.empty:
        # reindex() drops unexpected columns and fills absent ones with NaN,
        # so a page with a different table layout cannot shift values under
        # the wrong header.
        df.reindex(columns=columns).to_csv(csv_file, mode='a', index=False, header=False)

In [138]:
# Load the combined data, order it by season and keep only the rows whose
# Salary is a dollar amount, then write the result (index included).
df = pd.read_csv("data/player_data.csv")
df_sorted = df.sort_values(by='Season')
# The mask is built from the sorted frame itself so it shares that frame's
# row order; na=False counts a missing Salary as "no match" instead of
# producing a NaN that cannot be used for boolean indexing.
df_filtered = df_sorted[df_sorted['Salary'].str.startswith('$', na=False)]
df_filtered.to_csv("full_nba_salaries.csv")