### See Sports Reference's bot-traffic policy before scraping: https://www.sports-reference.com/bot-traffic.html

In [165]:
import os
from bs4 import BeautifulSoup, Comment
import requests
import time
import random

In [39]:
# Lowercase ASCII letters 'a'..'z'; used below as the per-letter key for
# URLs and file names.
letters = list(map(chr, range(ord('a'), ord('z') + 1)))

In [41]:
def save_html(url, folder_path, file_name, timeout=30):
    """Download *url* and save the response body as an HTML file.

    The function sleeps for 5 seconds before every request (presumably to
    stay within the site's rate limits), then writes the response text to
    ``folder_path/file_name`` as UTF-8 when the server answers with HTTP 200.
    Any other status code is reported on stdout and nothing is written.

    Args:
        url: Address of the page to fetch.
        folder_path: Directory to save into; created if it does not exist.
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request fails at the network level or times out.
    """
    time.sleep(5)

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve HTML. Status code:", response.status_code)
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    with open(os.path.join(folder_path, file_name), 'w', encoding='utf-8') as f:
        f.write(response.text)

In [42]:
# Fetch the player index page of every letter and store it as
# data/player_names/<letter>_names.html.
save_folder_path = "data/player_names"
for letter in letters:
    file_name = f"{letter}_names.html"
    url = f"https://www.basketball-reference.com/players/{letter}/"
    save_html(url, save_folder_path, file_name)
    

In [43]:
def parse_html_file(file_path):
    """Return the href of the first link in each row of the ``players`` table.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.
    Rows without an anchor, or whose first anchor has no ``href`` attribute,
    contribute nothing. When the page has no ``<table id="players">`` a
    notice is printed and an empty list is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    document = BeautifulSoup(markup, 'html.parser')
    players_table = document.find('table', id='players')

    if not players_table:
        print("No table with id='players' found in:", file_path)
        return []

    # Only the first anchor of a row is considered, as in row.find('a').
    first_links = [row.find('a') for row in players_table.find_all('tr')]
    return [link['href'] for link in first_links if link and link.has_attr('href')]

In [44]:
# Extract the player links from every saved index page and store them, one
# href per line, in data/player_href/<letter>_names_href.txt.
#
# The output folder is created up front: open(..., 'w') does not create
# missing directories and would raise FileNotFoundError on a fresh run.
os.makedirs("data/player_href", exist_ok=True)

for letter in letters:
    file_path = f"data/player_names/{letter}_names.html"
    href_values = parse_html_file(file_path)

    href_file_path = os.path.join("data/player_href", f"{letter}_names_href.txt")
    with open(href_file_path, 'w', encoding='utf-8') as href_file:
        href_file.write('\n'.join(href_values))

In [179]:
def check_file_exists(file_path):
    """Return True when *file_path* names an existing filesystem entry.

    Thin wrapper around ``os.path.exists``, so an existing directory counts
    as "exists" too.
    """
    path_is_present = os.path.exists(file_path)
    return path_is_present

In [187]:
# Function to retrieve HTML for a single href and store it in folder_path
def save_player_html(href, folder_path, timeout=30):
    """Download one player page unless it is already saved locally.

    The last path segment of *href* (e.g. ``abdulka01.html``) is used as the
    file name inside *folder_path*. If that file already exists the function
    returns immediately without sleeping or touching the network, which lets
    an interrupted scrape be resumed by simply re-running it.

    Args:
        href: Site-relative link such as ``/players/a/abdulka01.html``.
            Surrounding whitespace (including a trailing newline) is removed.
        folder_path: Directory the page is saved into; created when needed.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request fails at the network level or times out.
    """
    href = href.strip()

    # Extract player ID from href
    player_id = href.split('/')[-1]
    if not player_id:
        # Blank line or an href ending in '/': there is no file name to save
        # under, and joining '' onto folder_path would point at the folder
        # itself, so skip the entry.
        return

    file_path = os.path.join(folder_path, player_id)
    if check_file_exists(file_path):
        return

    # Pause before each real request to throttle the scraper.
    time.sleep(3.5)
    url = f"https://www.basketball-reference.com{href}"
    print(url)

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve HTML for player {url}. Status code:", response.status_code)
        return

    # open(..., 'w') does not create missing directories.
    os.makedirs(folder_path, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)


In [188]:
# Walk through each letter's href list and hand every entry to
# save_player_html, which stores the pages under data/player_html.
for letter in letters:
    file_path = os.path.join("data/player_href/", f"{letter}_names_href.txt")
    with open(file_path, 'r', encoding='utf-8') as href_file:
        # Lines still carry their trailing newline here; save_player_html
        # strips it before building the URL.
        for href in href_file:
            save_player_html(href, "data/player_html")

https://www.basketball-reference.com/players/f/fergude01.html
https://www.basketball-reference.com/players/f/fergute01.html
https://www.basketball-reference.com/players/f/fernaru01.html
https://www.basketball-reference.com/players/f/fernabr01.html
https://www.basketball-reference.com/players/f/fernser01.html
https://www.basketball-reference.com/players/f/ferraal01.html
https://www.basketball-reference.com/players/f/ferrero01.html
https://www.basketball-reference.com/players/f/ferredu01.html
https://www.basketball-reference.com/players/f/ferreyo01.html
https://www.basketball-reference.com/players/f/ferriar01.html
https://www.basketball-reference.com/players/f/ferrybo01.html
https://www.basketball-reference.com/players/f/ferryda01.html
https://www.basketball-reference.com/players/f/fesenky01.html
https://www.basketball-reference.com/players/f/fieldbo01.html
https://www.basketball-reference.com/players/f/fieldke01.html
https://www.basketball-reference.com/players/f/fieldla01.html
https://

KeyboardInterrupt: 

In [154]:
def parse_player_html(file_path):
    """Extract the salary history from a saved player page.

    The salary table is looked up inside the page's HTML comments: each
    candidate comment is parsed as its own document and searched for
    ``<table id="all_salaries">``.

    Args:
        file_path: Path to a saved player HTML file (read as UTF-8).

    Returns:
        A list of dicts with the keys ``season``, ``team_name``, ``lg_id``
        and ``salary``, all strings, one per table row after the first
        (header) row. ``salary`` has ``$`` and ``,`` removed. Summary rows
        are not filtered out, so a trailing "Career" row is returned like
        any other row. Rows without a ``<th>`` season cell or with fewer
        than three ``<td>`` cells are skipped. Returns ``[]`` (after
        printing a notice) when no salary table is found.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialise explicitly: on a page without any comments the loop body
    # never runs and the name would otherwise be unbound below.
    salaries_table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        # A comment can only hold the table if it mentions its id, so skip
        # the rest instead of parsing every comment as a document.
        if 'all_salaries' not in comment:
            continue
        data = BeautifulSoup(comment, "html.parser")
        salaries_table = data.find('table', id='all_salaries')
        if salaries_table:
            break

    # Check if the salaries table exists
    if not salaries_table:
        print("No table with id='all_salaries' found in the HTML.")
        return []

    player_salaries = []

    # The first row is the column header.
    for row in salaries_table.find_all('tr')[1:]:
        season_cell = row.find('th')
        values = [cell.get_text(strip=True) for cell in row.find_all('td')]

        # Expected layout: <th>season</th><td>team</td><td>league</td><td>salary</td>.
        # Skip anything else rather than reusing values from the previous row.
        if season_cell is None or len(values) < 3:
            continue

        team_name, lg_id, raw_salary = values[:3]
        player_salaries.append({
            'season': season_cell.get_text(strip=True),
            'team_name': team_name,
            'lg_id': lg_id,
            'salary': raw_salary.replace("$", '').replace(',', '')
        })

    return player_salaries

In [155]:
# Smoke test: parse one saved player page and print the extracted salary rows.
test_html = "data/player_html/abdulka01.html"

temp = parse_player_html(test_html)
print(temp)


  data = BeautifulSoup(comment, "html.parser")


[{'season': '1984-85', 'team_name': 'Los Angeles Lakers', 'lg_id': 'NBA', 'salary': '1530000'}, {'season': '1985-86', 'team_name': 'Los Angeles Lakers', 'lg_id': 'NBA', 'salary': '2030000'}, {'season': '1987-88', 'team_name': 'Los Angeles Lakers', 'lg_id': 'NBA', 'salary': '2000000'}, {'season': '1988-89', 'team_name': 'Los Angeles Lakers', 'lg_id': 'NBA', 'salary': '3000000'}, {'season': 'Career', 'team_name': '(may be incomplete)', 'lg_id': '', 'salary': '8560000'}]
