In [None]:
!pip install fake-useragent

Collecting fake-useragent
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake-useragent
Successfully installed fake-useragent-1.5.1


In [None]:
import os
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; data_dir (defined in a later cell) lives under its "My Drive" folder.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root of the team pages; scrape_team_data appends "<team>/<year>.html" to this.
base_url = "https://www.basketball-reference.com/teams/"

In [None]:
# Output directory on the mounted Drive; save_team_data writes "<team>_RosterData.csv" files here.
data_dir = '/content/drive/My Drive/GA/Capstone/capstone/data'

In [None]:
# this was an approach that I was using to try and use proxy servers to scrape the data, because Google Colab is blocked.
# These proxies were also blocked, but I'm going to leave this in the repo in case I ever want to pay for proxies to scrape
# large amounts of data from BBRef in the future. The real scraping ipynb file is called "bbref-scraping"

# Create the output directory (and any missing parents) if needed. exist_ok=True makes this
# a single call with no gap between "check" and "create".
os.makedirs(data_dir, exist_ok=True)

def scrape_team_data(team, year):
    """Scrape one team-season roster page directly (no proxy) into a DataFrame.

    Note: a later cell in this notebook defines ``scrape_team_data`` again (the
    proxy-rotating version); once that cell runs, it replaces this one.

    Args:
        team: Team abbreviation used in the URL path (e.g. 'BOS').
        year: Season year used in the URL path (e.g. 2024).

    Returns:
        pandas.DataFrame built from one dict per roster row, with keys Year,
        PlayerName, JerseyNumber and PlayerURL. An empty DataFrame is returned
        when the request fails, the status code is not 200, or the page has no
        roster table.
    """
    url = f"{base_url}{team}/{year}.html"

    try:
        # A timeout keeps a stalled connection from hanging the whole run. Timeouts are
        # RequestException subclasses, so they are reported by the handler below.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {team} in {year}: {e}")
        return pd.DataFrame()

    # checks for response status (BBRef is very sensitive to scraping requests)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {team} in {year}. HTTP Status Code: {response.status_code}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, 'html.parser')

    # finds the roster table
    roster_table = soup.find('table', {'id': 'roster'})
    if roster_table is None:
        print(f"Roster table not found for {team} in {year}")
        return pd.DataFrame()

    # a roster table without a <tbody> is treated as having no rows instead of crashing
    tbody = roster_table.find('tbody')
    players = tbody.find_all('tr') if tbody is not None else []

    data = []
    for player in players:
        try:
            # extracts player name and jersey number
            player_name_tag = player.find('th', {'data-stat': 'player'})
            player_name = player_name_tag.text.strip() if player_name_tag else "N/A"

            jersey_number_tag = player.find('td', {'data-stat': 'number'})
            jersey_number = jersey_number_tag.text.strip() if jersey_number_tag else "N/A"

            # constructs the player's profile URL (if it exists); profile URLs follow a consistent pattern
            player_profile_tag = player_name_tag.find('a') if player_name_tag else None
            player_url = f"https://www.basketball-reference.com{player_profile_tag['href']}" if player_profile_tag else "N/A"

            # appends the data
            data.append({
                'Year': year,
                'PlayerName': player_name,
                'JerseyNumber': jersey_number,
                'PlayerURL': player_url
            })

        except Exception as e:
            # best effort: one malformed row should not discard the rest of the roster
            print(f"Error processing player data for {team} in {year}: {e}")

    return pd.DataFrame(data)

In [None]:
# NOTE(review): get_free_proxies is not defined or imported in any cell visible in this notebook,
# so this cell presumably fails with NameError on a fresh kernel — TODO confirm where it is defined.
proxies = get_free_proxies()  # pool that get_random_proxy() draws from
ua = UserAgent()  # fake_useragent generator; scrape_team_data reads ua.random for each request

def get_random_proxy():
    """Pick one proxy at random and return it for both HTTP and HTTPS traffic.

    A single draw is shared by both schemes so that the proxy the caller logs
    (``proxy['http']``) is the same one an https request is actually routed through.

    Returns:
        dict with keys "http" and "https", as passed to ``requests.get(proxies=...)``.

    Raises:
        IndexError: if ``proxies`` is empty (raised by ``random.choice``).
    """
    chosen = random.choice(proxies)
    return {"http": chosen, "https": chosen}

In [None]:


def scrape_team_data(team, year):
    """Scrape one team-season roster page through rotating proxies and User-Agents.

    Makes up to 5 attempts. Each attempt uses a fresh proxy from ``get_random_proxy()``
    and a random User-Agent. A failed request or an HTTP 429 response moves on to the
    next attempt after a pause; any other non-200 status gives up immediately.

    Args:
        team: Team abbreviation used in the URL path (e.g. 'BOS').
        year: Season year used in the URL path (e.g. 2024).

    Returns:
        pandas.DataFrame built from one dict per successfully parsed roster row, with
        keys Year, PlayerName, JerseyNumber and PlayerURL. An empty DataFrame is
        returned when every attempt fails, the status is neither 200 nor 429, or the
        page has no roster table.
    """
    url = f"{base_url}{team}/{year}.html"

    for _ in range(5):  # retries up to 5 times
        proxy = get_random_proxy()
        headers = {'User-Agent': ua.random}  # randomly rotates User-Agent

        try:
            # timeout threshold for faster retries
            response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        # process for switching to a different proxy
        # The exception classes are reached through the requests package: the bare names
        # ProxyError/SSLError were never imported (NameError when the handler is evaluated),
        # and the builtin ConnectionError is not the class requests raises. RequestException
        # covers proxy, SSL, connection and timeout failures alike.
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {team} in {year} with proxy {proxy['http']}: {e}")
            time.sleep(random.uniform(10, 30))
            continue

        # contingency for if proxy is blocked
        if response.status_code == 429:
            try:
                retry_after = int(response.headers.get('Retry-After', 10))
            except (TypeError, ValueError):
                # Retry-After may be an HTTP-date instead of a number of seconds
                retry_after = 10
            print(f"HTTP 429: Too many requests. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for {team} in {year}. HTTP Status Code: {response.status_code}")
            return pd.DataFrame()

        soup = BeautifulSoup(response.content, 'html.parser')
        roster_table = soup.find('table', {'id': 'roster'})
        if roster_table is None:
            print(f"Roster table not found for {team} in {year}")
            return pd.DataFrame()

        # a roster table without a <tbody> is treated as having no rows instead of crashing
        tbody = roster_table.find('tbody')
        players = tbody.find_all('tr') if tbody is not None else []

        data = []
        for player in players:
            try:
                name_cell = player.find('th', {'data-stat': 'player'})
                player_name = name_cell.text.strip()
                jersey_number = player.find('td', {'data-stat': 'number'}).text.strip()

                player_profile_tag = name_cell.find('a')
                player_url = f"https://www.basketball-reference.com{player_profile_tag['href']}" if player_profile_tag else "N/A"

                data.append({
                    'Year': year,
                    'PlayerName': player_name,
                    'JerseyNumber': jersey_number,
                    'PlayerURL': player_url
                })
            except Exception as e:
                # best effort: a row missing the expected cells is reported and skipped
                print(f"Error processing player data for {team} in {year}: {e}")

        return pd.DataFrame(data)

    print(f"Failed to retrieve data for {team} in {year} after multiple attempts.")
    return pd.DataFrame()

In [None]:
def save_team_data(team, df):
    """Persist a team's roster rows to ``<team>_RosterData.csv`` inside ``data_dir``.

    An empty DataFrame is ignored. If the CSV already exists the rows are appended
    without a header line; otherwise a new file is written with the header.

    Args:
        team: Team abbreviation used to build the file name.
        df: pandas.DataFrame of roster rows to write.
    """
    if df.empty:
        return

    filename = f"{team}_RosterData.csv"
    filepath = os.path.join(data_dir, filename)

    # one to_csv call: append without header when the file is already there,
    # otherwise create it with the header row
    already_on_disk = os.path.exists(filepath)
    df.to_csv(
        filepath,
        mode='a' if already_on_disk else 'w',
        header=not already_on_disk,
        index=False,
    )

    print(f"Saved data for {team} to {filename}")

In [None]:
def process_teams_years(teams_years):
    """Scrape every (team, year) pair and save one combined CSV per team.

    Args:
        teams_years: dict mapping a team abbreviation to an iterable of season years.

    For each team, every year is scraped with ``scrape_team_data``, followed by a
    10-15 second pause. The non-empty results are concatenated once and handed to
    ``save_team_data``.
    """
    for team, years in teams_years.items():
        # collect per-season frames and concatenate once; growing a DataFrame with
        # pd.concat inside the loop recopies all earlier rows on every iteration
        season_frames = []
        for year in years:
            df = scrape_team_data(team, year)
            if not df.empty:
                season_frames.append(df)
            time.sleep(random.uniform(10, 15)) # waiting 10-15 seconds for each scrape to prevent timeouts (this works)

        if season_frames:
            all_team_data = pd.concat(season_frames, ignore_index=True)
        else:
            all_team_data = pd.DataFrame()
        save_team_data(team, all_team_data)

In [None]:
# Celtics (BOS) and Lakers (LAL), seasons 1954 through 2024 inclusive
teams_years = {team: list(range(1954, 2025)) for team in ('BOS', 'LAL')}

In [None]:
# Long-running: 2 teams x 71 seasons = 142 page requests, each followed by a 10-15 second
# sleep in process_teams_years, so expect roughly 24-36 minutes of waiting before any retries.
process_teams_years(teams_years)

NameError: name 'ProxyError' is not defined