In [10]:
import os
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [11]:
base_url = "https://www.basketball-reference.com/teams/"

In [12]:
data_dir = 'data/'

In [13]:
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [16]:
def scrape_team_data(team, year):
    url = f"{base_url}{team}/{year}.html"
    
    try:
        response = requests.get(url)
        
        # Check the status code of the response
        if response.status_code != 200:
            print(f"Failed to retrieve data for {team} in {year}. HTTP Status Code: {response.status_code}")
            return pd.DataFrame()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the roster table
        roster_table = soup.find('table', {'id': 'roster'})

        if roster_table:
            players = roster_table.find('tbody').find_all('tr')

            data = []
            for player in players:
                try:
                    # Extract player name and jersey number
                    jersey_number = player.find('th', {'data-stat': 'number'}).text.strip()
                    player_name = player.find('td', {'data-stat': 'player'}).text.strip()

                    # Construct the player's profile URL if it exists
                    player_profile_tag = player.find('td', {'data-stat': 'player'}).find('a')
                    player_url = f"https://www.basketball-reference.com{player_profile_tag['href']}" if player_profile_tag else "N/A"

                    # Append the data
                    data.append({
                        'Year': year,
                        'PlayerName': player_name,
                        'JerseyNumber': jersey_number,
                        'PlayerURL': player_url
                    })

                except Exception as e:
                    print(f"Error processing player data for {team} in {year}: {e}")

            df = pd.DataFrame(data)
            return df
        else:
            print(f"Roster table not found for {team} in {year}")
            return pd.DataFrame()

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {team} in {year}: {e}")
        return pd.DataFrame()

In [17]:
team = 'BOS'
year = 1954
df = scrape_team_data(team, year)

In [18]:
print(df)


    Year     PlayerName JerseyNumber  \
0   1954  Don Barksdale           17   
1   1954  Ernie Barrett           23   
2   1954    Bob Brannum           18   
3   1954   Chuck Cooper           11   
4   1954      Bob Cousy           14   
5   1954     Bob Donham           12   
6   1954     Bob Harris           13   
7   1954    Ed Macauley           22   
8   1954       Ed Mikan           15   
9   1954   Jack Nichols           16   
10  1954   Bill Sharman           21   

                                            PlayerURL  
0   https://www.basketball-reference.com/players/b...  
1   https://www.basketball-reference.com/players/b...  
2   https://www.basketball-reference.com/players/b...  
3   https://www.basketball-reference.com/players/c...  
4   https://www.basketball-reference.com/players/c...  
5   https://www.basketball-reference.com/players/d...  
6   https://www.basketball-reference.com/players/h...  
7   https://www.basketball-reference.com/players/m...  
8   https://www

In [26]:
def save_team_data(team, df):
    if df.empty:
        return  # Skip saving empty DataFrame

    filename = f"{team}_RosterData.csv"
    filepath = os.path.join(data_dir, filename)

    if os.path.exists(filepath):
        df.to_csv(filepath, mode='a', header=False, index=False)
    else:
        df.to_csv(filepath, index=False)

    print(f"Saved data for {team} to {filename}")

In [27]:
def process_teams_years(teams_years):
    for team, years in teams_years.items():
        all_team_data = pd.DataFrame()
        for year in years:
            df = scrape_team_data(team, year)
            all_team_data = pd.concat([all_team_data, df], ignore_index=True)
            time.sleep(random.uniform(10, 15))  
        save_team_data(team, all_team_data)

In [30]:
teams_years = {
    'MIN': list(range(1954, 1960)),  # Minneapolis Lakers from 1954 to 1959
    'LAL': list(range(1960, 2025))   # Los Angeles Lakers from 1960 to 2024
}

In [31]:
process_teams_years(teams_years)

Failed to retrieve data for MIN in 1954. HTTP Status Code: 404
Failed to retrieve data for MIN in 1955. HTTP Status Code: 404
Failed to retrieve data for MIN in 1956. HTTP Status Code: 404
Failed to retrieve data for MIN in 1957. HTTP Status Code: 404
Failed to retrieve data for MIN in 1958. HTTP Status Code: 404
Failed to retrieve data for MIN in 1959. HTTP Status Code: 404
Failed to retrieve data for LAL in 1960. HTTP Status Code: 404
Saved data for LAL to LAL_RosterData.csv
