In [1]:
import requests
from bs4 import BeautifulSoup
import sys
import time
import csv
import pandas as pd
from tqdm import tqdm
import os

In [2]:
# Fetch and Soupify Function
def fetch(URL):
    response = None
    try:
        response = requests.get(URL)
        response.raise_for_status()  # Check if the request was successful
    except requests.exceptions.RequestException as e:
        if response.status_code == 429:
            retry_after = int(response.headers.get("Retry-After", 0))
            print(f'Rate limit exceeded. Wait before retrying: {retry_after} seconds')
            return None
        else:
            print(f"Error fetching the URL: {e}")
        return None
    else:
        return BeautifulSoup(response.content, "html.parser")

In [None]:
## Fetching all team names for team stats pages

for i in range(12):
    year = 2014 + i
    if year == 2020:
        continue

    URL = 'https://www.sports-reference.com/cbb/seasons/men/' + str(year) + '-school-stats.html'

    # Fetching and soupifying data
    soup = fetch(URL)
    if soup is None: sys.exit(1)

    # Getting rows from soup
    table = soup.find(id="basic_school_stats")
    rows = table.find_all("tr")

    TeamNames = []
    TourneyTeams = []
    for row in rows:
        # Find all <td> elements with data-stat='school_name'
        school = row.find('td', attrs={'data-stat': 'school_name'})
        if school:
            a = school.find('a')
            if 'href' in a.attrs: 
                href = a.get('href')
                href = href[:-5] + '-gamelogs.html'
                # href = href[:-5] + '-gamelogs-advanced.html'    # Change to use the advanced stats page
            name = school.text
            if name.endswith('\xa0NCAA'):   # Removing NCAA tourney tag
                name = name.replace('\xa0NCAA','')
                TourneyTeams.append(name)

            TeamNames.append([name,href])

    with open(f"Tournament_Teams_{year}.txt", "w") as f:
        for team in TourneyTeams:
            f.write(team + "\n")




In [None]:
## Fetching data for team stats

GameData = {}
year = 2025

for name in tqdm(TeamNames, desc=str(year), unit="team"):

    URL = 'https://www.sports-reference.com' + name[1]

    # Fetching URL and soupifying it
    soup = fetch(URL)
    if soup is None: sys.exit(1)

    # Grabbing all rows of the table
    table = soup.find(id="team_game_log")
    # table = soup.find(id="sgl-advanced")  # Switch ID when getting advanced table
    rows = table.find_all("tr")

    # Grabbing data from each row
    labels = []
    data = []
    for i,row in enumerate(rows):
        stats = row.find_all(['th','td'], attrs={'data-stat': True})
        temp = {}
        for j,stat in enumerate(stats):
            # Skipping header rows
            if i == 0:
                continue
            
            # Grabbing labels row (Had to manually fix some labels)
            if i == 1:
                # if j == 0 or j == 1 or j == 2:
                #     continue
                if j == 3:
                    labels.append('Site')
                elif j >= 31:
                    labels.append('o' + stat.text)
                else:
                    labels.append(stat.text)

            # Grabbing data
            else:
                temp[labels[j]] = stat.text

        # Adding only filled data blocks
        if temp:
            temp.pop('\xa0', None)  # Remove undefined columns
            temp['Team'] = name[0]
            data.append(temp)
    
    # Adding team data to GameData
    GameData[name[0]] = data
    labels.append("Team")
    time.sleep(2)

filename = str(year) + "GameData.csv"
# filename = "2024AdvGameData.csv"

with open(filename, "w") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=labels)

    writer.writeheader()

    for name in TeamNames:
        writer.writerows(GameData[name[0]])

2025: 100%|██████████| 364/364 [13:25<00:00,  2.21s/team]


ValueError: dict contains fields not in fieldnames: 'Team'

In [42]:
with open(filename, "w") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=labels)

    writer.writeheader()

    for name in TeamNames:
        writer.writerows(GameData[name[0]])