In [1]:
from entities import *
from constants import PICKLE_FILE

import time
import requests
from bs4 import BeautifulSoup, Comment

from tqdm import tqdm, trange

import os
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
URL = "https://www.sports-reference.com"
WAIT = 10
DESCRIPTION_LOC = 0
PLAYER_STATS_LOC = 6

In [3]:
def fetch_cbb_page(url, delay=3):
    """
    Fetch a page from Sports Reference with polite rate limiting

    Args:
        url: The URL to fetch
        delay: Seconds to wait before request (respects 20 req/min limit)

    Returns:
        BeautifulSoup object or None if failed
    """
    # Be polite - wait between requests
    time.sleep(delay)

    # Headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            print(f"✓ Successfully fetched: {url}")
            return BeautifulSoup(response.content, 'html.parser')
        elif response.status_code == 429:
            print("✗ Rate limited! Wait 60 seconds and try again.")
            return None
        elif response.status_code == 403:
            print("✗ Access forbidden. Your IP may be blocked.")
            return None
        else:
            print(f"✗ Failed with status code: {response.status_code}")
            return None

    except Exception as e:
        print(f"✗ Error: {e}")
        return None

In [4]:
def fetch_schedule(link):

    logs = pd.read_html(link)[0].iloc[:-1]
    team_stats = logs['Team'].copy()
    opp_stats = logs['Opponent'].copy()

    game_mask = logs.columns.get_level_values(0).str.contains('Unnamed|Score')
    game_details = logs.loc[:, game_mask].copy()
    game_details.columns = game_details.columns.get_level_values(1)

    cols = list(game_details.columns)
    cols[4], cols[8] = 'Opp_Name', 'Opp_Score'
    
    game_details.columns = cols
    game_details = game_details[['Date', 'Opp_Name', 'Type', 'Rslt', 'Tm', 'Opp_Score']]

    team_stats = team_stats.join(game_details)
    opp_stats = opp_stats.join(game_details)

    return game_details, team_stats, opp_stats


In [5]:
soup = fetch_cbb_page(
    'https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html')

✓ Successfully fetched: https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html


In [6]:
school_tds = soup.find_all('td', {'data-stat': 'school_name'})

schools = []
for td in tqdm(school_tds):
    link = td.find('a')

    if link:
        school_dict = {
            'School': link.text,
            'Link': f"{URL}{link['href']}"
        }
        schools.append(school_dict)

print(f"Found {len(schools)} schools")

100%|██████████| 365/365 [00:00<00:00, 72984.41it/s]

Found 365 schools





In [7]:
df = pd.DataFrame(schools)
df['GameLogs'] = [link.replace('2026.html', '2026-gamelogs.html') for link in df['Link']]
df

Unnamed: 0,School,Link,GameLogs
0,Abilene Christian,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...
1,Air Force,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...
2,Akron,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...
3,Alabama,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...
4,Alabama A&M,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...
...,...,...,...
360,Wright State,https://www.sports-reference.com/cbb/schools/w...,https://www.sports-reference.com/cbb/schools/w...
361,Wyoming,https://www.sports-reference.com/cbb/schools/w...,https://www.sports-reference.com/cbb/schools/w...
362,Xavier,https://www.sports-reference.com/cbb/schools/x...,https://www.sports-reference.com/cbb/schools/x...
363,Yale,https://www.sports-reference.com/cbb/schools/y...,https://www.sports-reference.com/cbb/schools/y...


In [8]:
school_stats = pd.read_html('https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html')[0]

In [9]:
end = f'{" "*100}\r'

if os.path.exists(PICKLE_FILE) and os.path.getsize(PICKLE_FILE) > 0:
    with open(PICKLE_FILE, 'rb') as f:
        teams = pickle.load(f)
    print(f"Loaded {len(teams)} existing teams.")
else:
    teams = {}
    print("No valid existing data found. Starting fresh.")

team_list = df['School']
link_list = df['Link']
sched_list = df['GameLogs']

for school, link, sched in zip(team_list, link_list, sched_list):
    
    # Skip if we already have this school
    if school in teams:
        print(f"Skipping {school} (already exists)...", end=end)
        continue

    print(f'Gathering Data: {school:50}', end=f'{" "*100}\r')

    try:
        _ = pd.read_html(link)
        player_descriptions = BasketballData(_[DESCRIPTION_LOC])
        player_stats = BasketballData(_[PLAYER_STATS_LOC])
        team_record = school_stats[school_stats['Unnamed: 1_level_0', 'School'] == school][school_stats.columns[2:8]]['Overall'].iloc[0]
        game_log = fetch_schedule(sched)

        player_context = PlayerContext(
            school=school, 
            description_df=player_descriptions, 
            performance_df=player_stats, 
            record=team_record,
            schedule=game_log[0],
            schedule_team=game_log[1],
            schedule_opp=game_log[2]
        )
        team = Team(player_context)
        teams[school] = team

        with open(PICKLE_FILE, 'wb') as f:
            pickle.dump(teams, f)

    except Exception as e:
        pass

print("\nProcessing complete.")

No valid existing data found. Starting fresh.
Gathering Data: Youngstown State                                                                                                                                      
Processing complete.


In [11]:
with open(PICKLE_FILE, 'rb') as f:
    test = pickle.load(f)

test

{'Abilene Christian': <Team: Abilene Christian Record: 5-3>,
 'Air Force': <Team: Air Force Record: 3-6>,
 'Akron': <Team: Akron Record: 6-2>,
 'Alabama': <Team: Alabama Record: 5-2>,
 'Alabama A&M': <Team: Alabama A&M Record: 3-2>,
 'Alabama State': <Team: Alabama State Record: 3-5>,
 'Albany (NY)': <Team: Albany (NY) Record: 2-6>,
 'Alcorn State': <Team: Alcorn State Record: 1-8>,
 'American': <Team: American Record: 4-4>,
 'Appalachian State': <Team: Appalachian State Record: 4-4>,
 'Arizona': <Team: Arizona Record: 7-0>,
 'Arizona State': <Team: Arizona State Record: 6-2>,
 'Arkansas': <Team: Arkansas Record: 5-2>,
 'Arkansas State': <Team: Arkansas State Record: 5-3>}

In [14]:
test['Arizona'].schedule_opp

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,Date,Opp_Name,Type,Rslt,Tm,Opp_Score
0,30.0,70.0,0.429,7.0,27.0,0.259,23.0,43.0,0.535,0.479,20.0,30.0,0.667,15.0,21.0,36.0,15.0,9.0,2.0,14.0,23.0,2025-11-03,Florida,REG (Non-Conf),W,93.0,87.0
1,21.0,55.0,0.382,6.0,22.0,0.273,15.0,33.0,0.455,0.436,19.0,27.0,0.704,11.0,17.0,28.0,5.0,7.0,4.0,14.0,16.0,2025-11-07,Utah Tech,REG (Non-Conf),W,93.0,67.0
2,20.0,62.0,0.323,4.0,16.0,0.25,16.0,46.0,0.348,0.355,5.0,7.0,0.714,10.0,15.0,25.0,11.0,13.0,2.0,17.0,17.0,2025-11-11,Northern Arizona,REG (Non-Conf),W,84.0,49.0
3,25.0,58.0,0.431,9.0,17.0,0.529,16.0,41.0,0.39,0.509,6.0,8.0,0.75,6.0,18.0,24.0,15.0,4.0,3.0,15.0,12.0,2025-11-14,UCLA,REG (Non-Conf),W,69.0,65.0
4,25.0,56.0,0.446,8.0,25.0,0.32,17.0,31.0,0.548,0.518,9.0,18.0,0.5,4.0,15.0,19.0,15.0,4.0,6.0,5.0,21.0,2025-11-19,Connecticut,REG (Non-Conf),W,71.0,67.0
5,24.0,60.0,0.4,9.0,24.0,0.375,15.0,36.0,0.417,0.475,16.0,24.0,0.667,7.0,16.0,23.0,13.0,8.0,1.0,14.0,17.0,2025-11-24,Denver,REG (Non-Conf),W,103.0,73.0
6,27.0,64.0,0.422,5.0,16.0,0.313,22.0,48.0,0.458,0.461,2.0,9.0,0.222,4.0,18.0,22.0,12.0,4.0,2.0,14.0,29.0,2025-11-29,Norfolk State,REG (Non-Conf),W,98.0,61.0
7,,,,,,,,,,,,,,,,,,,,,,2025-12-06,Auburn,REG (Non-Conf),,,
8,,,,,,,,,,,,,,,,,,,,,,2025-12-12,Alabama,REG (Non-Conf),,,
9,,,,,,,,,,,,,,,,,,,,,,2025-12-16,Abilene Christian,REG (Non-Conf),,,
