In [None]:
from entities import *
from constants import PICKLE_FILE

import time
import requests
from bs4 import BeautifulSoup, Comment

from tqdm import tqdm, trange

import os
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [None]:
# Site root; prepended to the relative hrefs scraped from the school index table.
URL = "https://www.sports-reference.com"
# NOTE(review): presumably a pause length in seconds between requests — confirm against usage.
WAIT = 10
# Positions, within the list of tables pd.read_html() returns for a school's
# season page, of the two tables handed to BasketballData when building a team.
DESCRIPTION_LOC = 0
PLAYER_STATS_LOC = 6

In [None]:
def fetch_cbb_page(url, delay=3, timeout=30):
    """
    Fetch a page from Sports Reference with polite rate limiting.

    Args:
        url: The URL to fetch.
        delay: Seconds to sleep before the request is sent. The default of 3
            keeps back-to-back calls at or below 20 requests per minute.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection.

    Returns:
        BeautifulSoup object (parsed with 'html.parser'), or None if the
        request raised, timed out, or came back with a non-200 status.
        A one-line message describing the outcome is printed in every case.
    """
    # Be polite - wait between requests
    time.sleep(delay)

    # Headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Statuses that get a specific hint; anything else falls back to a generic message
    failure_hints = {
        429: "✗ Rate limited! Wait 60 seconds and try again.",
        403: "✗ Access forbidden. Your IP may be blocked.",
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        status = response.status_code

        if status != 200:
            print(failure_hints.get(status, f"✗ Failed with status code: {status}"))
            return None

        print(f"✓ Successfully fetched: {url}")
        return BeautifulSoup(response.content, 'html.parser')

    except Exception as e:
        # Deliberately broad: callers rely on "None means failure" and never
        # expect this function to raise. The error is reported, not hidden.
        print(f"✗ Error: {e}")
        return None

In [None]:
def fetch_schedule(link):
    """
    Read a game-log table and split it into game details, team stats and
    opponent stats.

    The first HTML table found at ``link`` is loaded and its last row is
    discarded. The table is expected to have two-level column headers with
    'Team' and 'Opponent' groups, plus game-level columns whose top-level
    label contains 'Unnamed' or 'Score'.

    Args:
        link: URL (or anything pd.read_html accepts) of the game-log page.

    Returns:
        Tuple ``(game_details, team_stats, opp_stats)``:
          - game_details: columns Date, Opp_Name, Type, Rslt, Tm, Opp_Score
          - team_stats:   the 'Team' group joined with game_details
          - opp_stats:    the 'Opponent' group joined with game_details
    """
    all_rows = pd.read_html(link)[0]
    logs = all_rows.iloc[:-1]

    team_stats = logs['Team'].copy()
    opp_stats = logs['Opponent'].copy()

    # Keep only the game-level columns, then flatten to the second header level
    top_labels = logs.columns.get_level_values(0)
    is_game_column = top_labels.str.contains('Unnamed|Score')
    game_details = logs.loc[:, is_game_column].copy()

    flat_labels = list(game_details.columns.get_level_values(1))
    # Positions 4 and 8 are renamed by index; assumes the page's current
    # column order puts the opponent name and opponent score there — TODO confirm.
    flat_labels[4] = 'Opp_Name'
    flat_labels[8] = 'Opp_Score'
    game_details.columns = flat_labels

    wanted = ['Date', 'Opp_Name', 'Type', 'Rslt', 'Tm', 'Opp_Score']
    game_details = game_details[wanted]

    # Index-aligned joins attach the game context to each stat row
    return game_details, team_stats.join(game_details), opp_stats.join(game_details)


In [None]:
# Season index page: one table row per school, each linking to the school's season page
soup = fetch_cbb_page(
    'https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html')

# fetch_cbb_page signals every failure by returning None. Stop here with a clear
# message instead of letting a later cell fail on `None.find_all`.
if soup is None:
    raise RuntimeError(
        "Could not fetch the 2026 school stats index; see the message printed above."
    )

In [None]:
# Every cell of the index table that holds a school name
school_tds = soup.find_all('td', {'data-stat': 'school_name'})

# One record per cell that actually contains a link: the display name plus the
# absolute URL built from the site root and the relative href.
schools = [
    {
        'School': anchor.text,
        'Link': f"{URL}{anchor['href']}"
    }
    for anchor in (cell.find('a') for cell in tqdm(school_tds))
    if anchor
]

print(f"Found {len(schools)} schools")

In [None]:
# Advanced school stats page; only the school name and Pace columns are used
url = 'https://www.sports-reference.com/cbb/seasons/men/2026-advanced-school-stats.html'
raw_data = pd.read_html(url)[0]

pos_df = (
    pd.DataFrame({
        'School': raw_data[('Unnamed: 1_level_0', 'School')],
        'Pace': raw_data[('School Advanced', 'Pace')]
    })
    # Rows whose School cell is the literal text 'School' are removed
    # (presumably header rows repeated inside the table body)
    .loc[lambda frame: frame['School'] != 'School']
    .dropna()
    .reset_index(drop=True)
    .assign(Pace=lambda frame: pd.to_numeric(frame['Pace']))
)

In [None]:
# One row per school: display name, season page URL, game-log URL and Pace
df = pd.DataFrame(schools)
df['GameLogs'] = df['Link'].str.replace('2026.html', '2026-gamelogs.html', regex=False)

# Attach Pace by school NAME rather than by row position. `schools` and `pos_df`
# come from two different pages, and `pos_df` has had rows dropped, so a purely
# positional assignment can silently give a school another school's Pace.
pace_by_school = (
    pos_df
    .assign(
        # NOTE(review): the stats tables appear to append an "NCAA" marker to
        # tournament teams' names; strip it so names match the link text — confirm.
        School=pos_df['School'].str.replace(r'\s*NCAA$', '', regex=True).str.strip()
    )
    .drop_duplicates('School')  # Series.map needs a unique index
    .set_index('School')['Pace']
)
df['Pace'] = df['School'].str.strip().map(pace_by_school)

# Surface name mismatches instead of carrying silent NaNs downstream
missing_pace = int(df['Pace'].isna().sum())
if missing_pace:
    print(f"Warning: no Pace value matched for {missing_pace} school(s)")

df

In [None]:
# First table on the season index page; used later to look up each school's 'Overall' record
school_stats_tables = pd.read_html('https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html')
school_stats = school_stats_tables[0]

In [None]:
# Padding plus carriage return so each progress message overwrites the previous one
end = f'{" "*100}\r'

# Resume from the checkpoint when one exists and is non-empty.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# checkpoints that this notebook wrote itself.
if os.path.exists(PICKLE_FILE) and os.path.getsize(PICKLE_FILE) > 0:
    with open(PICKLE_FILE, 'rb') as f:
        teams = pickle.load(f)
    print(f"Loaded {len(teams)} existing teams.")
else:
    teams = {}
    print("No valid existing data found. Starting fresh.")

team_list = df['School']
link_list = df['Link']
sched_list = df['GameLogs']
possession_list = df['Pace']

# school -> short description of why it could not be processed
failed = {}

for school, link, sched, pos in zip(team_list, link_list, sched_list, possession_list):

    # Skip if we already have this school
    if school in teams:
        print(f"Skipping {school} (already exists)...", end=end)
        continue

    print(f'Gathering Data: {school:50}', end=end)

    try:
        # Two pages are downloaded per school (season page + game log); pause
        # first so the run stays under the site's request-rate limit.
        time.sleep(WAIT)

        tables = pd.read_html(link)
        player_descriptions = BasketballData(tables[DESCRIPTION_LOC])
        player_stats = BasketballData(tables[PLAYER_STATS_LOC])
        team_record = school_stats[school_stats['Unnamed: 1_level_0', 'School'] == school][school_stats.columns[2:8]]['Overall'].iloc[0]
        game_log = fetch_schedule(sched)

        player_context = PlayerContext(
            school=school,
            description_df=player_descriptions,
            performance_df=player_stats,
            record=team_record,
            schedule=game_log[0],
            schedule_team=game_log[1],
            schedule_opp=game_log[2],
            pace=pos
        )
        teams[school] = Team(player_context)

        # Checkpoint after every school. Write to a temp file and swap it in so
        # an interrupted dump cannot leave a truncated, unloadable checkpoint.
        tmp_path = f"{PICKLE_FILE}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(teams, f)
        os.replace(tmp_path, PICKLE_FILE)

    except Exception as e:
        # Best effort: one bad school must not stop the run, but the failure
        # is recorded and reported below rather than silently discarded.
        failed[school] = f"{type(e).__name__}: {e}"

print("\nProcessing complete.")

if failed:
    print(f"{len(failed)} school(s) could not be processed:")
    for failed_school, reason in failed.items():
        print(f"  {failed_school}: {reason}")

In [None]:
# Read the checkpoint back from disk to confirm it can be loaded
with open(PICKLE_FILE, 'rb') as checkpoint:
    test = pickle.load(checkpoint)

# Preview the first five keys
[*test.keys()][:5]

## Overall Distribution of Possessions

In [None]:
import pandas as pd
from scipy.stats import norm

url = 'https://www.teamrankings.com/ncaa-basketball/stat/possessions-per-game'

# Scraped cells are not guaranteed to be numeric: coerce anything unparseable to
# NaN and drop it, because norm.fit rejects non-finite or non-numeric data.
possessions = pd.to_numeric(pd.read_html(url)[0]['2025'], errors='coerce').dropna()

# Fit a normal distribution to the '2025' column; displays the fitted (loc, scale)
norm.fit(possessions)