In [18]:
from entities import *
from constants import TEAMS_DATA

import time
import requests
from bs4 import BeautifulSoup, Comment

from tqdm import tqdm, trange

import os
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [19]:
# TODO: add support for NCAA women's teams

In [20]:
# Site root; relative hrefs scraped from the school index are appended to it.
URL = "https://www.sports-reference.com"
# NOTE(review): presumably the pause, in seconds, between requests to the
# site — TODO confirm the unit and where it is meant to apply.
WAIT = 10
# Positions, within the list of tables pd.read_html returns for a school page,
# of the tables wrapped in BasketballData as player descriptions and player
# stats respectively.
DESCRIPTION_LOC = 0
PLAYER_STATS_LOC = 6

In [21]:
def fetch_cbb_page(url, delay=3, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Sleeps for `delay` seconds, then issues a GET with a browser-like
    User-Agent header. One status line is printed for every outcome.

    Args:
        url: Address of the page to download.
        delay: Seconds to sleep before sending the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The page parsed with 'html.parser' on HTTP 200; None on any other
        status code or when the request itself fails.
    """
    time.sleep(delay)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems, timeouts, invalid URLs, ... are reported, not raised.
        print(f"✗ Error: {e}")
        return None

    if response.status_code == 200:
        print(f"✓ Successfully fetched: {url}")
        return BeautifulSoup(response.content, 'html.parser')

    # Status codes with a dedicated hint; everything else gets the generic line.
    failure_messages = {
        429: "✗ Rate limited! Wait 60 seconds and try again.",
        403: "✗ Access forbidden. Your IP may be blocked.",
    }
    print(failure_messages.get(
        response.status_code,
        f"✗ Failed with status code: {response.status_code}",
    ))
    return None

In [22]:
def fetch_schedule(link):
    """Read a game-log table and split it into three aligned frames.

    The first HTML table found at `link` is loaded and its last row is
    discarded. The table is expected to have two-level column headers with
    'Team' and 'Opponent' groups; every column whose top-level label contains
    'Unnamed' or 'Score' is treated as a game-detail column.

    Args:
        link: Anything pd.read_html accepts (here, a game-log page URL).

    Returns:
        A tuple (game_details, team_stats, opp_stats):
        game_details holds the columns Date, Opp_Name, Type, Rslt, Tm and
        Opp_Score; team_stats and opp_stats are the 'Team' and 'Opponent'
        column groups, each joined with game_details.
    """
    game_log = pd.read_html(link)[0].iloc[:-1]
    team_stats = game_log['Team'].copy()
    opp_stats = game_log['Opponent'].copy()

    top_labels = game_log.columns.get_level_values(0)
    game_details = game_log.loc[:, top_labels.str.contains('Unnamed|Score')].copy()

    # Flatten to the second header level, then relabel positions 4 and 8.
    # NOTE(review): presumably these are the opponent-name and opponent-score
    # columns, which share a label on the page — confirm against the table.
    flat_labels = list(game_details.columns.get_level_values(1))
    flat_labels[4] = 'Opp_Name'
    flat_labels[8] = 'Opp_Score'
    game_details.columns = flat_labels

    wanted = ['Date', 'Opp_Name', 'Type', 'Rslt', 'Tm', 'Opp_Score']
    game_details = game_details[wanted]

    return (
        game_details,
        team_stats.join(game_details),
        opp_stats.join(game_details),
    )


In [23]:
# Download the 2026 men's school-stats index page.
soup = fetch_cbb_page(f"{URL}/cbb/seasons/men/2026-school-stats.html")

# fetch_cbb_page returns None on any failure. Stop here with a clear message
# instead of failing later with an AttributeError on `soup.find_all`.
if soup is None:
    raise RuntimeError("Could not download the school stats page; see the message printed above.")

✓ Successfully fetched: https://www.sports-reference.com/cbb/seasons/men/2026-school-stats.html


In [24]:
# Each <td data-stat="school_name"> cell is one school; a cell without an <a>
# tag contributes nothing. Links on the page are relative, so URL is prepended.
school_tds = soup.find_all('td', {'data-stat': 'school_name'})

anchors = (cell.find('a') for cell in tqdm(school_tds))
schools = [
    {'School': anchor.text, 'Link': f"{URL}{anchor['href']}"}
    for anchor in anchors
    if anchor
]

print(f"Found {len(schools)} schools")

100%|██████████| 365/365 [00:00<00:00, 56034.59it/s]

Found 365 schools





In [25]:
# Load the advanced school-stats table and keep only school name and pace.
url = 'https://www.sports-reference.com/cbb/seasons/men/2026-advanced-school-stats.html'
raw_data = pd.read_html(url)[0]

school_col = raw_data[('Unnamed: 1_level_0', 'School')]
pace_col = raw_data[('School Advanced', 'Pace')]

pos_df = (
    pd.DataFrame({'School': school_col, 'Pace': pace_col})
    # Rows whose School cell is the literal 'School' are removed
    # (presumably header rows repeated inside the table body).
    .loc[lambda frame: frame['School'] != 'School']
    .dropna()
    .reset_index(drop=True)
)
pos_df['Pace'] = pd.to_numeric(pos_df['Pace'])

In [26]:
# One row per school: team page link, game-log link and pace.
df = pd.DataFrame(schools)
df['GameLogs'] = df['Link'].str.replace('2026.html', '2026-gamelogs.html', regex=False)

# Attach pace by school name rather than by row position. pos_df was filtered
# with dropna(), so a single dropped school would shift every later pace onto
# the wrong team if the two tables were lined up by index.
pace_by_school = pos_df.drop_duplicates(subset='School').set_index('School')['Pace']
df['Pace'] = df['School'].map(pace_by_school)

# Surface schools that found no match instead of carrying silent NaNs.
missing_pace = df.loc[df['Pace'].isna(), 'School'].tolist()
if missing_pace:
    print(f"No pace found for {len(missing_pace)} schools: {missing_pace}")

# to_csv does not create missing directories.
os.makedirs('data', exist_ok=True)
df.to_csv('data/Team_Links.csv')
df

Unnamed: 0,School,Link,GameLogs,Pace
0,Abilene Christian,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...,68.8
1,Air Force,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...,67.3
2,Akron,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...,74.4
3,Alabama,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...,76.1
4,Alabama A&M,https://www.sports-reference.com/cbb/schools/a...,https://www.sports-reference.com/cbb/schools/a...,68.3
...,...,...,...,...
360,Wright State,https://www.sports-reference.com/cbb/schools/w...,https://www.sports-reference.com/cbb/schools/w...,68.8
361,Wyoming,https://www.sports-reference.com/cbb/schools/w...,https://www.sports-reference.com/cbb/schools/w...,72.1
362,Xavier,https://www.sports-reference.com/cbb/schools/x...,https://www.sports-reference.com/cbb/schools/x...,71.7
363,Yale,https://www.sports-reference.com/cbb/schools/y...,https://www.sports-reference.com/cbb/schools/y...,68.1


In [27]:
# First table of the 2026 men's school-stats page, loaded as a DataFrame.
school_stats_url = f"{URL}/cbb/seasons/men/2026-school-stats.html"
school_stats = pd.read_html(school_stats_url)[0]

In [52]:
BATCH_LIMIT = 5        # maximum number of teams scraped and saved per run of this cell
processed_count = 0    # teams scraped and saved so far in this run
attempt_count = 0      # scrape attempts so far in this run (successful or not)
end = f'{" "*100}\r'   # pads the status line, then returns the cursor so the next print overwrites it


def _save_teams(teams_by_school, path):
    """Pickle `teams_by_school` to `path` via a temporary file.

    The data is written next to `path` and then swapped in with os.replace,
    so an interrupted dump never truncates the previously saved database.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(teams_by_school, f)
    os.replace(tmp_path, path)


# Load existing data.
# NOTE: pickle.load can execute arbitrary code; only load a TEAMS_DATA file
# that this notebook wrote itself.
if os.path.exists(TEAMS_DATA) and os.path.getsize(TEAMS_DATA) > 0:
    with open(TEAMS_DATA, 'rb') as f:
        teams = pickle.load(f)
    print(f"Loaded {len(teams)} existing teams.")
else:
    teams = {}
    print("No valid existing data found. Starting fresh.")

team_list = df['School']
link_list = df['Link']
sched_list = df['GameLogs']
possession_list = df['Pace']

# Iterate through the data
for school, link, sched, pos in zip(team_list, link_list, sched_list, possession_list):

    # Exit if we have reached our batch limit for this run
    if processed_count >= BATCH_LIMIT:
        print(f"\nReached batch limit of {BATCH_LIMIT}. Stopping.")
        break

    # Skip if we already have this school
    if school in teams:
        continue

    # Pause between scrape attempts. Failures do not count toward BATCH_LIMIT,
    # so without a pause a run of errors (e.g. rate limiting) would fire
    # requests for every remaining school back-to-back.
    if attempt_count:
        time.sleep(WAIT)
    attempt_count += 1

    print(f'Gathering Data: {school:50}', end=end)

    try:
        # Scrape and process
        tables = pd.read_html(link)
        player_descriptions = BasketballData(tables[DESCRIPTION_LOC])
        player_stats = BasketballData(tables[PLAYER_STATS_LOC])

        # First row matching the school name, restricted to columns 2..7 and
        # then to the 'Overall' column group.
        is_school = school_stats['Unnamed: 1_level_0', 'School'] == school
        team_record = school_stats.loc[is_school, school_stats.columns[2:8]]['Overall'].iloc[0]

        game_details, team_game_stats, opp_game_stats = fetch_schedule(sched)

        player_context = PlayerContext(
            school=school,
            description_df=player_descriptions,
            performance_df=player_stats,
            record=team_record,
            schedule=game_details,
            schedule_team=team_game_stats,
            schedule_opp=opp_game_stats,
            pace=pos
        )

        teams[school] = Team(player_context)

        # Save after every successful scrape to prevent data loss
        _save_teams(teams, TEAMS_DATA)

        # Increment the counter only after a successful save
        processed_count += 1

    except Exception as e:
        # Best effort: report the failure and move on to the next school.
        print(f"Error processing {school}: {e}")

print(f"\nProcessing complete. Total teams now in database: {len(teams)}")

Loaded 350 existing teams.
Gathering Data: Western Michigan                                                                                                                                      
Reached batch limit of 5. Stopping.

Processing complete. Total teams now in database: 355


In [53]:
# Re-read the saved pickle and show its last five keys as a sanity check.
with open(TEAMS_DATA, 'rb') as saved:
    test = pickle.load(saved)

list(test)[-5:]

['West Virginia',
 'Western Carolina',
 'Western Illinois',
 'Western Kentucky',
 'Western Michigan']

## Overall Distribution of Possessions

In [51]:
import pandas as pd
from scipy.stats import norm

# Fit a normal distribution to the per-school pace values; the result is the
# (mean, standard deviation) pair. Missing paces are dropped first because
# norm.fit raises on non-finite data.
pace_mean, pace_std = norm.fit(df['Pace'].dropna())
(pace_mean, pace_std)

(np.float64(70.28356164383561), np.float64(2.9582489127457956))