In [1]:
import os
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
BASE_URL = "https://www.sports-reference.com"
MENS_TEAMS = "/cbb/postseason/men/2025-ncaa.html"

In [None]:
# Gather teams.

# Include teams in the first four.
team_to_link = {
    "Alabama State": "/cbb/schools/alabama-state/men/2025.html",
    "Saint Francis (PA)": "/cbb/schools/saint-francis-pa/men/2025.html",
    "San Diego State": "/cbb/schools/san-diego-state/men/2025.html",
    "North Carolina": "/cbb/schools/north-carolina/men/2025.html",
    "American": "/cbb/schools/american/men/2025.html",
    "Mount St. Mary's": "/cbb/schools/mount-st-marys/men/2025.html",
    "Texas": "/cbb/schools/texas/men/2025.html",
    "Xavier": "/cbb/schools/xavier/men/2025.html"
}

response = requests.get(BASE_URL + MENS_TEAMS)
soup = BeautifulSoup(response.text, "html.parser")
brackets = soup.find("div", id="brackets")
team_to_link.update({a.text: a["href"] for a in brackets.find_all("a", href=True) if a.text != "tbd"})

print(f"Number of teams: {len(team_to_link)}")

In [None]:
# Create a data frame for each team.

team_to_df = dict()
for team, link in team_to_link.items():
    link = BASE_URL + link[:-5] + "-gamelogs.html"
    while (response := requests.get(link)).status_code != 200:
        print(f"Received status {response.status_code}, retrying for {team}.")
        time.sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    
    table = soup.find("table")
    rows = [[cell.text.strip() for cell in row.find_all(["th", "td"])] for row in table.find_all("tr")]
    headers = rows[1]
    rows = [row for row in rows if row[0].isnumeric()]
    rows = [[float(cell) if cell.isnumeric() else cell for cell in row] for row in rows]
    
    df = pd.DataFrame(rows, columns=headers)
    team_to_df[team] = df

    print(f"Gathered data for {team}.")

In [None]:
# Write data to CSVs.

os.makedirs("teams", exist_ok=True)
for team, df in team_to_df.items():
    df.to_csv(os.path.join("teams", f"{team}.csv"), index=False)

In [2]:
# Load data from CSVs.

team_to_df = dict()
for file in os.listdir("teams"):
    name = file[:-4]
    file = os.path.join("teams", file)
    team_to_df[name] = pd.read_csv(file)

In [3]:
# Signals

def win_percentage(df):
    counts = df["Rslt"].value_counts()
    wins = counts["W"] if "W" in counts else 0
    losses = counts["L"] if "L" in counts else 0
    total = wins + losses
    return wins / total

def average_score_differential(df):
    team_scores = df["Tm"]
    opponent_scores = df["Opp.1"]
    differentials = team_scores - opponent_scores
    return differentials.mean()

def average_foul_differential(df):
    team_fouls = df["PF"]
    opponent_fouls = df["PF.1"]
    differentials = team_fouls - opponent_fouls
    return -differentials.mean()

def average_rebounds(df):
    rebounds = df["TRB"]
    return rebounds.mean()

def average_assists(df):
    assists = df["AST"]
    return assists.mean()

def average_steals(df):
    steals = df["STL"]
    return steals.mean()

def average_blocks(df):
    blocks = df["BLK"]
    return blocks.mean()

def average_turnovers(df):
    turnovers = df["TOV"]
    return -turnovers.mean()

def average_effective_field_goal_percentage(df):
    effective_field_goal_percentage = df["eFG%"]
    return effective_field_goal_percentage.mean()

def average_free_throw_percentage(df):
    free_throw_percentage = df["FT%"]
    return free_throw_percentage.mean()

In [4]:
# Compute and normalize the signals.

signals = [win_percentage, average_score_differential, average_foul_differential, average_rebounds, average_assists, average_steals, average_blocks, average_turnovers, average_effective_field_goal_percentage, average_free_throw_percentage]

team_to_signals = {team: [signal(df) for signal in signals] for team, df in team_to_df.items()}
signals_df = pd.DataFrame.from_dict(team_to_signals, orient="index", columns=[signal.__name__ for signal in signals])

scaler = StandardScaler()
signals_normalized = scaler.fit_transform(signals_df)
signals_normalized_df = pd.DataFrame(signals_normalized, columns=signals_df.columns, index=signals_df.index)

signals_normalized_df

Unnamed: 0,win_percentage,average_score_differential,average_foul_differential,average_rebounds,average_assists,average_steals,average_blocks,average_turnovers,average_effective_field_goal_percentage,average_free_throw_percentage
Georgia,-1.065630,-0.676985,1.615586,-0.152777,-1.630458,0.380769,1.284046,-1.378358,-0.485330,-0.490916
North Carolina,-1.028192,-0.778152,-0.214998,-0.077277,-0.268715,-1.010150,-0.070319,0.026924,0.312588,-0.298869
Bryant,-0.526074,-0.136856,0.501911,1.904110,-0.073671,-0.387801,2.058037,-0.849928,-0.865994,-0.497925
Wisconsin,0.169843,0.051761,0.720100,-0.155937,-0.286493,-1.552155,-1.409909,1.002539,0.096727,2.644924
Maryland,0.324135,1.343500,1.130032,0.132144,-0.262788,0.502207,0.693796,0.578329,-0.008659,0.638387
...,...,...,...,...,...,...,...,...,...,...
Illinois,-0.946507,0.034094,0.953717,2.420451,-0.074226,-2.142119,0.359496,-0.552669,-0.609801,0.674070
Troy,-0.311186,-0.147767,0.116222,0.573120,-0.734195,1.674908,0.860947,-1.659604,-1.352861,-0.709046
Oregon,0.006474,-0.911587,0.468852,-0.761726,-0.319357,-0.049652,-0.002663,0.048925,-0.591057,0.881367
Kansas,-0.946507,-0.511491,-0.412722,0.537365,1.622840,-0.739477,0.888805,-0.624861,-0.174676,-0.887458


In [5]:
# Perform PCA.

pca = PCA()
principal_components = pca.fit_transform(signals_normalized_df)
pca_df = pd.DataFrame(principal_components, index=signals_normalized_df.index, columns=[f"PC{i}" for i in range(principal_components.shape[1])])

pca_df

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
Georgia,-1.824637,1.587480,0.112131,1.562625,-0.502547,0.220718,-0.042238,1.303657,0.828280,0.252030
North Carolina,-1.005884,-0.084137,-0.984551,-0.592766,0.281185,-0.537530,0.041135,0.369419,0.392882,-0.009650
Bryant,-0.424207,3.007129,0.288993,-0.309765,0.684522,0.120124,-0.182316,0.337831,0.498427,-0.059795
Wisconsin,0.843088,-1.368733,-1.908805,1.512383,1.990551,-0.345374,0.137712,-0.538040,-0.382231,-0.111201
Maryland,1.277916,0.284627,0.718878,1.200906,0.462113,0.023846,-0.545747,0.495307,0.406866,0.391560
...,...,...,...,...,...,...,...,...,...,...
Illinois,-0.038154,2.400758,-1.758989,0.227747,1.708585,-0.760725,-0.069218,-0.761827,0.687518,0.231245
Troy,-1.399241,1.667452,1.499924,0.593984,-0.792757,1.177748,0.053132,-0.368436,0.276973,0.248601
Oregon,-0.717067,-0.367815,-0.361542,0.917178,0.480806,0.367489,-0.094688,0.405910,-0.767221,-0.337802
Kansas,-0.389622,1.366898,-0.781532,-1.820460,-0.242232,0.209493,-0.746000,0.196424,-0.315027,0.231442


In [6]:
# Score the teams and display results.

scores = pca_df.values @ pca.explained_variance_ratio_
scores_df = pd.DataFrame({"Score": scores}, index=pca_df.index)
scores_df = scores_df.sort_values(by="Score", ascending=False)

scores_df.head(50)

Unnamed: 0,Score
Duke,1.10406
Florida,1.07391
St. John's (NY),0.944836
VCU,0.943599
Auburn,0.836444
Houston,0.801315
Grand Canyon,0.726095
Michigan State,0.713679
Maryland,0.677019
New Mexico,0.602214
