In [1]:
import os
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Root of the site being scraped; site-relative links are appended to this.
BASE_URL = "https://www.sports-reference.com"
# Site-relative path of the page whose "brackets" div lists the tournament teams.
MENS_TEAMS = "/cbb/postseason/men/2025-ncaa.html"

In [None]:
# Gather teams.

# Include teams in the first four.
team_to_link = {
    "Alabama State": "/cbb/schools/alabama-state/men/2025.html",
    "Saint Francis (PA)": "/cbb/schools/saint-francis-pa/men/2025.html",
    "San Diego State": "/cbb/schools/san-diego-state/men/2025.html",
    "North Carolina": "/cbb/schools/north-carolina/men/2025.html",
    "American": "/cbb/schools/american/men/2025.html",
    "Mount St. Mary's": "/cbb/schools/mount-st-marys/men/2025.html",
    "Texas": "/cbb/schools/texas/men/2025.html",
    "Xavier": "/cbb/schools/xavier/men/2025.html"
}

# Fetch the bracket page. A timeout keeps a stalled connection from hanging the
# cell, and raise_for_status() stops us from parsing an error page as if it
# were the bracket.
bracket_url = BASE_URL + MENS_TEAMS
response = requests.get(bracket_url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
brackets = soup.find("div", id="brackets")
if brackets is None:
    # Without this check the failure would be an AttributeError on None below.
    raise RuntimeError(f"No div with id 'brackets' found at {bracket_url}.")

# Map link text -> href for every anchor in the bracket, skipping anchors whose
# text is "tbd" (presumably bracket slots that are not yet decided).
team_to_link.update({a.text: a["href"] for a in brackets.find_all("a", href=True) if a.text != "tbd"})

print(f"Number of teams: {len(team_to_link)}")

In [None]:
# Create a data frame for each team.

MAX_ATTEMPTS = 5            # give up on a team after this many non-200 responses
INITIAL_RETRY_DELAY = 1     # seconds; doubled after every failed attempt
REQUEST_TIMEOUT = 30        # seconds before a stalled request is abandoned

team_to_df = dict()
for team, link in team_to_link.items():
    # Swap the trailing ".html" of the team link for "-gamelogs.html".
    gamelog_url = BASE_URL + link[:-5] + "-gamelogs.html"

    # Bounded retry with exponential backoff. Retrying forever at a fixed one
    # second interval never terminates on a permanent error (e.g. 404) and keeps
    # hammering the server when it is asking us to slow down (e.g. 429).
    retry_delay = INITIAL_RETRY_DELAY
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response = requests.get(gamelog_url, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            break
        if attempt < MAX_ATTEMPTS:
            print(f"Received status {response.status_code}, retrying for {team}.")
            time.sleep(retry_delay)
            retry_delay *= 2
    else:
        raise RuntimeError(
            f"Giving up on {team}: {gamelog_url} returned status "
            f"{response.status_code} after {MAX_ATTEMPTS} attempts."
        )
    soup = BeautifulSoup(response.text, "html.parser")

    # Only the first table on the page is used.
    table = soup.find("table")
    if table is None:
        raise RuntimeError(f"No table found at {gamelog_url} for {team}.")

    # Flatten every table row into a list of stripped cell texts.
    rows = [[cell.text.strip() for cell in row.find_all(["th", "td"])] for row in table.find_all("tr")]
    if len(rows) < 2:
        raise RuntimeError(f"Table at {gamelog_url} for {team} has no header row.")

    # The second row supplies the column names (the first is presumably a
    # grouping header -- confirm against the page layout).
    headers = rows[1]
    # Keep only rows whose first cell is all digits; this drops header rows.
    # The `row and` guard skips <tr> elements that contain no cells at all.
    rows = [row for row in rows if row and row[0].isnumeric()]
    # str.isnumeric() is true only for digit-only strings, so whole numbers
    # become floats here while decimals, signed values and text stay strings.
    rows = [[float(cell) if cell.isnumeric() else cell for cell in row] for row in rows]

    df = pd.DataFrame(rows, columns=headers)
    team_to_df[team] = df

    print(f"Gathered data for {team}.")

In [None]:
# Write data to CSVs.

# One file per team, named after the team, inside the "teams" directory
# (created on first use; an existing directory is reused).
os.makedirs("teams", exist_ok=True)
for team, df in team_to_df.items():
    csv_path = os.path.join("teams", f"{team}.csv")
    df.to_csv(csv_path, index=False)

In [2]:
# Load data from CSVs.

team_to_df = dict()
# sorted() makes the team order independent of the filesystem's listing order.
for file in sorted(os.listdir("teams")):
    # Derive the team name from the file name and skip anything that is not a
    # CSV (hidden files, checkpoint directories, ...), which read_csv would
    # otherwise choke on or load under a truncated name.
    name, extension = os.path.splitext(file)
    if extension.lower() != ".csv":
        continue
    team_to_df[name] = pd.read_csv(os.path.join("teams", file))

In [12]:
# Signals

def win_percentage(df):
    """Return the fraction of "W" entries among the "W" and "L" entries of df["Rslt"].

    Any other value in the column is ignored. Raises ZeroDivisionError when the
    column contains neither "W" nor "L".
    """
    tally = df["Rslt"].value_counts()
    wins = tally.get("W", 0)
    losses = tally.get("L", 0)
    return wins / (wins + losses)

def average_score_differential(df):
    """Return the mean of df["Tm"] - df["Opp.1"] across all rows.

    "Opp.1" is presumably the opponent-score column, renamed by pandas when the
    CSV was read because an earlier column is also called "Opp" -- confirm
    against the CSV header.
    """
    margin = df["Tm"].sub(df["Opp.1"])
    return margin.mean()

def average_foul_differential(df):
    """Return the negated mean of df["PF"] - df["PF.1"].

    The sign is flipped, presumably so that a larger value is better (fewer
    fouls than the opponent), in line with the other signals.
    """
    foul_gap = df["PF"].sub(df["PF.1"])
    return -foul_gap.mean()

def average_rebounds(df):
    """Return the mean of the "TRB" column."""
    return df["TRB"].mean()

def average_assists(df):
    """Return the mean of the "AST" column."""
    return df["AST"].mean()

def average_steals(df):
    """Return the mean of the "STL" column."""
    return df["STL"].mean()

def average_blocks(df):
    """Return the mean of the "BLK" column."""
    return df["BLK"].mean()

def average_turnovers(df):
    """Return the negated mean of the "TOV" column.

    The sign is flipped, presumably so that fewer turnovers yields a larger
    (better) signal value.
    """
    return -df["TOV"].mean()

def average_effective_field_goal_percentage(df):
    """Return the mean of the "eFG%" column."""
    return df["eFG%"].mean()

def average_free_throw_percentage(df):
    """Return the mean of the "FT%" column."""
    return df["FT%"].mean()

In [13]:
# Compute and normalize the signals.

# Signal functions, in the column order of the resulting frame.
signals = [
    win_percentage,
    average_score_differential,
    average_foul_differential,
    average_rebounds,
    average_assists,
    average_steals,
    average_blocks,
    average_turnovers,
    average_effective_field_goal_percentage,
    average_free_throw_percentage,
]
signal_names = [signal.__name__ for signal in signals]

# One row per team, one column per signal.
team_to_signals = {
    team: [signal(df) for signal in signals]
    for team, df in team_to_df.items()
}
signals_df = pd.DataFrame.from_dict(team_to_signals, orient="index", columns=signal_names)

# Standardize each signal column across teams.
scaler = StandardScaler()
signals_normalized = scaler.fit_transform(signals_df)
signals_normalized_df = pd.DataFrame(
    signals_normalized,
    index=signals_df.index,
    columns=signals_df.columns,
)

signals_normalized_df

Unnamed: 0,win_percentage,average_score_differential,average_foul_differential,average_rebounds,average_assists,average_steals,average_blocks,average_turnovers,average_effective_field_goal_percentage,average_free_throw_percentage
Georgia,-1.06563,-0.676985,1.615586,-0.152777,-1.630458,0.380769,1.284046,-1.378358,-0.48533,-0.490916
North Carolina,-1.028192,-0.778152,-0.214998,-0.077277,-0.268715,-1.01015,-0.070319,0.026924,0.312588,-0.298869
Bryant,-0.526074,-0.136856,0.501911,1.90411,-0.073671,-0.387801,2.058037,-0.849928,-0.865994,-0.497925
Wisconsin,0.169843,0.051761,0.7201,-0.155937,-0.286493,-1.552155,-1.409909,1.002539,0.096727,2.644924
Maryland,0.324135,1.3435,1.130032,0.132144,-0.262788,0.502207,0.693796,0.578329,-0.008659,0.638387
Ole Miss,-0.628847,-0.86794,-0.765352,-1.536413,-0.545633,1.192031,0.053054,1.6612,-1.101158,0.200854
Robert Morris,0.398879,-0.638152,1.336165,0.226789,0.58519,-0.633298,0.679048,-1.317049,-0.786726,-0.093051
Baylor,-1.581829,-0.424197,-0.302526,-0.165812,-0.300501,0.318254,-0.838415,0.217372,-0.811967,0.422594
Texas,-1.759345,-0.687576,-0.631819,-0.270623,-1.318186,-0.700251,0.516814,0.901775,-0.543725,0.513849
Colorado State,0.090561,-0.221582,-0.546255,-0.721764,0.951224,-1.592965,-0.943293,-0.476231,1.096205,1.683945


In [14]:
# Perform PCA.

# Keep every component (no n_components given) and project the normalized signals.
pca = PCA()
principal_components = pca.fit_transform(signals_normalized_df)

# Components are labelled PC0, PC1, ... in the order PCA returns them.
component_names = [f"PC{i}" for i in range(principal_components.shape[1])]
pca_df = pd.DataFrame(
    principal_components,
    columns=component_names,
    index=signals_normalized_df.index,
)

pca_df

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
Georgia,-1.824637,1.58748,0.112131,1.562625,-0.502547,0.220718,-0.042238,1.303657,0.82828,0.25203
North Carolina,-1.005884,-0.084137,-0.984551,-0.592766,0.281185,-0.53753,0.041135,0.369419,0.392882,-0.00965
Bryant,-0.424207,3.007129,0.288993,-0.309765,0.684522,0.120124,-0.182316,0.337831,0.498427,-0.059795
Wisconsin,0.843088,-1.368733,-1.908805,1.512383,1.990551,-0.345374,0.137712,-0.53804,-0.382231,-0.111201
Maryland,1.277916,0.284627,0.718878,1.200906,0.462113,0.023846,-0.545747,0.495307,0.406866,0.39156
Ole Miss,-1.459728,-1.787528,1.423541,0.026753,0.777682,0.266974,-0.972291,0.544323,-0.653057,-0.36278
Robert Morris,-0.230973,1.822781,-0.815705,0.672564,-0.446632,0.155844,-0.061747,0.307306,-1.074297,0.014677
Baylor,-1.443672,-0.702873,-0.220828,0.132863,0.595905,0.315455,-0.852269,-0.726534,0.348619,0.325257
Texas,-1.758658,-0.514004,-0.124853,-0.188036,1.828869,-0.252047,-0.220376,0.69963,0.719758,0.189768
Colorado State,0.776531,-1.159215,-2.408614,-0.390108,0.484177,0.624257,0.762822,-0.086242,-0.555015,0.22594


In [15]:
# Score the teams and display results.

# Each team's score is the sum of its principal-component coordinates, each
# weighted by that component's explained-variance ratio.
# NOTE(review): the sign of a principal component is not tied to "better" or
# "worse", so a component may add to or subtract from the score arbitrarily --
# confirm the ranking is meaningful before relying on it.
component_weights = pca.explained_variance_ratio_
scores = pca_df.values @ component_weights

scores_df = pd.DataFrame({"Score": scores}, index=pca_df.index)
scores_df = scores_df.sort_values(by="Score", ascending=False)

# Raise the row display limit so the full ranking is shown.
pd.set_option("display.max_rows", 100)
scores_df

Unnamed: 0,Score
Duke,1.10406
Florida,1.07391
St. John's (NY),0.944836
VCU,0.943599
Auburn,0.836444
Houston,0.801315
Grand Canyon,0.726095
Michigan State,0.713679
Maryland,0.677019
New Mexico,0.602214
