In [1]:
import os
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
BASE_URL = "https://www.sports-reference.com"
MENS_TEAMS = "/cbb/postseason/men/2025-ncaa.html"

In [None]:
# Gather teams.

# Include teams in the first four.
team_to_link = {
    "Alabama State": "/cbb/schools/alabama-state/men/2025.html",
    "Saint Francis (PA)": "/cbb/schools/saint-francis-pa/men/2025.html",
    "San Diego State": "/cbb/schools/san-diego-state/men/2025.html",
    "North Carolina": "/cbb/schools/north-carolina/men/2025.html",
    "American": "/cbb/schools/american/men/2025.html",
    "Mount St. Mary's": "/cbb/schools/mount-st-marys/men/2025.html",
    "Texas": "/cbb/schools/texas/men/2025.html",
    "Xavier": "/cbb/schools/xavier/men/2025.html"
}

response = requests.get(BASE_URL + MENS_TEAMS)
soup = BeautifulSoup(response.text, "html.parser")
brackets = soup.find("div", id="brackets")
team_to_link.update({a.text: a["href"] for a in brackets.find_all("a", href=True) if a.text != "tbd"})

print(f"Number of teams: {len(team_to_link)}")

In [None]:
# Create a data frame for each team.

team_to_df = dict()
for team, link in team_to_link.items():
    link = BASE_URL + link[:-5] + "-gamelogs.html"
    while (response := requests.get(link)).status_code != 200:
        print(f"Received status {response.status_code}, retrying for {team}.")
        time.sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    
    table = soup.find("table")
    rows = [[cell.text.strip() for cell in row.find_all(["th", "td"])] for row in table.find_all("tr")]
    headers = rows[1]
    rows = [row for row in rows if row[0].isnumeric()]
    rows = [[float(cell) if cell.isnumeric() else cell for cell in row] for row in rows]
    
    df = pd.DataFrame(rows, columns=headers)
    team_to_df[team] = df

    print(f"Gathered data for {team}.")

In [None]:
# Write data to CSVs.

os.makedirs("teams", exist_ok=True)
for team, df in team_to_df.items():
    df.to_csv(os.path.join("teams", f"{team}.csv"), index=False)

In [2]:
# Load data from CSVs.

team_to_df = dict()
for file in os.listdir("teams"):
    name = file[:-4]
    file = os.path.join("teams", file)
    team_to_df[name] = pd.read_csv(file)

In [3]:
# Signals

def win_percentage(df):
    counts = df["Rslt"].value_counts()
    wins = counts["W"] if "W" in counts else 0
    losses = counts["L"] if "L" in counts else 0
    total = wins + losses
    return wins / total

def average_score_differential(df):
    team_scores = df["Tm"]
    opponent_scores = df["Opp.1"]
    differentials = team_scores - opponent_scores
    return differentials.mean()

def average_foul_differential(df):
    team_fouls = df["PF"]
    opponent_fouls = df["PF.1"]
    differentials = team_fouls - opponent_fouls
    return -differentials.mean()

def average_rebounds(df):
    rebounds = df["TRB"]
    return rebounds.mean()

def average_assists(df):
    assists = df["AST"]
    return assists.mean()

def average_steals(df):
    steals = df["STL"]
    return steals.mean()

def average_blocks(df):
    blocks = df["BLK"]
    return blocks.mean()

def average_turnovers(df):
    turnovers = df["TOV"]
    return -turnovers.mean()

def average_effective_field_goal_percentage(df):
    effective_field_goal_percentage = df["eFG%"]
    return effective_field_goal_percentage.mean()

def average_free_throw_percentage(df):
    free_throw_percentage = df["FT%"]
    return free_throw_percentage.mean()

def strength_of_schedule(name):
    # Source: https://www.teamrankings.com/ncaa-basketball/ranking/schedule-strength-by-other.
    strengths_of_schedule = {"Auburn": 16.4, "Alabama": 16.0, "Arizona": 14.6, "Tennessee": 14.6, "Houston": 14.3, "Florida": 14.3, "Kentucky": 14.2, "Kansas": 13.7, "Baylor": 13.4, "Illinois": 13.3, "Purdue": 13.1, "Texas A&M": 13.1, "Iowa State": 13.0, "Duke": 12.9, "Ole Miss": 12.8, "Texas Tech": 12.5, "Wisconsin": 12.4, "Michigan State": 12.4, "Michigan": 12.4, "Missouri": 12.0, "Mississippi State": 12.0, "UCLA": 11.8, "North Carolina": 11.7, "Maryland": 11.7, "Marquette": 11.6, "BYU": 11.6, "Oregon": 11.5, "Georgia": 11.5, "Oklahoma": 11.4, "Texas": 11.3, "Arkansas": 11.1, "Creighton": 10.7, "UConn": 10.5, "St. John's (NY)": 10.2, "Louisville": 10.1, "Vanderbilt": 10.0, "Xavier": 9.5, "Clemson": 8.9, "Gonzaga": 8.7, "San Diego State": 7.1, "Saint Mary's": 6.3, "New Mexico": 6.1, "Memphis": 6.1, "Colorado State": 5.8, "Utah State": 5.4, "VCU": 3.5, "Drake": 0.7, "UC-San Diego": 0.2, "Liberty": 0.0, "Yale": -0.1, "McNeese State": -0.9, "Wofford": -1.0, "Troy": -1.0, "Grand Canyon": -1.1, "Lipscomb": -2.3, "Akron": -2.4, "Montana": -2.8, "UNC Wilmington": -2.8, "Omaha": -3.0, "High Point": -3.1, "Bryant": -3.4, "Robert Morris": -4.1, "Norfolk State": -4.6, "Mount St. Mary's": -5.2, "American": -5.4, "Alabama State": -6.9, "SIU-Edwardsville": -7.4, "Saint Francis (PA)": -7.5}
    return strengths_of_schedule[name]

In [4]:
# Compute and normalize the signals.

signals = [win_percentage, average_score_differential, average_foul_differential, average_rebounds, average_assists, average_steals, average_blocks, average_turnovers, average_effective_field_goal_percentage, average_free_throw_percentage]

team_to_signals = {team: [signal(df) for signal in signals] for team, df in team_to_df.items()}
signals_df = pd.DataFrame.from_dict(team_to_signals, orient="index", columns=[signal.__name__ for signal in signals])

# Strength of schedule is a special case.
signals_df["strength_of_schedule"] = signals_df.index.to_series().apply(strength_of_schedule)

scaler = StandardScaler()
signals_normalized = scaler.fit_transform(signals_df)
signals_normalized_df = pd.DataFrame(signals_normalized, columns=signals_df.columns, index=signals_df.index)

signals_normalized_df

Unnamed: 0,win_percentage,average_score_differential,average_foul_differential,average_rebounds,average_assists,average_steals,average_blocks,average_turnovers,average_effective_field_goal_percentage,average_free_throw_percentage,strength_of_schedule
Georgia,-1.065630,-0.676985,1.615586,-0.152777,-1.630458,0.380769,1.284046,-1.378358,-0.485330,-0.490916,0.674704
North Carolina,-1.028192,-0.778152,-0.214998,-0.077277,-0.268715,-1.010150,-0.070319,0.026924,0.312588,-0.298869,0.702628
Bryant,-0.526074,-0.136856,0.501911,1.904110,-0.073671,-0.387801,2.058037,-0.849928,-0.865994,-0.497925,-1.405667
Wisconsin,0.169843,0.051761,0.720100,-0.155937,-0.286493,-1.552155,-1.409909,1.002539,0.096727,2.644924,0.800364
Maryland,0.324135,1.343500,1.130032,0.132144,-0.262788,0.502207,0.693796,0.578329,-0.008659,0.638387,0.702628
...,...,...,...,...,...,...,...,...,...,...,...
Illinois,-0.946507,0.034094,0.953717,2.420451,-0.074226,-2.142119,0.359496,-0.552669,-0.609801,0.674070,0.926024
Troy,-0.311186,-0.147767,0.116222,0.573120,-0.734195,1.674908,0.860947,-1.659604,-1.352861,-0.709046,-1.070574
Oregon,0.006474,-0.911587,0.468852,-0.761726,-0.319357,-0.049652,-0.002663,0.048925,-0.591057,0.881367,0.674704
Kansas,-0.946507,-0.511491,-0.412722,0.537365,1.622840,-0.739477,0.888805,-0.624861,-0.174676,-0.887458,0.981873


In [5]:
# Perform PCA.

pca = PCA()
principal_components = pca.fit_transform(signals_normalized_df)
pca_df = pd.DataFrame(principal_components, index=signals_normalized_df.index, columns=[f"PC{i}" for i in range(principal_components.shape[1])])

pca_df

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Georgia,-1.434328,1.874226,0.099712,0.979803,1.453453,0.953997,-0.211663,1.109877,0.026443,-0.779740,0.251386
North Carolina,-0.775386,0.132797,-1.002444,0.733710,-0.713942,0.011166,-0.675547,0.389778,-0.409002,-0.215015,-0.013106
Bryant,-0.543488,2.835968,0.325958,-0.937655,-0.223934,-1.000554,0.370116,0.088622,0.947575,-0.916817,-0.049325
Wisconsin,0.967954,-1.294538,-1.929000,1.600616,1.197045,-1.696320,-0.048413,-0.373943,-0.300528,0.502285,-0.114742
Maryland,1.473581,0.259821,0.715597,0.880440,1.049703,-0.079945,-0.205063,0.118329,0.387287,-0.497929,0.392974
...,...,...,...,...,...,...,...,...,...,...,...
Illinois,0.412869,2.521347,-1.756777,0.983888,0.000063,-1.535830,-0.511723,-0.644215,-0.681023,-0.493930,0.229020
Troy,-1.522690,1.640051,1.518624,-0.819309,0.751209,0.604004,1.211505,-0.338350,0.196351,-0.423914,0.253327
Oregon,-0.507472,-0.169196,-0.381742,1.203811,0.723836,0.022617,0.200156,0.290266,0.274997,0.772999,-0.341108
Kansas,-0.000051,1.532683,-0.788284,0.410015,-1.893631,0.836685,-0.238059,-0.211174,0.268149,0.331266,0.228512


In [6]:
# Score the teams and display results.

scores = pca_df.values @ pca.explained_variance_ratio_
scores_df = pd.DataFrame({"Score": scores}, index=pca_df.index)
scores_df = scores_df.sort_values(by="Score", ascending=False)

pd.set_option("display.max_rows", 100)
scores_df

Unnamed: 0,Score
St. John's (NY),0.984509
Florida,0.913657
Duke,0.849151
Auburn,0.800485
Michigan State,0.702612
Tennessee,0.694601
Maryland,0.693739
Grand Canyon,0.651123
New Mexico,0.650288
Houston,0.647379
