In [1]:
import os
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
BASE_URL = "https://www.sports-reference.com"
MENS_TEAMS = "/cbb/postseason/men/2025-ncaa.html"

In [None]:
# Gather teams.
response = requests.get(BASE_URL + MENS_TEAMS)
soup = BeautifulSoup(response.text, "html.parser")
brackets = soup.find("div", id="brackets")
team_to_link = {a.text: a["href"] for a in brackets.find_all("a", href=True) if a.text != "tbd"}

print(f"Number of teams: {len(team_to_link)}")

In [None]:
# Create a data frame for each team.

team_to_df = dict()
for team, link in team_to_link.items():
    link = BASE_URL + link[:-5] + "-gamelogs.html"
    while (response := requests.get(link)).status_code != 200:
        print(f"Received status {response.status_code}, retrying for {team}.")
        time.sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    
    table = soup.find("table")
    rows = [[cell.text.strip() for cell in row.find_all(["th", "td"])] for row in table.find_all("tr")]
    headers = rows[1]
    rows = [row for row in rows if row[0].isnumeric()]
    rows = [[float(cell) if cell.isnumeric() else cell for cell in row] for row in rows]
    
    df = pd.DataFrame(rows, columns=headers)
    team_to_df[team] = df

    print(f"Gathered data for {team}.")

In [None]:
# Write data to CSVs.

os.makedirs("teams", exist_ok=True)
for team, df in team_to_df.items():
    df.to_csv(os.path.join("teams", f"{team}.csv"), index=False)

In [2]:
# Load data from CSVs.

team_to_df = dict()
for file in os.listdir("teams"):
    name = file[:-4]
    file = os.path.join("teams", file)
    team_to_df[name] = pd.read_csv(file)

In [3]:
# Signals

def win_percentage(df):
    counts = df["Rslt"].value_counts()
    wins = counts["W"] if "W" in counts else 0
    losses = counts["L"] if "L" in counts else 0
    total = wins + losses
    return wins / total

def average_score_differential(df):
    team_scores = df["Tm"]
    opponent_scores = df["Opp.1"]
    differentials = team_scores - opponent_scores
    return differentials.mean()

def average_foul_differential(df):
    team_fouls = df["PF"]
    opponent_fouls = df["PF.1"]
    differentials = team_fouls - opponent_fouls
    return -differentials.mean()

def average_rebounds(df):
    rebounds = df["TRB"]
    return rebounds.mean()

def average_assists(df):
    assists = df["AST"]
    return assists.mean()

def average_steals(df):
    steals = df["STL"]
    return steals.mean()

def average_blocks(df):
    blocks = df["BLK"]
    return blocks.mean()

def average_turnovers(df):
    turnovers = df["TOV"]
    return -turnovers.mean()

def average_effective_field_goal_percentage(df):
    effective_field_goal_percentage = df["eFG%"]
    return effective_field_goal_percentage.mean()

def average_free_throw_percentage(df):
    free_throw_percentage = df["FT%"]
    return free_throw_percentage.mean()

In [4]:
# Compute and normalize the signals.

signals = [win_percentage, average_score_differential, average_foul_differential, average_rebounds, average_assists, average_steals, average_blocks, average_turnovers, average_effective_field_goal_percentage, average_free_throw_percentage]

team_to_signals = {team: [signal(df) for signal in signals] for team, df in team_to_df.items()}
signals_df = pd.DataFrame.from_dict(team_to_signals, orient="index", columns=[signal.__name__ for signal in signals])

scaler = StandardScaler()
signals_normalized = scaler.fit_transform(signals_df)
signals_normalized_df = pd.DataFrame(signals_normalized, columns=signals_df.columns, index=signals_df.index)

signals_normalized_df

Unnamed: 0,win_percentage,average_score_differential,average_foul_differential,average_rebounds,average_assists,average_steals,average_blocks,average_turnovers,average_effective_field_goal_percentage,average_free_throw_percentage
Georgia,-1.336071,-0.948452,1.602167,-0.241655,-1.796019,0.326947,1.244315,-1.487877,-0.594789,-0.517122
Bryant,-0.749236,-0.346416,0.414577,1.816499,-0.179263,-0.41367,2.03466,-0.926591,-0.984362,-0.524141
Wisconsin,0.007661,-0.136182,0.647248,-0.244817,-0.400284,-1.535675,-1.506559,1.041056,0.000892,2.623272
Maryland,0.175473,1.303606,1.084387,0.043442,-0.375666,0.443968,0.641595,0.59047,-0.106961,0.613821
Ole Miss,-0.861014,-1.161292,-0.936793,-1.626143,-0.669405,1.108703,-0.012686,1.74067,-1.225031,0.175653
Robert Morris,0.256766,-0.905168,1.304201,0.138145,0.504978,-0.650237,0.626534,-1.422756,-0.903239,-0.118679
Baylor,-1.897501,-0.666691,-0.443249,-0.254698,-0.414831,0.266706,-0.922989,0.20707,-0.929071,0.397715
Colorado State,-0.078568,-0.440853,-0.703154,-0.810993,0.885111,-1.575,-1.030084,-0.529659,1.023764,1.660897
Drake,1.902952,0.500892,0.379324,-1.912358,-1.237302,1.285965,-1.349694,0.84607,0.415079,-0.698133
Lipscomb,-0.078568,0.590083,1.760418,-0.162801,-0.08423,-0.241621,-0.753981,1.231725,1.103557,1.716225


In [5]:
# Perform PCA.

pca = PCA()
principal_components = pca.fit_transform(signals_normalized_df)
pca_df = pd.DataFrame(principal_components, index=signals_normalized_df.index, columns=[f"PC{i}" for i in range(principal_components.shape[1])])

pca_df

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
Georgia,-3.005955,-0.352065,0.211426,1.121885,-0.283639,-0.692779,-0.200607,0.917609,1.012311,0.308851
Bryant,-2.321771,2.035981,0.258613,-0.356247,0.624658,0.139745,-0.055218,0.262646,0.620161,-0.047713
Wisconsin,1.144998,-0.844646,-1.927478,1.903444,1.723713,0.63033,-0.010557,-0.47948,-0.523339,-0.126319
Maryland,0.603046,0.643665,0.660406,1.295927,0.356084,0.027585,0.549753,0.742531,0.610848,0.383285
Ole Miss,-0.306828,-2.51384,1.329049,-0.497535,1.285075,0.594967,0.386492,0.872366,-0.619305,-0.352208
Robert Morris,-1.686503,0.997222,-0.754025,0.555251,-0.463383,-0.460978,0.061808,0.444801,-1.086975,0.061554
Baylor,-1.019727,-1.650434,-0.227948,-0.281758,0.996477,0.402168,0.929258,-0.635252,0.182927,0.339597
Colorado State,1.077799,-0.635909,-2.433176,-0.282556,0.702438,-0.755005,-0.560541,-0.464451,-0.719175,0.296966
Drake,1.619965,-1.797731,1.616398,1.407854,-1.675896,-0.006011,-0.645204,0.207799,-0.555251,-0.166066
Lipscomb,1.509428,-0.295344,-1.265343,1.957354,0.422944,0.268296,0.838483,0.605807,0.706642,-0.419678


In [6]:
# Score the teams and display results.

scores = pca_df.values @ pca.explained_variance_ratio_
scores_df = pd.DataFrame({"Score": scores}, index=pca_df.index)
scores_df = scores_df.sort_values(by="Score", ascending=False)

scores_df

Unnamed: 0,Score
Duke,1.461658
UC-San Diego,1.09679
Houston,1.004298
Florida,0.96651
Gonzaga,0.927437
Auburn,0.902006
VCU,0.692862
St. John's (NY),0.674817
Maryland,0.641087
Saint Mary's,0.628022
