In [1]:
import os
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
BASE_URL = "https://www.sports-reference.com"
WOMENS_TEAMS = "/cbb/postseason/women/2025-ncaa.html"

In [None]:
# Gather teams.

# Include teams in the first four.
team_to_link = {
    "Iowa State": "/cbb/schools/iowa-state/women/2025.html",
    "Princeton": "/cbb/schools/princeton/women/2025.html",
    "UC-San Diego": "/cbb/schools/california-san-diego/women/2025.html",
    "Southern": "/cbb/schools/southern/women/2025.html",
    "Columbia": "/cbb/schools/columbia/women/2025.html",
    "Washington": "/cbb/schools/washington/women/2025.html",
    "High Point": "/cbb/schools/high-point/women/2025.html",
    "William & Mary": "/cbb/schools/william-mary/women/2025.html"
}

response = requests.get(BASE_URL + WOMENS_TEAMS)
soup = BeautifulSoup(response.text, "html.parser")
brackets = soup.find("div", id="brackets")
team_to_link.update({a.text: a["href"] for a in brackets.find_all("a", href=True) if a.text != "tbd"})

print(f"Number of teams: {len(team_to_link)}")

In [None]:
# Create a data frame for each team.
team_to_df = dict()
for team, link in team_to_link.items():
    link = BASE_URL + link[:-5] + "-gamelogs.html"
    while (response := requests.get(link)).status_code != 200:
        print(f"Received status {response.status_code}, retrying for {team}.")
        time.sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    
    table = soup.find("table")
    rows = [[cell.text.strip() for cell in row.find_all(["th", "td"])] for row in table.find_all("tr")]
    headers = rows[1]
    rows = [row for row in rows if row[0].isnumeric()]
    rows = [[float(cell) if cell.isnumeric() else cell for cell in row] for row in rows]
    
    df = pd.DataFrame(rows, columns=headers)
    team_to_df[team] = df

    print(f"Gathered data for {team}.")

In [None]:
# Write data to CSVs.

os.makedirs("wteams", exist_ok=True)
for team, df in team_to_df.items():
    df.to_csv(os.path.join("wteams", f"{team}.csv"), index=False)

In [2]:
# Load data from CSVs.

team_to_df = dict()
for file in os.listdir("wteams"):
    name = file[:-4]
    file = os.path.join("wteams", file)
    team_to_df[name] = pd.read_csv(file)

In [3]:
# Signals

def win_percentage(df):
    counts = df["Rslt"].value_counts()
    wins = counts["W"] if "W" in counts else 0
    losses = counts["L"] if "L" in counts else 0
    total = wins + losses
    return wins / total

def average_score_differential(df):
    team_scores = df["Tm"]
    opponent_scores = df["Opp.1"]
    differentials = team_scores - opponent_scores
    return differentials.mean()

def average_foul_differential(df):
    team_fouls = df["PF"]
    opponent_fouls = df["PF.1"]
    differentials = team_fouls - opponent_fouls
    return -differentials.mean()

def average_rebounds(df):
    rebounds = df["TRB"]
    return rebounds.mean()

def average_assists(df):
    assists = df["AST"]
    return assists.mean()

def average_steals(df):
    steals = df["STL"]
    return steals.mean()

def average_blocks(df):
    blocks = df["BLK"]
    return blocks.mean()

def average_turnovers(df):
    turnovers = df["TOV"]
    return -turnovers.mean()

def average_effective_field_goal_percentage(df):
    effective_field_goal_percentage = df["eFG%"]
    return effective_field_goal_percentage.mean()

def average_free_throw_percentage(df):
    free_throw_percentage = df["FT%"]
    return free_throw_percentage.mean()

In [4]:
# Compute and normalize the signals.

signals = [win_percentage, average_score_differential, average_foul_differential, average_rebounds, average_assists, average_steals, average_blocks, average_turnovers, average_effective_field_goal_percentage, average_free_throw_percentage]

team_to_signals = {team: [signal(df) for signal in signals] for team, df in team_to_df.items()}
signals_df = pd.DataFrame.from_dict(team_to_signals, orient="index", columns=[signal.__name__ for signal in signals])

scaler = StandardScaler()
signals_normalized = scaler.fit_transform(signals_df)
signals_normalized_df = pd.DataFrame(signals_normalized, columns=signals_df.columns, index=signals_df.index)

signals_normalized_df

Unnamed: 0,win_percentage,average_score_differential,average_foul_differential,average_rebounds,average_assists,average_steals,average_blocks,average_turnovers,average_effective_field_goal_percentage,average_free_throw_percentage
UNC,0.255292,-0.135949,0.427849,0.341507,-0.672660,-0.443195,0.225618,0.608518,-0.705297,-1.588252
Richmond,0.484378,0.265698,2.185686,-1.303887,0.681623,-0.776580,0.147861,0.000367,2.234600,0.611066
Murray State,0.132795,0.368707,1.109193,0.520810,1.055628,-0.331533,-1.311162,0.498727,0.811400,1.807588
Maryland,-0.006035,-0.503957,1.091563,0.843118,-0.495334,-0.950966,-0.031810,-1.278506,0.290098,0.830824
William & Mary,-2.977359,-3.077670,-1.985658,-0.740880,-0.748201,-0.513617,-0.570822,-1.314363,-2.241449,-0.861665
...,...,...,...,...,...,...,...,...,...,...
Iowa,-0.759684,-0.758993,0.633185,0.096525,0.864488,-0.745440,-0.331796,-0.909913,0.590326,-0.870424
Vermont,-1.246491,-0.801668,-0.806057,-2.180878,-1.145374,-0.901142,-0.108811,1.280949,0.279379,-0.928468
Lehigh,0.484378,-0.220895,1.091563,-2.018472,-0.483419,0.482874,-1.084167,-0.153302,0.335844,1.777426
West Virginia,0.065619,1.180150,0.218029,-0.867661,-0.998462,2.175391,-1.334216,-0.806537,0.083897,0.790172


In [5]:
# Perform PCA.

pca = PCA()
principal_components = pca.fit_transform(signals_normalized_df)
pca_df = pd.DataFrame(principal_components, index=signals_normalized_df.index, columns=[f"PC{i}" for i in range(principal_components.shape[1])])

pca_df

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
UNC,-0.463185,0.577447,0.221856,1.428915,0.781412,-0.940001,-0.540897,0.170108,0.135647,0.017322
Richmond,1.644257,-2.522767,-0.064722,-0.414561,-0.676724,-0.571983,-1.651339,-0.660351,-0.081870,-0.200409
Murray State,1.189592,-1.972820,0.035213,-0.260118,-0.913696,-0.330928,1.489557,-0.260450,-0.397768,-0.088188
Maryland,0.053869,-0.919748,-1.417865,0.829305,-1.262209,0.234575,-0.140438,0.669861,-0.211784,-0.087536
William & Mary,-5.041196,0.861296,-1.881709,-0.648650,0.692288,0.585209,0.372567,-0.276309,0.560725,0.060434
...,...,...,...,...,...,...,...,...,...,...
Iowa,-0.309046,-0.456973,-1.666481,-0.285073,-0.061122,-1.143829,-0.429629,-0.470812,0.141990,-0.096708
Vermont,-2.144753,-0.910308,0.874901,-0.640495,1.836090,-0.063235,-1.409659,-0.179316,-0.534851,0.184885
Lehigh,-0.399011,-1.915886,1.487180,-0.854055,-1.783721,0.440301,-0.435702,-0.121252,0.367684,-0.027852
West Virginia,-0.190762,0.633149,1.731401,-1.094294,-2.348398,0.069344,-0.007728,-0.111558,-0.820572,0.388084


In [6]:
# Score the teams and display results.

scores = pca_df.values @ pca.explained_variance_ratio_
scores_df = pd.DataFrame({"Score": scores}, index=pca_df.index)
scores_df = scores_df.sort_values(by="Score", ascending=False)

pd.set_option("display.max_rows", 100)
scores_df

Unnamed: 0,Score
USC,1.277691
South Carolina,1.263167
UConn,1.243194
Texas,1.041681
Notre Dame,1.021787
UCLA,1.005851
TCU,0.952601
Norfolk State,0.85323
LSU,0.765334
Kansas State,0.763234
