In [1]:
import pandas as pd

In [67]:
# Read the compact results CSV, then display only the rows whose Season is 2024.
compact_results = pd.read_csv('data/men/MRegularSeasonCompactResults.csv')
compact_results.loc[compact_results['Season'] == 2024]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
181682,2024,0,1101,64,1329,59,A,0
181683,2024,0,1103,81,1355,75,A,0
181684,2024,0,1104,105,1287,73,H,0
181685,2024,0,1112,122,1288,59,H,0
181686,2024,0,1114,71,1402,66,H,0
...,...,...,...,...,...,...,...,...
187284,2024,132,1120,86,1196,67,N,0
187285,2024,132,1182,57,1433,51,N,0
187286,2024,132,1228,93,1458,87,N,0
187287,2024,132,1412,85,1396,69,N,0


In [3]:
# Display the column labels of the loaded results frame.
compact_results.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')

In [68]:
# View of every game from the winning team's side. The original W*/L* columns
# are kept on this intermediate frame; the new columns are appended after them.
winner_rows = compact_results.assign(
    TeamID=compact_results['WTeamID'],
    OppID=compact_results['LTeamID'],
    TeamScore=compact_results['WScore'],
    OppScore=compact_results['LScore'],
    Win=1,  # this row's team won
    TeamLoc=compact_results['WLoc'],  # WLoc is already expressed from the winner's side
)

# View of the same games from the losing team's side: IDs and scores swap roles.
loser_rows = compact_results.assign(
    TeamID=compact_results['LTeamID'],
    OppID=compact_results['WTeamID'],
    TeamScore=compact_results['LScore'],
    OppScore=compact_results['WScore'],
    Win=0,  # this row's team lost
    # Flip the location for the loser: H <-> A; every other value becomes 'N'.
    TeamLoc=compact_results['WLoc'].map(lambda loc: {'H': 'A', 'A': 'H'}.get(loc, 'N')),
)

# Stack winner rows on top of loser rows with a fresh 0..n-1 index, then discard
# the W*/L* columns that the per-team columns replace.
games_expanded = pd.concat([winner_rows, loser_rows], ignore_index=True).drop(
    columns=['WTeamID', 'LTeamID', 'WScore', 'LScore', 'WLoc']
)

# Sanity check: show the 2024 slice
print(games_expanded.loc[games_expanded['Season'] == 2024])

        Season  DayNum  NumOT  TeamID  OppID  TeamScore  OppScore  Win TeamLoc
181682    2024       0      0    1101   1329         64        59    1       A
181683    2024       0      0    1103   1355         81        75    1       A
181684    2024       0      0    1104   1287        105        73    1       H
181685    2024       0      0    1112   1288        122        59    1       H
181686    2024       0      0    1114   1402         71        66    1       H
...        ...     ...    ...     ...    ...        ...       ...  ...     ...
379080    2024     132      0    1196   1120         67        86    0       N
379081    2024     132      0    1433   1182         51        57    0       N
379082    2024     132      0    1458   1228         87        93    0       N
379083    2024     132      0    1396   1412         69        85    0       N
379084    2024     132      0    1135   1463         61        62    0       N

[11214 rows x 9 columns]


In [63]:
# Function to calculate the offensive and defensive rating adjustments
def adjust_ratings(games_expanded, offensive_ratings, defensive_ratings, iterations=1, k=.01, tol=None):
    """Nudge per-team offensive/defensive ratings toward the observed scores.

    For every row the expected score of a side is modelled as
    ``offense[side] * defense[other side] / 100``. The signed error
    ``actual - expected`` for each side, scaled by ``k``, is then added to that
    side's offensive rating AND to the other side's defensive rating. Under this
    model a larger defensive rating means more points conceded.

    Each row updates all four ratings involved (both teams' offense and
    defense). If ``games_expanded`` holds every game twice (once from each
    team's perspective), each game is therefore applied twice per pass.

    Parameters
    ----------
    games_expanded : pandas.DataFrame
        Must provide the columns ``TeamID``, ``OppID``, ``TeamScore`` and
        ``OppScore``. Rows are processed in frame order.
    offensive_ratings, defensive_ratings : dict
        Mapping of team id -> current rating. Both are updated IN PLACE and
        must already contain every id found in ``TeamID`` / ``OppID``
        (a missing id raises ``KeyError``).
    iterations : int, default 1
        Maximum number of full passes over the rows.
    k : float, default 0.01
        Step size applied to each score error.
    tol : float or None, default None
        Optional early-stopping threshold. When given, iteration stops after
        the first pass in which no rating ended more than ``tol`` (in absolute
        value, strictly less than) away from where it started the pass.
        ``None`` always runs exactly ``iterations`` passes.

    Returns
    -------
    tuple
        ``(offensive_ratings, defensive_ratings)`` -- the same two objects
        that were passed in.
    """
    # The rows never change between passes, so pull out the four needed columns
    # once instead of paying for DataFrame.iterrows() on every pass.
    games = list(zip(
        games_expanded['TeamID'].tolist(),
        games_expanded['OppID'].tolist(),
        games_expanded['TeamScore'].tolist(),
        games_expanded['OppScore'].tolist(),
    ))

    track_convergence = tol is not None

    for _ in range(iterations):
        if track_convergence:
            # Snapshot so the net movement over this pass can be measured.
            offense_before = dict(offensive_ratings)
            defense_before = dict(defensive_ratings)

        for team, opponent, actual_team_score, actual_opponent_score in games:
            # Expected points for each side from the current ratings
            expected_team_score = offensive_ratings[team] * (defensive_ratings[opponent] / 100)
            expected_opponent_score = offensive_ratings[opponent] * (defensive_ratings[team] / 100)

            # Diagnostic: report when a rating has gone NaN
            if pd.isna(expected_team_score):
                print('Team score for team #', team)

            # Scaled prediction errors
            team_rating_adjustment = k * (actual_team_score - expected_team_score)
            opponent_rating_adjustment = k * (actual_opponent_score - expected_opponent_score)

            # Points scored move the scorer's offense and the other side's defense.
            offensive_ratings[team] += team_rating_adjustment
            defensive_ratings[team] += opponent_rating_adjustment
            offensive_ratings[opponent] += opponent_rating_adjustment
            defensive_ratings[opponent] += team_rating_adjustment

        if track_convergence:
            largest_shift = max(
                (
                    abs(current[team_id] - previous[team_id])
                    for current, previous in (
                        (offensive_ratings, offense_before),
                        (defensive_ratings, defense_before),
                    )
                    for team_id in previous
                ),
                default=0.0,
            )
            if largest_shift < tol:
                break

    return offensive_ratings, defensive_ratings

#TODO Ideally each game would be held out in turn: fit ratings on every other game, then predict the held-out game's score from those ratings.
#seasons = games_expanded['Season'].unique()
seasons = [2024]
all_seasons_team_ratings = []

for y in seasons:
    print('Running Season', y)

    # Work on an independent copy of this season's rows
    year_games_expanded = games_expanded.loc[games_expanded['Season'] == y].copy()
    year_teams = year_games_expanded['TeamID'].unique()

    # Every team that appears this season starts at 100 on both scales
    offensive_ratings = dict.fromkeys(year_teams, 100)
    defensive_ratings = dict.fromkeys(year_teams, 100)

    offensive_ratings, defensive_ratings = adjust_ratings(
        year_games_expanded,
        offensive_ratings,
        defensive_ratings,
        iterations=10,
    )

    # One row per team; both dicts were built from the same team list, so their
    # key/value orders are read side by side here.
    year_team_ratings = pd.DataFrame({
        'TeamID': list(offensive_ratings.keys()),
        'OffensiveRating': list(offensive_ratings.values()),
        'DefensiveRating': list(defensive_ratings.values()),
    }).assign(Season=y)

    all_seasons_team_ratings.append(year_team_ratings)

# Stack the per-season frames under a fresh 0..n-1 index
big_team_ratings_df = pd.concat(all_seasons_team_ratings, ignore_index=True)


Running Season 2024


In [64]:
# Display the concatenated per-season team ratings frame.
big_team_ratings_df

Unnamed: 0,TeamID,OffensiveRating,DefensiveRating,Season
0,1101,82.628890,86.655506,2024
1,1103,83.475507,78.760570,2024
2,1104,110.781644,87.735870,2024
3,1112,106.944707,80.159893,2024
4,1114,87.801059,88.507708,2024
...,...,...,...,...
357,1223,79.690461,100.587838,2024
358,1322,77.731077,91.967189,2024
359,1440,78.249443,100.860110,2024
360,1178,75.776116,93.279532,2024


In [55]:
# Show the final rows of the 2025 slice of the results frame.
compact_results.loc[compact_results['Season'] == 2025].tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
191791,2025,106,1461,69,1102,62,H,0
191792,2025,106,1462,76,1139,63,H,0
191793,2025,106,1466,80,1480,62,H,0
191794,2025,106,1468,94,1122,68,H,0
191795,2025,106,1474,89,1146,72,H,0
