In [1]:
import pandas as pd

In [2]:
read_data = pd.read_csv('2024nbamatches.csv')

In [3]:
def format(row):
    home_t1 = pd.Series({
        'date': pd.to_datetime(row["Date"]),
        'arena': row['Arena'],
        'team': row['Home/Neutral'],
        'opponent': row['Visitor/Neutral'],
        'home': 1,
        'win': int(row['H_PTS'] > row['V_PTS']),
        'points': row['H_PTS'],
        'opponent_points': row['V_PTS'],
        'true_shooting': row['Team 2 TS PCT'],
        'opponent_true_shooting': row['Team 1 TS PCT'],
        'effective_field_goal': row['Team 2 EFG PCT'],
        'opponent_effective_field_goal': row['Team 1 EFG PCT'],
        '3pt_attempted': row['Team 2 3P AR'],
        'opponent_3pt_attempted': row['Team 1 3P AR'],
        'free_throws_attempted': row['Team 2 FT AR'],
        'opponent_free_throws_attempted': row['Team 1 FT AR'],
        'rebounds': row['Team 2 TRB PCT'],
        'opponent_rebounds': row['Team 1 TRB PCT'],
        'assists': row['Team 2 AST PCT'],
        'opponent_assists': row['Team 1 AST PCT'],
        'steals': row['Team 2 STL PCT'],
        'opponent_steals': row['Team 1 STL PCT'],
        'blocks': row['Team 2 BLK PCT'],
        'opponent_blocks': row['Team 1 BLK PCT'],
        'turnovers': row['Team 2 TO PCT'],
        'opponent_turnovers': row['Team 1 TO PCT'],
        'offensive_rating': row['Team 2 OFF RT'],
        'opponent_offensive_rating': row['Team 1 OFF RT'],
        'defensive_rating': row['Team 2 DEF RT'],
        'opponent_defensive_rating': row['Team 1 DEF RT'],
        
        
    })
    visior_t1 = pd.Series({
        'date': pd.to_datetime(row["Date"]),
        'arena': row['Arena'],
        'team': row['Visitor/Neutral'],
        'opponent': row['Home/Neutral'],
        'home': 0,
        'win': int(row['V_PTS'] > row['H_PTS']),
        'points': row['V_PTS'],
        'opponent_points': row['H_PTS'],
        'true_shooting': row['Team 1 TS PCT'],
        'opponent_true_shooting': row['Team 2 TS PCT'],
        'effective_field_goal': row['Team 1 EFG PCT'],
        'opponent_effective_field_goal': row['Team 2 EFG PCT'],
        '3pt_attempted': row['Team 1 3P AR'],
        'opponent_3pt_attempted': row['Team 2 3P AR'],
        'free_throws_attempted': row['Team 1 FT AR'],
        'opponent_free_throws_attempted': row['Team 2 FT AR'],
        'rebounds': row['Team 1 TRB PCT'],
        'opponent_rebounds': row['Team 2 TRB PCT'],
        'assists': row['Team 1 AST PCT'],
        'opponent_assists': row['Team 2 AST PCT'],
        'steals': row['Team 1 STL PCT'],
        'opponent_steals': row['Team 2 STL PCT'],
        'blocks': row['Team 1 BLK PCT'],
        'opponent_blocks': row['Team 2 BLK PCT'],
        'turnovers': row['Team 1 TO PCT'],
        'opponent_turnovers': row['Team 2 TO PCT'],
        'offensive_rating': row['Team 1 OFF RT'],
        'opponent_offensive_rating': row['Team 2 OFF RT'],
        'defensive_rating': row['Team 1 DEF RT'],
        'opponent_defensive_rating': row['Team 2 DEF RT'],
    })
    
    return pd.DataFrame([home_t1, visior_t1])

In [4]:
matches = pd.concat(read_data.apply(format, axis=1).to_list(), ignore_index=True)

In [5]:
matches["team_code"] = matches["team"].astype('category').cat.codes
matches["opponent_code"] = matches["opponent"].astype('category').cat.codes
matches["arena_code"] = matches["arena"].astype('category').cat.codes
matches.shape

(2638, 33)

In [6]:
matches.dtypes

date                              datetime64[ns]
arena                                     object
team                                      object
opponent                                  object
home                                       int64
win                                        int64
points                                     int64
opponent_points                            int64
true_shooting                            float64
opponent_true_shooting                   float64
effective_field_goal                     float64
opponent_effective_field_goal            float64
3pt_attempted                            float64
opponent_3pt_attempted                   float64
free_throws_attempted                    float64
opponent_free_throws_attempted           float64
rebounds                                 float64
opponent_rebounds                        float64
assists                                  float64
opponent_assists                         float64
steals              

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
training = matches[matches["date"] < '2024-01-01']
testing = matches[matches["date"] >= '2024-01-01']
predictors = ["team_code", "opponent_code", "arena_code", "home"]
point_targets = ["points", "opponent_points"]
win_target = ["win"]

In [8]:
rf.fit(training[predictors], training[point_targets])
score_preds = rf.predict(testing[predictors])

In [9]:
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score
import numpy as np
mse = mean_squared_error(testing[point_targets], score_preds)
rmse = np.sqrt(mse)
mse, rmse

(264.75566825775655, 16.271314275674126)

In [10]:
rf.fit(training[predictors], training[win_target])
win_pred = rf.predict(testing[predictors])
acc = accuracy_score(testing[win_target], win_pred)
combined = pd.DataFrame(dict(actual=testing["win"], predictors=win_pred))
precision = precision_score(testing["win"], win_pred)
acc
precision

  return fit_method(estimator, *args, **kwargs)


0.6131736526946108

In [11]:
pd.crosstab(index=combined["actual"], columns=combined["predictors"])

predictors,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,515,323
1,326,512


In [12]:
grouped_matches = matches.groupby("team")

In [13]:
def rolling_average (group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(4, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [14]:
import pandas as pd
import numpy as np

def weighted_statistics(grouped_df, stat_cols, date_col, decay_rate=1.0):
    """
    Calculate weighted statistics for multiple columns based on recency, excluding the current match statistics.

    Parameters:
    - grouped_df: pandas DataFrameGroupBy object, already grouped by teams.
    - stat_cols: list of str, columns for statistics to be weighted (e.g., ['offensive_rating', 'defensive_rating']).
    - date_col: str, column name for the date of the match.
    - decay_rate: float, the rate at which older matches lose importance (higher values decay faster).

    Returns:
    - DataFrame with new columns 'weighted_<stat_col>' for each statistic in stat_cols, excluding the current row.
    """
    
    # Create a list to hold results
    result_dfs = []

    # Loop through each team group
    for team, team_df in grouped_df:
        # Sort the team's DataFrame by date
        team_df = team_df.sort_values(by=date_col).reset_index(drop=True)

        # Initialize result columns for the team DataFrame
        for stat_col in stat_cols:
            team_df[f'weighted_{stat_col}'] = np.nan  # Create a new column to hold the results

        # Calculate recency weights
        recency_weights = np.exp(-(team_df.index.max() - team_df.index) * decay_rate)

        # Calculate weighted statistics for each specified column
        for stat_col in stat_cols:
            weighted_stats = []

            for idx in team_df.index:
                # Exclude the current match from the weighted calculation
                past_games = team_df[team_df.index < idx]

                if not past_games.empty:
                    weighted_avg = np.average(past_games[stat_col], weights=recency_weights[past_games.index])
                else:
                    weighted_avg = np.nan  # No past games to base prediction on

                weighted_stats.append(weighted_avg)

            # Assign the weighted stats to the corresponding column in the team DataFrame
            team_df[f'weighted_{stat_col}'] = weighted_stats

        result_dfs.append(team_df)  # Append the modified team DataFrame to results

    # Concatenate all team DataFrames back into a single DataFrame
    return pd.concat(result_dfs, ignore_index=True)

# Example usage
# Assuming your original DataFrame is grouped like this:
# grouped_matches = original_df.groupby('team')
# stats_list = ['offensive_rating', 'defensive_rating', 'rebound_rate']
# df = weighted_statistics(grouped_matches, stats_list, 'date', decay_rate=0.5)


In [15]:
cols = ["points", "opponent_points", "true_shooting", "opponent_true_shooting", "effective_field_goal", "opponent_effective_field_goal", "3pt_attempted", "opponent_3pt_attempted", "free_throws_attempted", "opponent_free_throws_attempted", "rebounds", "opponent_rebounds", "assists", "opponent_assists", "steals", "opponent_steals", "blocks", "opponent_blocks", "turnovers", "opponent_turnovers", "offensive_rating", "opponent_offensive_rating", "defensive_rating", "opponent_defensive_rating"]
new_cols = [f"weighted_{c}" for c in cols]
new_cols

['weighted_points',
 'weighted_opponent_points',
 'weighted_true_shooting',
 'weighted_opponent_true_shooting',
 'weighted_effective_field_goal',
 'weighted_opponent_effective_field_goal',
 'weighted_3pt_attempted',
 'weighted_opponent_3pt_attempted',
 'weighted_free_throws_attempted',
 'weighted_opponent_free_throws_attempted',
 'weighted_rebounds',
 'weighted_opponent_rebounds',
 'weighted_assists',
 'weighted_opponent_assists',
 'weighted_steals',
 'weighted_opponent_steals',
 'weighted_blocks',
 'weighted_opponent_blocks',
 'weighted_turnovers',
 'weighted_opponent_turnovers',
 'weighted_offensive_rating',
 'weighted_opponent_offensive_rating',
 'weighted_defensive_rating',
 'weighted_opponent_defensive_rating']

In [16]:
grouped_matches = weighted_statistics(grouped_matches, cols, "date", decay_rate = 0.1)


In [17]:
grouped_matches

Unnamed: 0,date,arena,team,opponent,home,win,points,opponent_points,true_shooting,opponent_true_shooting,...,weighted_steals,weighted_opponent_steals,weighted_blocks,weighted_opponent_blocks,weighted_turnovers,weighted_opponent_turnovers,weighted_offensive_rating,weighted_opponent_offensive_rating,weighted_defensive_rating,weighted_opponent_defensive_rating
0,2023-10-25,Spectrum Center,Atlanta Hawks,Charlotte Hornets,0,0,110,116,0.512,0.595,...,,,,,,,,,,
1,2023-10-27,State Farm Arena,Atlanta Hawks,New York Knicks,1,0,120,126,0.599,0.616,...,11.600000,4.800000,2.000000,4.700000,10.000000,16.300000,106.400000,112.200000,112.200000,106.400000
2,2023-10-29,Fiserv Forum,Atlanta Hawks,Milwaukee Bucks,0,1,127,110,0.618,0.580,...,9.080100,6.847419,7.774771,7.954871,11.207452,14.830058,111.387302,117.187302,117.187302,111.387302
3,2023-10-30,State Farm Arena,Atlanta Hawks,Minnesota Timberwolves,1,1,127,113,0.676,0.585,...,11.070100,7.894788,6.682538,7.016811,12.306212,16.544699,115.687800,113.300011,113.300011,115.687800
4,2023-11-01,State Farm Arena,Atlanta Hawks,Washington Wizards,1,1,130,121,0.613,0.565,...,9.664339,7.088068,8.304026,7.560396,11.784846,15.434921,120.194283,113.992771,114.339153,119.819036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,2024-04-05,Capital One Arena,Washington Wizards,Portland Trail Blazers,1,0,102,108,0.504,0.516,...,7.073703,8.415032,7.075002,11.929735,13.239553,11.491802,108.314932,115.791739,115.708182,108.408238
2634,2024-04-07,Scotiabank Arena,Washington Wizards,Toronto Raptors,0,0,122,130,0.597,0.600,...,7.285650,8.299359,7.134503,11.660340,12.835941,11.454502,107.989825,115.345078,115.269476,108.074249
2635,2024-04-09,Target Center,Washington Wizards,Minnesota Timberwolves,0,0,121,130,0.627,0.625,...,7.372694,8.289901,8.064157,11.330923,12.832520,11.449313,108.333506,115.635900,115.615094,108.362294
2636,2024-04-12,Capital One Arena,Washington Wizards,Chicago Bulls,1,0,127,129,0.643,0.585,...,7.146827,8.357498,8.077089,11.908666,12.743749,11.035283,109.539287,117.003283,116.984458,109.565334


In [18]:
# Step 2: Sort the DataFrame by the date column
rolling_matches = grouped_matches.sort_values(by='date')

# Step 3: Reset the index to have a continuous, clean index
grouped_matches.reset_index(drop=True, inplace=True)

In [19]:
grouped_matches

Unnamed: 0,date,arena,team,opponent,home,win,points,opponent_points,true_shooting,opponent_true_shooting,...,weighted_steals,weighted_opponent_steals,weighted_blocks,weighted_opponent_blocks,weighted_turnovers,weighted_opponent_turnovers,weighted_offensive_rating,weighted_opponent_offensive_rating,weighted_defensive_rating,weighted_opponent_defensive_rating
0,2023-10-25,Spectrum Center,Atlanta Hawks,Charlotte Hornets,0,0,110,116,0.512,0.595,...,,,,,,,,,,
1,2023-10-27,State Farm Arena,Atlanta Hawks,New York Knicks,1,0,120,126,0.599,0.616,...,11.600000,4.800000,2.000000,4.700000,10.000000,16.300000,106.400000,112.200000,112.200000,106.400000
2,2023-10-29,Fiserv Forum,Atlanta Hawks,Milwaukee Bucks,0,1,127,110,0.618,0.580,...,9.080100,6.847419,7.774771,7.954871,11.207452,14.830058,111.387302,117.187302,117.187302,111.387302
3,2023-10-30,State Farm Arena,Atlanta Hawks,Minnesota Timberwolves,1,1,127,113,0.676,0.585,...,11.070100,7.894788,6.682538,7.016811,12.306212,16.544699,115.687800,113.300011,113.300011,115.687800
4,2023-11-01,State Farm Arena,Atlanta Hawks,Washington Wizards,1,1,130,121,0.613,0.565,...,9.664339,7.088068,8.304026,7.560396,11.784846,15.434921,120.194283,113.992771,114.339153,119.819036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,2024-04-05,Capital One Arena,Washington Wizards,Portland Trail Blazers,1,0,102,108,0.504,0.516,...,7.073703,8.415032,7.075002,11.929735,13.239553,11.491802,108.314932,115.791739,115.708182,108.408238
2634,2024-04-07,Scotiabank Arena,Washington Wizards,Toronto Raptors,0,0,122,130,0.597,0.600,...,7.285650,8.299359,7.134503,11.660340,12.835941,11.454502,107.989825,115.345078,115.269476,108.074249
2635,2024-04-09,Target Center,Washington Wizards,Minnesota Timberwolves,0,0,121,130,0.627,0.625,...,7.372694,8.289901,8.064157,11.330923,12.832520,11.449313,108.333506,115.635900,115.615094,108.362294
2636,2024-04-12,Capital One Arena,Washington Wizards,Chicago Bulls,1,0,127,129,0.643,0.585,...,7.146827,8.357498,8.077089,11.908666,12.743749,11.035283,109.539287,117.003283,116.984458,109.565334


In [20]:
rf = RandomForestClassifier(n_estimators=300, min_samples_split=5, max_depth=10, random_state=1)
training = rolling_matches[rolling_matches["date"] < '2024-02-01']
testing = rolling_matches[rolling_matches["date"] >= '2024-02-01']
predictors = ["team_code", "opponent_code", "arena_code", "home"] + new_cols
point_targets = ["points", "opponent_points"]
win_target = ["win"]

In [21]:
rf.fit(training[predictors], training[point_targets])

In [22]:
score_preds = rf.predict(testing[predictors])

In [23]:
score_preds

array([[118, 103],
       [126, 119],
       [116, 113],
       ...,
       [114, 107],
       [118, 106],
       [111, 107]], dtype=int64)

In [24]:
mse = mean_squared_error(testing[point_targets], score_preds)
rmse = np.sqrt(mse)

In [25]:
mse

187.6495057660626

In [26]:
rmse

13.698522028527844

In [27]:
rf.fit(training[predictors], training[win_target])

  return fit_method(estimator, *args, **kwargs)


In [28]:
win_pred = rf.predict(testing[predictors])
acc = accuracy_score(testing[win_target], win_pred)
combined = pd.DataFrame(dict(actual=testing["win"], predictors=win_pred))
precision = precision_score(testing["win"], win_pred)
acc, precision

(0.6243822075782537, 0.6044260027662517)

In [29]:
pd.crosstab(index=combined["actual"], columns=combined["predictors"])

predictors,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,321,286
1,170,437
