In [1]:
import numpy as np
from sklearn import preprocessing as pre
import pandas as pd
import matplotlib.pyplot as plt

# DATA PREPARATION

In [2]:
from api_helpers.team_stats_helpers import load_dataframe
import pandas as pd

nba_dataframe = load_dataframe(['FGM', 'FGA', 'FG_PCT', 'FG3A', 'FTM', 'OREB', 'DREB', 'REB', 'AST','PTS'])
nba_dataframe = nba_dataframe.drop(nba_dataframe[nba_dataframe["FGA"] == 0].index)

nba_dataframe["YEAR"] = "2" + nba_dataframe["YEAR"].str.slice(0,4)
pd.DataFrame.rename(nba_dataframe, columns={"YEAR": "SEASON_ID"}, inplace=True)

nba_dataframe["NBA_FINALS_APPEARANCE"].fillna(0.0, inplace=True)
nba_dataframe["NBA_FINALS_APPEARANCE"].replace("FINALS APPEARANCE", 0, inplace=True)
nba_dataframe["NBA_FINALS_APPEARANCE"].replace("LEAGUE CHAMPION", 1, inplace=True)

dataframe_2023 = nba_dataframe[nba_dataframe['SEASON_ID'] == "22023"]
nba_dataframe = nba_dataframe[nba_dataframe['SEASON_ID'] != "22023"]
nba_dataframe = nba_dataframe.reset_index(drop=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  nba_dataframe["NBA_FINALS_APPEARANCE"].fillna(0.0, inplace=True)
  nba_dataframe["NBA_FINALS_APPEARANCE"].replace("LEAGUE CHAMPION", 1, inplace=True)


In [3]:
# # best features: ['FGA', 'FG_PCT', 'FG3M', 'FTM', 'FT_PCT', 'DREB', 'REB', 'PTS']
non_int_columns = ["TEAM_ID","SEASON_ID","NBA_FINALS_APPEARANCE"]
stats_frame = nba_dataframe.drop(non_int_columns, axis=1)
nameless_dataframe_2023 = dataframe_2023.drop(non_int_columns,axis=1)

In [4]:
from scipy.spatial import distance

def closest_teams(vectors_frame, k = 1, target = dataframe_2023.sample(1)):

    vectors = np.array(vectors_frame.drop(non_int_columns,axis=1))

    distances = distance.cdist(target, vectors, "cosine")[0]
    # Sort distances (indices of closest points at the beginning)
    closest_indices = np.argsort(distances)

    # take top k closest vectors
    return vectors_frame.iloc[list(closest_indices[:k])]

In [6]:
team_1_id = 1610612740
team_2_id = 1610612763

team_1_row = dataframe_2023[dataframe_2023["TEAM_ID"] == team_1_id]
team_2_row = dataframe_2023[dataframe_2023["TEAM_ID"] == team_2_id]

similar_rows_1 = closest_teams(nba_dataframe, k=10, target=team_1_row.drop(non_int_columns,axis=1))
similar_rows_1

Unnamed: 0,TEAM_ID,SEASON_ID,NBA_FINALS_APPEARANCE,FGM,FGA,FG_PCT,FG3A,FTM,OREB,DREB,REB,AST,PTS
1081,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,3578,2083,9279
122,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,3369,2045,9205
540,1610612750,22022,0.0,3515,7167,0.49,2731,1467,749,2686,3435,2145,9494
264,1610612743,22020,0.0,3114,6422,0.485,2462,1129,758,2442,3200,1933,8284
580,1610612751,22021,0.0,3442,7251,0.475,2602,1434,844,2796,3640,2071,9258
776,1610612756,22019,0.0,3006,6429,0.468,2320,1451,712,2465,3177,1987,8294
143,1610612740,22022,0.0,3447,7180,0.48,2468,1585,866,2717,3583,2122,9378
266,1610612743,22022,1.0,3574,7088,0.504,2559,1378,828,2699,3527,2368,9495
389,1610612746,22022,0.0,3370,7059,0.477,2735,1533,803,2742,3545,1959,9314
967,1610612761,22018,1.0,3460,7305,0.474,2771,1449,786,2920,3706,2085,9384


In [7]:
def join_teams():
    joined_list = []
    for _, row in similar_rows_1.iterrows():
        k=5
        year = row['SEASON_ID']

        # get 
        year_frame = nba_dataframe[nba_dataframe['SEASON_ID'] == year]

        similar_rows_2 = closest_teams(year_frame, k=k, target=team_2_row.drop(non_int_columns,axis=1))
        similar_rows_2 = similar_rows_2.add_suffix("_B")

        combined_row = pd.concat([row.to_frame().T] * len(similar_rows_2),axis=0)

        # Now merge the repeated_df1 with df2
        result = pd.concat([combined_row.reset_index(drop=True), similar_rows_2.reset_index(drop=True)], axis=1)

        joined_list.append(result)


    final_joined = pd.concat(joined_list, ignore_index=True)

    return final_joined

joined = join_teams()
# joined.drop(["SEASON_ID_B"],inplace=True,axis=1)
# pd.DataFrame.rename(joined, columns={"SEASON_ID_A":"SEASON_ID"},inplace=True)
joined

Unnamed: 0,TEAM_ID,SEASON_ID,NBA_FINALS_APPEARANCE,FGM,FGA,FG_PCT,FG3A,FTM,OREB,DREB,...,FGM_B,FGA_B,FG_PCT_B,FG3A_B,FTM_B,OREB_B,DREB_B,REB_B,AST_B,PTS_B
0,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3485,7365,0.473,3099,1536,968,2794,3762,2129,9600
1,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3504,7411,0.473,3306,1364,914,3074,3988,2115,9589
2,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3385,7413,0.457,2669,1447,901,2751,3652,2062,9098
3,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3215,6991,0.46,2852,1567,796,2533,3329,1955,8977
4,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3585,7551,0.475,2807,1432,988,2835,3823,2135,9587
5,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,...,3485,7365,0.473,3099,1536,968,2794,3762,2129,9600
6,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,...,3504,7411,0.473,3306,1364,914,3074,3988,2115,9589
7,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,...,3385,7413,0.457,2669,1447,901,2751,3652,2062,9098
8,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,...,3215,6991,0.46,2852,1567,796,2533,3329,1955,8977
9,1610612739,22022,0.0,3408,6984,0.488,2589,1439,795,2574,...,3585,7551,0.475,2807,1432,988,2835,3823,2135,9587


In [12]:
joined.to_csv("new_data.csv")

In [9]:
all_games_df = pd.read_csv("data/all_games.csv")

In [21]:
from api_helpers.game_stats_helpers import home_matchups
final = pd.DataFrame()
final_stats_list = []

for i in range(len(joined)):
    row = joined.iloc[i]
    
    matchups = home_matchups(all_games_df=all_games_df,team_a=int(row["TEAM_ID"]), team_b=int(row["TEAM_ID_B"]), year=int(row["SEASON_ID"]))
    
    # output data
    final = pd.concat([final, matchups],axis=0)

    if len(matchups) == 0:
        continue
    else:
        repeated_row = pd.concat([row.to_frame().T] * len(matchups),axis=0)
        final_stats_list.append(repeated_row)

# dataframe of team stats (input data)
final_stats_df = pd.concat(final_stats_list,axis=0)

Unnamed: 0,TEAM_ID,SEASON_ID,NBA_FINALS_APPEARANCE,FGM,FGA,FG_PCT,FG3A,FTM,OREB,DREB,...,FGM_B,FGA_B,FG_PCT_B,FG3A_B,FTM_B,OREB_B,DREB_B,REB_B,AST_B,PTS_B
0,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3485,7365,0.473,3099,1536,968,2794,3762,2129,9600
1,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3504,7411,0.473,3306,1364,914,3074,3988,2115,9589
1,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3504,7411,0.473,3306,1364,914,3074,3988,2115,9589
2,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3385,7413,0.457,2669,1447,901,2751,3652,2062,9098
2,1610612764,22022,0.0,3456,7127,0.485,2601,1442,774,2804,...,3385,7413,0.457,2669,1447,901,2751,3652,2062,9098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,1610612761,22018,1.0,3460,7305,0.474,2771,1449,786,2920,...,3182,7122,0.447,3002,1541,832,2884,3716,1918,8927
48,1610612761,22018,1.0,3460,7305,0.474,2771,1449,786,2920,...,3301,7358,0.449,2965,1555,900,2919,3819,1954,9204
48,1610612761,22018,1.0,3460,7305,0.474,2771,1449,786,2920,...,3301,7358,0.449,2965,1555,900,2919,3819,1954,9204
49,1610612761,22018,1.0,3460,7305,0.474,2771,1449,786,2920,...,3451,7423,0.465,2829,1282,804,2849,3653,2155,9216


In [11]:
final.to_csv("test_data.csv")
type(all_games_df["TEAM_ID_B"].iloc[0])

numpy.int64

In [29]:
final.columns

Index(['Unnamed: 0', 'SEASON_ID', 'TEAM_ID_A', 'TEAM_ABBREVIATION_A',
       'TEAM_NAME_A', 'GAME_ID', 'GAME_DATE', 'MATCHUP_A', 'WL_A', 'MIN_A',
       'PTS_A', 'FGM_A', 'FGA_A', 'FG_PCT_A', 'FG3M_A', 'FG3A_A', 'FG3_PCT_A',
       'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A', 'DREB_A', 'REB_A', 'AST_A',
       'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PLUS_MINUS_A', 'TEAM_ID_B',
       'TEAM_ABBREVIATION_B', 'TEAM_NAME_B', 'MATCHUP_B', 'WL_B', 'MIN_B',
       'PTS_B', 'FGM_B', 'FGA_B', 'FG_PCT_B', 'FG3M_B', 'FG3A_B', 'FG3_PCT_B',
       'FTM_B', 'FTA_B', 'FT_PCT_B', 'OREB_B', 'DREB_B', 'REB_B', 'AST_B',
       'STL_B', 'BLK_B', 'TOV_B', 'PF_B', 'PLUS_MINUS_B'],
      dtype='object')

# Random Forest Modelling

In [44]:
# Getting input features
input_a_features = ['FG_PCT','FG3A', 'FTM', 'OREB', 'DREB', 'REB', 'AST']
input_b_features = [str(word) + "_B" for word in input_a_features]

input_features = input_a_features + input_b_features
X = final_stats_df[input_features]
X

Unnamed: 0,FG_PCT,FG3A,FTM,OREB,DREB,REB,AST,FG_PCT_B,FG3A_B,FTM_B,OREB_B,DREB_B,REB_B,AST_B
0,0.485,2601,1442,774,2804,3578,2083,0.473,3099,1536,968,2794,3762,2129
1,0.485,2601,1442,774,2804,3578,2083,0.473,3306,1364,914,3074,3988,2115
1,0.485,2601,1442,774,2804,3578,2083,0.473,3306,1364,914,3074,3988,2115
2,0.485,2601,1442,774,2804,3578,2083,0.457,2669,1447,901,2751,3652,2062
2,0.485,2601,1442,774,2804,3578,2083,0.457,2669,1447,901,2751,3652,2062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,0.474,2771,1449,786,2920,3706,2085,0.447,3002,1541,832,2884,3716,1918
48,0.474,2771,1449,786,2920,3706,2085,0.449,2965,1555,900,2919,3819,1954
48,0.474,2771,1449,786,2920,3706,2085,0.449,2965,1555,900,2919,3819,1954
49,0.474,2771,1449,786,2920,3706,2085,0.465,2829,1282,804,2849,3653,2155


In [45]:
output_a_features = ["FG_PCT_A","FG3_PCT_A","FTM_A","OREB_A","DREB_A","REB_A","AST_A"]
output_b_features = [word.replace("_A","_B") for word in output_a_features]
output_features = output_a_features + output_b_features
y = final[output_features]
y

Unnamed: 0,FG_PCT_A,FG3_PCT_A,FTM_A,OREB_A,DREB_A,REB_A,AST_A,FG_PCT_B,FG3_PCT_B,FTM_B,OREB_B,DREB_B,REB_B,AST_B
47753,0.500,0.444,17,6.0,40.0,46.0,27,0.468,0.350,10,7.0,35.0,42.0,33
47721,0.505,0.481,15,9.0,33.0,42.0,30,0.543,0.548,17,12.0,44.0,56.0,33
47730,0.426,0.258,11,20.0,36.0,56.0,21,0.470,0.449,17,7.0,38.0,45.0,30
47735,0.506,0.405,15,11.0,41.0,52.0,29,0.419,0.290,17,12.0,28.0,40.0,23
47749,0.430,0.382,13,14.0,35.0,49.0,19,0.390,0.250,16,15.0,36.0,51.0,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43442,0.484,0.407,17,12.0,38.0,50.0,23,0.413,0.400,19,10.0,34.0,44.0,22
43419,0.543,0.290,18,5.0,35.0,40.0,32,0.523,0.488,13,5.0,26.0,31.0,33
43425,0.457,0.314,15,13.0,42.0,55.0,32,0.420,0.367,20,9.0,45.0,54.0,20
43415,0.489,0.472,9,8.0,37.0,45.0,33,0.384,0.200,23,10.0,37.0,47.0,24


### Creating a Test Train Split

In [73]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=3)

14

### Implementing Random Forest Regressor

In [74]:
from sklearn.ensemble import RandomForestRegressor

rfe = RandomForestRegressor(random_state=10, n_estimators=1000)

rfe.fit(X_train,y_train)
y_pred = rfe.predict(X_test)

## Evaluate Multioutput Regression

In [83]:
# evaluate multioutput regression model with k-fold cross-validation
from numpy import absolute
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# define model
model = RandomForestRegressor()
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 3.570 (0.419)


30


ValueError: Length mismatch: Expected axis has 30 elements, new values have 14 elements