In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import log_loss, classification_report, accuracy_score

In [44]:
df_all = pd.read_csv('enriched_data.csv')

X_train = df_all[df_all['Season'] < 2024].drop(columns = ['Season', 'ScoreDiff', 'Winner', 'TeamID0_', 'TeamID1_'])
y_train = df_all[df_all['Season'] < 2024]['Winner']

X_test = df_all[df_all['Season'] == 2024].drop(columns = ['Season', 'ScoreDiff', 'Winner', 'TeamID0_', 'TeamID1_'])
y_test = df_all[df_all['Season'] == 2024]['Winner']

In [10]:
param_grid = { 
    'n_estimators': range(100,1001, 100),
    'max_features': range(2,16)
} 

In [13]:
grid_search = GridSearchCV(RandomForestClassifier(), 
                            param_grid=param_grid,
                            scoring = 'accuracy',
                            verbose = 10,
                            n_jobs=8) 
grid_search.fit(X, y)

Fitting 5 folds for each of 140 candidates, totalling 700 fits


[CV 5/5; 1/140] START max_features=2, n_estimators=100..........................
[CV 4/5; 1/140] START max_features=2, n_estimators=100..........................
[CV 3/5; 1/140] START max_features=2, n_estimators=100..........................
[CV 1/5; 1/140] START max_features=2, n_estimators=100..........................
[CV 2/5; 2/140] START max_features=2, n_estimators=200..........................
[CV 2/5; 1/140] START max_features=2, n_estimators=100..........................
[CV 1/5; 2/140] START max_features=2, n_estimators=200..........................
[CV 3/5; 2/140] START max_features=2, n_estimators=200..........................
[CV 3/5; 1/140] END max_features=2, n_estimators=100;, score=0.627 total time=   0.4s
[CV 5/5; 1/140] END max_features=2, n_estimators=100;, score=0.684 total time=   0.4s
[CV 4/5; 1/140] END max_features=2, n_estimators=100;, score=0.681 total time=   0.4s
[CV 4/5; 2/140] START max_features=2, n_estimators=200..........................
[CV 5/5; 2/14

In [14]:
rf = grid_search.best_estimator_

In [38]:
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73        35
           1       0.71      0.69      0.70        32

    accuracy                           0.72        67
   macro avg       0.72      0.72      0.72        67
weighted avg       0.72      0.72      0.72        67



In [55]:
odds_df = pd.read_csv('odds.csv')
odds_df = pd.concat([odds_df.rename(columns = {'team1': 'TeamID0_', 
                                               'team2': 'TeamID1_',
                                               'team1ret': 'ret0',
                                               'team2ret': 'ret1',
                                               }), 
                                    odds_df.rename(columns = {'team1': 'TeamID1_', 
                                                              'team2': 'TeamID0_',
                                                              'team1ret': 'ret1',
                                                              'team2ret': 'ret0',})], 
                                    axis = 0)
df_2024 = df_all[df_all['Season'] == 2024]
df_2024.loc[:, ['prob0', 'prob1']] = y_prob
df_2024 = df_2024.merge(odds_df, how = 'inner',)


In [58]:
df_2024[['Winner', 'ret0', 'ret1', 'prob0', 'prob1']].to_csv('results.csv', index = False)

In [45]:
df_all

Unnamed: 0,Season,TeamID0_,TeamID1_,ScoreDiff,Winner,ScoreRegSznDiff,FGMDiff,FGADiff,FGM3Diff,FGA3Diff,FTMDiff,FTADiff,ORDiff,DRDiff,AstDiff,TODiff,StlDiff,BlkDiff,PFDiff,SeedDiff
0,2003,1421,1411,8,0,-1.593103,-0.354023,1.526437,0.549425,-0.500000,-1.434483,-7.135632,-0.890805,-1.627586,-1.165517,0.973563,0.635632,0.766667,0.803448,0.0
1,2003,1436,1112,-29,1,-17.421182,-5.493842,-9.852217,-1.759852,-4.588670,-4.673645,-5.448276,-2.213054,-1.918719,-3.435961,-0.716749,-1.602217,-1.248768,-1.853448,15.0
2,2003,1113,1272,13,0,1.448276,0.931034,-3.103448,-3.000000,-7.482759,2.586207,3.310345,-0.379310,-2.655172,-1.068966,0.206897,-2.172414,-0.827586,0.655172,3.0
3,2003,1141,1166,6,0,0.102403,-2.076280,-4.764890,-1.142111,-2.553814,5.397074,5.142111,-0.292581,0.094044,-1.197492,4.877743,-1.290491,-0.454545,3.692790,5.0
4,2003,1143,1301,2,0,2.082759,3.011494,5.390805,-1.552874,-5.465517,-2.387356,-0.949425,1.508046,2.345977,1.333333,-0.027586,-1.214943,-0.273563,-1.563218,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,2024,1166,1397,-7,1,1.062500,1.781250,-1.687500,1.875000,3.843750,-4.375000,-6.468750,-3.625000,2.937500,0.375000,-0.156250,-3.968750,-0.531250,-6.125000,1.0
1378,2024,1345,1397,6,0,3.925189,1.077652,-3.356061,-0.385417,-5.075758,2.155303,3.812500,0.280303,1.322917,1.643939,1.000947,-2.270833,-0.868371,-3.073864,-1.0
1379,2024,1104,1163,-14,1,9.279412,1.818015,6.102941,2.268382,6.310662,3.375000,3.283088,0.943015,-0.082721,-2.623162,2.694853,1.014706,-1.319853,3.639706,3.0
1380,2024,1301,1345,-13,1,-7.032828,-1.126263,2.578283,-1.277778,-0.035354,-3.502525,-5.194444,-1.974747,-3.083333,-5.699495,-1.747475,1.750000,-0.287879,1.997475,10.0
