In [1]:
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectPercentile, f_classif
import statsmodels.api as sm
from scipy import stats

In [2]:
# Load the prediction dataset and, in the same step, drop every row whose
# 'result' equals 0.5 so only the remaining outcome values are modelled.
# NOTE(review): 0.5 presumably encodes a draw/tie — confirm against the data source.
df = read_csv('../data/use_for_predictions.csv').loc[lambda frame: frame['result'] != 0.5]

In [3]:
# Columns that are not used as predictors: the target itself plus the
# other columns listed here.
non_feature_cols = ['result', 'day', 'day_game_num', 'weekday', 'elo', 'start_time']

# Feature matrix (all remaining columns) and target vector as NumPy arrays.
X = df.drop(columns=non_feature_cols).values
y = df['result'].values

In [4]:
# Hold out 5% of the rows for testing. shuffle=False keeps the original row
# order, so the test set is the trailing slice of the data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.05,
    random_state=5,
    shuffle=False,
)

# Fit the scaler on the training split only, then apply the same scaling
# to the held-out split so no test-set statistics leak into training.
mmx = MaxAbsScaler()
X_train = mmx.fit_transform(X_train)
X_test = mmx.transform(X_test)

In [5]:
# Train a random forest on the scaled training split and predict hard class
# labels for the held-out split.
# random_state is fixed so that bootstrap sampling and feature sub-sampling
# are repeatable: without it every Restart & Run All yields different
# metrics and feature importances than the ones saved in the notebook.
model = RandomForestClassifier(n_estimators=500, max_depth=10,
                               n_jobs=-1, random_state=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [6]:
# Per-class precision/recall/F1 followed by the confusion matrix for the
# held-out split, each on its own line.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

         0.0       0.64      0.89      0.74        63
         1.0       0.82      0.49      0.61        63

   micro avg       0.69      0.69      0.69       126
   macro avg       0.73      0.69      0.68       126
weighted avg       0.73      0.69      0.68       126

[[56  7]
 [32 31]]


In [7]:
# Class-membership probabilities for the held-out split. Stored under a
# separate name so the hard labels in `y_pred` are not clobbered: overwriting
# them made the classification-report cell fail when re-run after this one.
y_proba = model.predict_proba(X_test)

# ROC AUC (as a percentage, 2 decimals) using the probability of the second
# class in model.classes_ as the score.
print(round(roc_auc_score(y_test, y_proba[:, 1])*100, 2))

73.38


In [8]:
# Display the fitted forest's feature importances: one value per column of
# the training matrix, in the same column order as X.
model.feature_importances_

array([0.59864423, 0.33994334, 0.03509844, 0.02631399])

In [9]:
# Logistic regression via statsmodels to inspect per-feature coefficients and
# p-values. It is fitted on the full, unscaled X and y rather than on the
# train split used for the random forest.
# NOTE(review): X is passed as-is, with no constant column added; statsmodels
# does not appear to add an intercept by itself — confirm this is intended.
est = sm.Logit(endog=y, exog=X)
est2 = est.fit(maxiter=3500)
print(est2.summary())

Optimization terminated successfully.
         Current function value: 0.582180
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 2514
Model:                          Logit   Df Residuals:                     2510
Method:                           MLE   Df Model:                            3
Date:                Wed, 10 Apr 2019   Pseudo R-squ.:                  0.1601
Time:                        17:57:18   Log-Likelihood:                -1463.6
converged:                       True   LL-Null:                       -1742.5
                                        LLR p-value:                1.389e-120
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0187      0.001     17.041      0.000       0.017       0.021
x2            -0.0006      0.

In [10]:
# Univariate ANOVA F-test of every feature against the target, fitted on the
# full X and y. `percentile` is passed by keyword: it is a keyword-only
# parameter in current scikit-learn releases, where the positional form
# raises a TypeError; the keyword form works on older releases as well.
fs = SelectPercentile(f_classif, percentile=10).fit(X, y)

# p-values and F-scores, one entry per column of X.
print(fs.pvalues_)
print(fs.scores_)

[1.26919956e-68 2.43730194e-41 3.86856974e-06 6.62896057e-03]
[326.04166139 188.11362772  21.42397691   7.38314543]
