In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
import numpy as np

In [15]:
# Load the per-game batting log from a CSV in the current working directory.
# NOTE(review): the frame is named `judge` but the file is 'springer.csv' — confirm which player this log belongs to.
judge = pd.read_csv('springer.csv')

In [16]:
# Inspect the raw column names of the game log.
judge.columns

Index(['Rk', 'Gcar', 'Gtm', 'Date', 'Tm', 'Unnamed: 5', 'Opp', 'Rslt', 'Inngs',
       'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP',
       'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS', 'BOP',
       'aLI', 'WPA', 'acLI', 'cWPA', 'RE24', 'DFS(DK)', 'DFS(FD)', 'Pos'],
      dtype='object')

In [17]:
# Keep only the per-game batting columns listed here, then discard every game
# that has a missing value in any of them.
judge = (
    judge
    .loc[:, [
        'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
        'HBP', 'SF', 'SH', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'BOP',
    ]]
    .dropna()
)

In [18]:
# Sequential game counter 0, 1, 2, ... assigned by row position (not by index label).
judge['Game'] = list(range(len(judge)))

In [19]:
# Prediction target: the hit count of the *next* row (game).
# shift(-1) moves every value up one row, so the last row has NaN here.
judge['y'] = judge['H'].shift(-1)

In [20]:
# Per-row recency weight: 0.01 for the first row, rising towards 1.01 for later rows.
# NOTE(review): no other cell visible here reads this column — confirm it is still needed.
judge['weights'] = 1.01 - np.exp(-0.05 * np.arange(len(judge)))

In [21]:
def weighted_frequency(data, alpha=0.99, unique_values=(0, 1, 2, 3, 4)):
    """Exponentially weighted relative frequency of each outcome in ``data``.

    The last (most recent) observation gets weight 1, the one before it
    ``alpha``, then ``alpha**2`` and so on, so older observations count less.

    Parameters
    ----------
    data : array-like
        Observed outcomes in chronological order (oldest first). Lists,
        arrays and Series are all matched by position.
    alpha : float, default 0.99
        Per-step decay factor applied to older observations.
    unique_values : sequence, default (0, 1, 2, 3, 4)
        Outcomes to report. Observations outside this set still count towards
        the normalising total, so the result sums to less than 1 when they occur.

    Returns
    -------
    pandas.Series
        Indexed by ``unique_values``; the weighted share of each outcome.
        All NaN when ``data`` is empty (no observations, nothing to normalise by).
    """
    # Work on a plain array: a Python list compared with `==` yields a single
    # bool instead of an element-wise mask, which silently produced all zeros.
    observations = np.asarray(data)
    weights = np.power(alpha, np.arange(len(observations) - 1, -1, -1))
    total_weight = np.sum(weights)

    if total_weight == 0:
        # Explicit "undefined" result instead of a 0/0 RuntimeWarning.
        return pd.Series(np.nan, index=list(unique_values), dtype=float)

    weighted_freq = {
        value: np.sum(weights[observations == value]) / total_weight
        for value in unique_values
    }
    return pd.Series(weighted_freq)

In [22]:
# Scratch check: grid from 0.90 to 0.99 in steps of 0.01
# (presumably candidate `alpha` values for weighted_frequency — confirm).
np.arange(0.9, 1, 0.01)

array([0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99])

In [24]:
def classwise_ece(y_true, y_prob, n_bins=10):
    """Per-class expected calibration error (ECE) using equal-width bins.

    For each class ``k`` the predicted probabilities ``y_prob[:, k]`` are
    grouped into ``n_bins`` equal-width bins on [0, 1]. In every non-empty
    bin the mean predicted probability is compared with the observed
    frequency of class ``k``; the absolute gaps are summed, each weighted by
    the share of samples that fall in the bin.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Observed class labels; class ``k`` corresponds to column ``k``.
    y_prob : array-like, shape (n_samples, n_classes)
        Predicted probability of each class for each sample.
    n_bins : int, default 10
        Number of equal-width probability bins.

    Returns
    -------
    list of float
        One ECE value per class, in column order.
    """
    # Plain arrays so that lists work and Series are matched by position.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    num_classes = y_prob.shape[1]

    # The bin edges are identical for every class, so build them once.
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    classwise_ece_scores = []
    for k in range(num_classes):
        y_true_k = (y_true == k).astype(int)
        y_prob_k = y_prob[:, k]

        ece_k = 0.0
        for bin_idx, (bin_lower, bin_upper) in enumerate(zip(bin_lowers, bin_uppers)):
            # Bins are (lower, upper], except the first one, which is closed
            # on the left so a predicted probability of exactly 0 is scored
            # instead of falling outside every bin.
            if bin_idx == 0:
                above_lower = y_prob_k >= bin_lower
            else:
                above_lower = y_prob_k > bin_lower
            in_bin = np.logical_and(above_lower, y_prob_k <= bin_upper)
            prop_in_bin = np.mean(in_bin)
            if prop_in_bin > 0:
                accuracy_in_bin = np.mean(y_true_k[in_bin])
                avg_prob_in_bin = np.mean(y_prob_k[in_bin])
                ece_k += np.abs(avg_prob_in_bin - accuracy_in_bin) * prop_in_bin

        classwise_ece_scores.append(ece_k)

    return classwise_ece_scores

In [43]:
# Per-game RBI values, used as the observed classes when scoring the RBI frequency forecast.
# Note: `y` is reassigned further down (to test['y']), so cell execution order matters.
y = judge['RBI'].dropna()

In [48]:
# NOTE(review): `prob_arrs` is only assigned in the following cell (In [49]); on a fresh
# kernel run top-to-bottom this line raises NameError. Move it after that cell.
len(prob_arrs)

110

In [49]:
# Walk-forward forecast: for every i >= 1, estimate the RBI distribution from
# games 0..i-1 only; that estimate is the forecast for game i.
arrs = [
    weighted_frequency(judge['RBI'].iloc[:i], alpha=0.97).values
    for i in range(1, len(judge))
]
# Stack once, after the loop (the array used to be rebuilt on every iteration).
prob_arrs = np.array(arrs)

# Row i-1 of prob_arrs forecasts game i, so score it against games 1..n-1.
# Scoring against y[:-1] would compare each forecast with a game that is
# already part of its own history.
classwise_ece_value = classwise_ece(y.iloc[1:], prob_arrs, n_bins=10)

In [51]:
# Weighted RBI frequencies over the whole game log, with decay alpha=0.97.
weighted_frequency(judge['RBI'], alpha=0.97)

0    0.752903
1    0.109276
2    0.071980
3    0.056080
4    0.000000
dtype: float64

In [None]:
# Singles: hits that were not doubles, triples or home runs.
judge['1B'] = judge['H'] - judge['2B'] - judge['3B'] - judge['HR']

In [None]:
def _rolling_rate_stats(games, window):
    """Batting rate stats over the trailing ``window`` rows (current row included).

    Counting stats are summed over the window first and the ratios are taken
    afterwards. ``min_periods=1`` lets the first rows use a shorter window.
    Returns a dict with the 'BA', 'OBP', 'SLG' and 'OPS' Series.
    """
    counting_cols = ['H', 'AB', 'BB', 'HBP', 'PA', '1B', '2B', '3B', 'HR']
    recent = games[counting_cols].rolling(window, min_periods=1).sum()

    total_bases = recent['1B'] + 2 * recent['2B'] + 3 * recent['3B'] + 4 * recent['HR']
    # Division by a zero AB/PA total yields NaN/inf rather than raising.
    obp = (recent['H'] + recent['BB'] + recent['HBP']) / recent['PA']
    slg = total_bases / recent['AB']
    return {'BA': recent['H'] / recent['AB'], 'OBP': obp, 'SLG': slg, 'OPS': obp + slg}


# Every numerator and denominator of a stat now uses the same window. The
# 5-game OBP and SLG previously mixed 10-game numerators with 5-game denominators.
rolling_rates = {window: _rolling_rate_stats(judge, window) for window in (10, 5)}
for stat in ('BA', 'OBP', 'SLG', 'OPS'):
    for window in (10, 5):
        judge[f'{stat}{window}'] = rolling_rates[window][stat]

# Select the modelling columns by name. The positional slice iloc[:, 13:21]
# depends on which earlier cells have added columns (e.g. 'weights') and can
# miss '1B', which the feature cell later tries to drop.
test = judge[['BA', 'OBP', 'SLG', 'OPS', 'BOP', 'Game', 'y', '1B']]

In [None]:
# Display the modelling frame built in the previous cell.
test

Unnamed: 0,BA,OBP,SLG,OPS,BOP,Game,y,1B
0,0.250,0.400,0.500,0.900,3.0,0,0.0,0
1,0.111,0.200,0.222,0.422,3.0,1,1.0,0
2,0.167,0.286,0.250,0.536,3.0,2,0.0,1
3,0.125,0.211,0.188,0.398,3.0,3,1.0,0
4,0.143,0.208,0.190,0.399,3.0,4,0.0,1
...,...,...,...,...,...,...,...,...
84,0.321,0.440,0.718,1.158,3.0,84,0.0,2
85,0.318,0.438,0.711,1.149,3.0,85,0.0,0
86,0.314,0.433,0.702,1.135,3.0,86,0.0,0
87,0.310,0.430,0.693,1.123,3.0,87,2.0,0


In [None]:
# Drop rows with any missing value (e.g. a row whose shifted target `y` is NaN).
test = test.dropna()

In [None]:
# Target: the shifted hit count `y`. Features: every other column except the singles count.
y = test['y']
X = test.drop(columns=['y', '1B'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is random rather than chronological (see the shuffled y_test index
# shown further down), so later games can end up in the training set — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a random forest on the training split (fixed seed for repeatability).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Pair each input column with the forest's feature_importances_ value and
# rank them from most to least important.
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importances)

  Feature  Importance
5    Game    0.215725
0      BA    0.209700
3     OPS    0.202190
2     SLG    0.197261
1     OBP    0.175124
4     BOP    0.000000


In [None]:
# Class-probability forecasts from the random forest for the held-out rows.
# NOTE(review): the ECE cell visible below scores `y_pred_hist`, not `y_pred_proba` — confirm
# whether the model's own probabilities were meant to be evaluated as well.
y_pred_proba = model.predict_proba(X_test)

In [103]:
# Row labels of the held-out games.
np.array(y_test.index)

array([76,  0, 26, 22, 12, 67, 10, 18,  4, 68, 85, 65, 53, 80, 84, 64, 33,
       79], dtype=int64)

In [134]:
# Pick, for each held-out row, one row of `historic_preds`.
# NOTE(review): `historic_preds` is not defined in any cell visible here (hidden kernel state),
# and y_test's index *labels* are used as *positions* — this only lines up when the labels
# equal the row positions of `historic_preds`. Confirm both.
y_pred_hist = np.array(historic_preds)[np.array(y_test.index)]

In [135]:
# Calibration of the `y_pred_hist` forecasts on the held-out rows: one ECE value per class.
classwise_ece_values = classwise_ece(y_test, y_pred_hist, n_bins=10)
print(f'Classwise ECE: {classwise_ece_values}')

Classwise ECE: [0.31351548117421346, 0.18237049784367604, 0.09386700980945989, 0.10381352363900553, 0.007375270235420725]


In [None]:
# Pasted result text (not executable code): Classwise ECE: [0.32666666666666677, 0.30833333333333335, 0.34277777777777774, 0.18944444444444444, 0.0005555555555555556]

In [74]:
# Score one hand-entered game. The values follow the training column order
# (BA, OBP, SLG, OPS, BOP, Game per the feature-importance table) and are
# passed as a labelled one-row frame, matching how the model was fitted,
# instead of as a bare positional array.
next_game = pd.DataFrame(
    [[0.312, 0.430, 0.688, 1.118, 3.0, 88]],
    columns=X_train.columns,
)
model.predict_proba(next_game)



array([[0.14, 0.14, 0.72, 0.  , 0.  ]])

In [114]:
# Last row of `historic_preds` (one probability per class).
# NOTE(review): `historic_preds` is not defined in any cell visible here.
historic_preds[-1]

array([0.35227273, 0.30681818, 0.27272727, 0.05681818, 0.01136364])

In [112]:
# Total probability of every class except the first, i.e. 1 - P(class 0)
# (presumably the chance of at least one hit — confirm what the classes represent).
historic_preds[-1][1:].sum()

0.6477272727272727

In [None]:
# NOTE(review): unlabelled list of five values, presumably another set of classwise ECE
# scores pasted for comparison — confirm its origin and label it, or delete the cell.
[0.160246440159461, 0.026372961837767196, 0.20887939554843984, 0.08635677160904831, 0.007748810549401915]