In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl

# Default size, in inches, of every matplotlib figure drawn in this notebook.
width = 10
height = 5
mpl.rcParams['figure.figsize'] = [width, height]


In [None]:
from sklearn import svm
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import model_selection

from utility_fns import form_last_n_games
from utility_fns import make_train_val_test


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# All inputs live under <parent of the working directory>/data/derived.
basedir = os.path.dirname(os.getcwd())
derived_dir = os.path.join(basedir, 'data', 'derived')
stats_dir = os.path.join(derived_dir, 'stats_categories')

data = os.path.join(derived_dir, 'cleaned_matches.csv')
defense = os.path.join(stats_dir, 'DEFENSE_stats.csv')
snaps = os.path.join(stats_dir, 'SNAP_COUNTS_stats.csv')

cleaned_matches = pd.read_csv(data)
# The per-category stats tables are located above but not loaded here:
# defense_stats = pd.read_csv(defense, index_col=[0,1,2])
# snaps_stats = pd.read_csv(snaps, index_col=[0,1,2])

In [None]:
# Parse the match dates and put the matches in chronological order.
cleaned_matches['Date'] = pd.to_datetime(cleaned_matches['Date'])
cleaned_matches = cleaned_matches.sort_values(by='Date', ascending=True)
cleaned_matches.info()


In [None]:
# Each side's share of the total points scored in the match.
total_score = cleaned_matches['Team Score'] + cleaned_matches['Opponent Score']
cleaned_matches['Pct Team Score'] = cleaned_matches['Team Score'] / total_score
cleaned_matches['Pct Opponent Score'] = cleaned_matches['Opponent Score'] / total_score

## Train a classifier on the last n games each team played

In [None]:
# Columns of the upcoming match that are used directly as features.
x_cols = [
    'Team Code',
    'Opponent Code',
    'Location'
]
# Columns handed to form_last_n_games as the values to look up
# (presumably taken from each side's previous games; confirm in utility_fns).
lookup_x_cols = [
    'Team Score',
    'Opponent Score',
    'Pct Team Score',
    'Pct Opponent Score',
    'Location'
]
# Columns handed to form_last_n_games as the lookup keys.
key_x_cols = [
    'Team Code',
    'Opponent Code'
]
# Target column.
y_cols = [
    'Class'
]

# Keep only matches where every feature, lookup and target column is present.
played_matches = cleaned_matches.dropna(
    axis=0, how='any', subset=x_cols + y_cols + lookup_x_cols).copy()

# Collapse every non-positive class to 0. The result is assigned back to the
# column: calling .where(..., inplace=True) on `played_matches['Class']` is a
# chained in-place update that does not reach the frame under pandas
# Copy-on-Write.
played_matches['Class'] = played_matches['Class'].where(
    played_matches['Class'] > 0, other=0)

played_matches.info()


In [None]:
# Look-back window passed as the second argument of form_last_n_games
# (presumably the number of previous games; confirm in utility_fns).
n_previous_games = 5
last_n_matches, new_columns = form_last_n_games(
    played_matches,
    n_previous_games,
    lookup_x_cols,
    key_x_cols,
)

In [None]:
# Feature matrix: the match's own columns followed by the columns produced by
# form_last_n_games; the target is the 'Class' column.
X = pd.concat(
    [last_n_matches[x_cols], last_n_matches[new_columns]],
    axis=1,
)
y = last_n_matches[y_cols]

# Columns whose name contains "Score" are treated as numeric; every other
# column is treated as categorical.
is_score_column = X.columns.str.contains('Score')
numeric_columns = X.columns[is_score_column]
dummies_columns = X.columns[~is_score_column]
print(dummies_columns)
print(numeric_columns)

In [None]:
# Preview the first rows of the feature matrix.
X.head(n=5)

In [None]:
def dataset_transformer():
    """Build a fresh, unfitted ColumnTransformer for the feature matrix.

    The columns listed in the notebook-level ``numeric_columns`` are
    standard-scaled and those in ``dummies_columns`` are one-hot encoded.
    The encoder is created with ``handle_unknown='ignore'``, so categories
    absent at fit time do not raise at transform time.

    Returns:
        A new ``ColumnTransformer`` with the steps 'scaler' and 'one-hot'.
    """
    scaler_step = ('scaler', StandardScaler(), numeric_columns)
    one_hot_step = (
        'one-hot',
        OneHotEncoder(handle_unknown='ignore'),
        dummies_columns,
    )
    return ColumnTransformer([scaler_step, one_hot_step])

# Sanity check: fit a transformer on the full data and show its configured steps.
fitted_transformer = dataset_transformer().fit(X, y)
fitted_transformer.transformers

In [None]:
# Summary statistics of the feature matrix.
feature_summary = X.describe()
feature_summary

In [None]:
# Preview the first rows of the target frame.
y.head(n=5)

In [None]:
# Split into train / validation / test sets with a fixed seed.
X_train, X_val, X_test, y_train, y_val, y_test = make_train_val_test(
    X, y, test_pct=0.2, val_pct=0.2, random_state=0)

# The targets are kept as returned here; the cells that fit or score a model
# flatten them with .values.ravel() at the point of use.
for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, targets.shape)


In [None]:
# Coarse sweep of the SVC regularisation strength C for each kernel, plotting
# train and validation accuracy per kernel.
val_space = np.geomspace(start=0.01, stop=10, num=3)  # same grid for every kernel
y_train_flat = y_train.values.ravel()
y_val_flat = y_val.values.ravel()

# Track the pipeline with the highest validation accuracy over all
# (kernel, C) pairs so that `model` does not end up bound to whichever
# pipeline merely happened to be fitted last.
best_svc_model = None
best_svc_val_score = -np.inf

for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    train_scores = []
    val_scores = []
    for C in val_space:
        print(f"Evaluating {kernel} kernel for C={C}")
        candidate = make_pipeline(
            dataset_transformer(),
            svm.SVC(C=C, max_iter=10_000, kernel=kernel),
        )

        candidate.fit(X_train, y_train_flat)
        train_scores.append(candidate.score(X_train, y_train_flat))
        val_scores.append(candidate.score(X_val, y_val_flat))

        if val_scores[-1] > best_svc_val_score:
            best_svc_val_score = val_scores[-1]
            best_svc_model = candidate

    plt.title(f'Best C for {kernel}: {val_space[np.argmax(val_scores)]}')
    plt.plot(val_space, train_scores, label='Train score')
    plt.plot(val_space, val_scores, label='Val score')
    # plt.ylim(0, 1.1)
    plt.xscale('log')
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# Later cells read `model`; bind it to the best-on-validation pipeline.
model = best_svc_model


In [None]:
# Test-set precision / recall / F-score for `model`, i.e. whichever pipeline
# the SVC sweep cell above left bound to that name.
y_test_flat = y_test.values.ravel()  # 1-D labels, as in the other scoring calls
y_pred = model.predict(X_test)
precision, recall, fbeta_score, _ = precision_recall_fscore_support(
    y_test_flat, y_pred, average='binary')

# Display the metrics instead of silently discarding them.
precision, recall, fbeta_score

In [None]:
# Grid search over bagged decision trees: tree size x number of trees x
# bootstrap sample size. Records the configuration with the best validation
# accuracy and, separately, the one with the best training accuracy.
y_train_flat = y_train.values.ravel()
y_val_flat = y_val.values.ravel()

best_config = []
best_score = 0
best_fit_config = []
best_fit_score = 0

base_trees = [
    DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    for max_leaves in (3, 5, 10, 50)
]
for estimator in base_trees:
    for nestimators in [10, 31, 62, 93, 124, 200, 300]:
        for nsamples in [10, 20, 30, 50, 75, 100]:
            # The base estimator is passed positionally: its keyword was
            # `base_estimator` before scikit-learn 1.2 and `estimator` after,
            # so the positional form works on both. A fixed random_state
            # makes the bootstrap draws, and hence the scores, reproducible.
            bagging = BaggingClassifier(
                estimator,
                n_estimators=nestimators,
                max_samples=nsamples,
                random_state=0,
            )
            clf = make_pipeline(dataset_transformer(), bagging)
            clf.fit(X_train, y_train_flat)

            score = clf.score(X_val, y_val_flat)
            if score > best_score:
                best_config = [estimator, nestimators, nsamples]
                best_score = score

            score = clf.score(X_train, y_train_flat)
            if score > best_fit_score:
                best_fit_config = [estimator, nestimators, nsamples]
                best_fit_score = score

            print(end='.')  # one dot per fitted configuration

In [None]:
# Report the winning bagging configurations, one per line.
print(
    f"Best validation: {best_score}, {best_config}",
    f"Best training: {best_fit_score}, {best_fit_config}",
    sep="\n",
)

In [None]:
# Baseline: a single unconstrained decision tree, scored on the validation set.
# Targets are flattened to 1-D like in the other fit/score calls, and the tree
# is seeded so repeated runs give the same score.
clf = make_pipeline(dataset_transformer(), DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train.values.ravel())
clf.score(X_val, y_val.values.ravel())

In [None]:
# Histogram of the class probabilities `clf` predicts on the test set.
test_probabilities = clf.predict_proba(X_test)
plt.hist(test_probabilities)

In [None]:
# Cross-validation pool: training rows followed by validation rows.
X_cv, y_cv = (
    pd.concat([train_part, val_part])
    for train_part, val_part in ((X_train, X_val), (y_train, y_val))
)

In [None]:
# 5-fold cross-validated accuracy of a LinearSVC for 75 log-spaced values of C.
Cs = np.geomspace(0.01, 15, num=75)
cv_targets = y_cv.values.ravel()
scores = []
for c in Cs:
    model = make_pipeline(
        dataset_transformer(),
        svm.LinearSVC(dual=False, C=c, max_iter=100_000),
    )
    fold_scores = cross_val_score(model, X_cv, cv_targets, cv=5)
    scores.append(fold_scores)
    print('.', end='')  # one dot per value of C


In [None]:
# Mean +/- std of the fold scores per C, with the sorted fold scores as thin lines.
x = Cs
mean_scores = np.mean(scores, axis=1)
std_scores = np.std(scores, axis=1)
sorted_fold_scores = np.sort(scores, axis=1)

plt.errorbar(x, mean_scores, std_scores,
             label='Mean & Std Dev. of CV Score', linewidth=1)
plt.plot(x, sorted_fold_scores, linewidth=0.2)
plt.title('Cross-Validation Score')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# The C with the highest mean CV score, and the spread at that C.
mean_scores = np.mean(scores, axis=1)
std_scores = np.std(scores, axis=1)
best_mean_pos = np.argmax(mean_scores)

print('Maximum mean:', np.max(mean_scores))
print('Corresponding std deviation:', std_scores[best_mean_pos])
print('C:', Cs[best_mean_pos])

In [None]:
# The C with the smallest spread of CV scores, and the mean at that C.
mean_scores = np.mean(scores, axis=1)
std_scores = np.std(scores, axis=1)
lowest_std_pos = np.argmin(std_scores)

print('Minimum std deviation:', np.min(std_scores))
print('Corresponding mean:', mean_scores[lowest_std_pos])
print('C:', Cs[lowest_std_pos])

In [None]:
# Indices into Cs, ranked by highest mean score and by lowest std deviation.
best_mean_cs = np.argsort(np.mean(scores, axis=1))[::-1]
best_std_cs = np.argsort(np.std(scores, axis=1))

In [None]:
# Refine the search around the most promising Cs: take the top five by mean
# score and the top five by lowest std deviation, add a randomly perturbed
# copy of each, and re-score every candidate with 100-fold cross-validation.
# NOTE: this rebinds `scores`, replacing the results of the 75-point sweep.
perturbation_rng = np.random.default_rng(0)  # fixed seed: same candidates on every run
scores = []

Cs_to_examine = np.unique(np.concatenate(
    (Cs[best_mean_cs[:5]], Cs[best_std_cs[:5]])))
jitter = perturbation_rng.normal(scale=0.2, size=len(Cs_to_examine))
# abs() folds any negative perturbed value back to a positive C.
Cs_to_examine = np.unique(np.concatenate(
    [Cs_to_examine, np.abs(Cs_to_examine + jitter)]))

y_cv_flat = y_cv.values.ravel()
for c in Cs_to_examine:
    model = make_pipeline(
        dataset_transformer(),
        svm.LinearSVC(dual=False, C=c, max_iter=100_000),
    )

    scores.append(cross_val_score(
        model, X_cv, y_cv_flat, cv=100, n_jobs=-1))

    print(end='.')  # one dot per candidate C

In [None]:
# Mean +/- std of the fold scores per candidate C, with selected quantiles
# of the fold scores drawn as thin lines.
x = Cs_to_examine
quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0]
mean_scores = np.mean(scores, axis=1)
std_scores = np.std(scores, axis=1)
quantile_curves = np.quantile(scores, quantiles, axis=1).T

plt.errorbar(x, mean_scores, std_scores,
             label='Mean & Std Dev. of CV Score', linewidth=1)
plt.plot(x, quantile_curves, linewidth=0.2)
plt.title('Cross-Validation Score')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Position, within Cs_to_examine, of the candidate with the highest mean CV score.
best_mean_idx = np.mean(scores, axis=1).argmax()

print(f'Best C={Cs_to_examine[best_mean_idx]} with mean={np.mean(scores,axis=1)[best_mean_idx]}, std={np.std(scores,axis=1)[best_mean_idx]}')

In [None]:
# Refit a LinearSVC with the selected C on train + validation data and
# report its accuracy on the held-out test set.
best_C = Cs_to_examine[best_mean_idx]
final_svc = svm.LinearSVC(dual=False, C=best_C, max_iter=100_000)

model = make_pipeline(dataset_transformer(), final_svc)
model.fit(X_cv, y_cv.values.ravel())
model.score(X_test, y_test.values.ravel())

In [None]:
# Coefficients of the pipeline's second step (the fitted linear SVM), plotted
# against their position in the coefficient vector.
coefs = model.steps[1][1].coef_[0]
coef_positions = np.arange(len(coefs))
plt.scatter(x=coef_positions, y=coefs)
plt.show()

In [None]:
# Refit the same pipeline on the full data set (this overwrites the fit from
# the train + validation data) and take its decision-function value per row.
all_targets = y.values.ravel()
model.fit(X, all_targets)
confidences = model.decision_function(X)

In [None]:
# Distribution of the decision-function values.
n_bins = 25
plt.hist(confidences, bins=n_bins)
plt.show()

In [None]:
# Decision values multiplied by 2*y - 1, which is -1/+1 when the labels are
# 0/1 (assumed, not checked here): positive products are then rows whose
# decision value has the sign matching their label.
label_signs = 2 * y.to_numpy().ravel() - 1
signed_margins = confidences * label_signs
plt.hist(signed_margins, bins=25)
plt.show()

In [None]:
# Smallest value of decision value x (2*y - 1) over all rows.
label_signs = 2 * y.to_numpy().ravel() - 1
np.min(confidences * label_signs)