In [12]:
%matplotlib inline

#basic packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

#models
from xgboost import XGBClassifier
from sklearn.svm import SVC

#other model packages
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [13]:
df = pd.read_csv('../../Data/model-data/model_data.csv', index_col = 0)

#replace independent with ind. for better graphs
df.loc[df['Team Conference'] == 'Independent', 'Team Conference'] = 'Ind.'
df.loc[df['Opponent Conference'] == 'Indepedent', 'Opponent Conference'] = 'Ind.'

df.head(5)

Unnamed: 0,Team,Opponent,syn_date,Year,Team Win,Team Conference,Opponent Conference,Team Win_3_game_average,Team Earned YPPA_3_game_average,Team Earned YPRA_3_game_average,...,Opp Coaches Compensation,Opp Athletic Student Aid,Opp Total Revenues,"Opp Corporate Sponsorship, Advertising, Licensing",Opp Donor Contributions,Opp Competition Guarantees,Opp Ticket Sales,Opp Total Football Spending,Opp footall_expense_pct,Opp football_expense_pct
0,Akron,Ohio State,2011-09-01,2011,0,MAC,Big 10,,,,...,16511360.0,15129920.0,131815821.0,12162905.0,17636938.0,3853330.0,50009395.0,44287914.0,0.362164,0.362164
1,Alabama,Kent State,2011-09-01,2011,1,SEC,MAC,,,,...,3763340.0,5287795.0,21448821.0,424323.0,435339.0,822050.0,669316.0,11774328.0,0.567253,0.567253
2,Arkansas State,Illinois,2011-09-01,2011,0,Sun Belt,Big 10,,,,...,10259368.0,9847571.0,77863883.0,4553520.0,16437955.0,1376000.0,16533261.0,29526365.0,0.401846,0.401846
3,Army,Northern Illinois,2011-09-01,2011,0,Ind.,MAC,,,,...,3593287.0,6810435.0,24148602.0,370107.0,1329546.0,1629150.0,864185.0,13076496.0,0.573691,0.573691
4,Auburn,Utah State,2011-09-01,2011,1,SEC,WAC,,,,...,3887698.0,4389402.0,22777944.0,564983.0,1869811.0,831640.0,1965806.0,11704878.0,0.594142,0.594142


In [14]:
#one hot encode Team, Opponent, Team Conference, Opponent Conference
for category in ['Team', 'Opponent', 'Team Conference', 'Opponent Conference']:
    #define encoder
    ohe = OneHotEncoder(handle_unknown='ignore')
    
    #fit/transform
    ohe.fit(df[category].values.reshape(-1, 1))
    new_columns = ohe.transform(df[category].values.reshape(-1, 1)).toarray()
    
    #convert to dataframe
    ohe_df = pd.DataFrame(new_columns, columns = ohe.get_feature_names())

    #rename columns
    ohe_df.rename(columns = {column:column.replace('x0_', category + ' ') for column in df.columns}, 
                       inplace = True)
    
    #join back into main dataframe
    df = pd.concat([df, ohe_df], axis = 1)

In [15]:
#split into train/blind dataset
train = df[df.Year < df.Year.max()]
blind = df[df.Year == df.Year.max()]

In [23]:
#create folds for training data
kfold_df = train[['Team Conference', 'Team Win']]

#stratify on conference and wins
skf = StratifiedKFold(n_splits = 5, random_state = 1234, shuffle = True)
splits = skf.split(train['Team Conference'], train['Opponent Conference'], train['Team Win'])

#create list for custom folds
customFolds = []
for x, y in splits:
    customFolds.append((x, y))

In [24]:
#set seed
random_state = np.random.RandomState(1234)

#random cv hyper-parameter tuning for xgboost
xgbParams = {
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': [depth for depth in range(1, 10)],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.667, 0.75],
        'n_estimators' : [100, 200, 500],
        'objective': ['binary:logistic']
    }

#initiate model
xgbModel = XGBClassifier()

#randomized search
xgb_search = RandomizedSearchCV(estimator = xgbModel,
                           param_grid = xgbParams,                   
                           cv = customFolds,
                           verbose = 1,
                           n_jobs = -1,
                           random_state = 1234)



[(array([    0,     1,     2, ..., 10228, 10229, 10230]),
  array([   40,    42,    54, ..., 10223, 10224, 10231])),
 (array([    0,     1,     2, ..., 10228, 10229, 10231]),
  array([   10,    28,    52, ..., 10221, 10222, 10230])),
 (array([    0,     2,     3, ..., 10226, 10230, 10231]),
  array([    1,     7,    11, ..., 10227, 10228, 10229])),
 (array([    1,     3,     4, ..., 10229, 10230, 10231]),
  array([    0,     2,     5, ..., 10218, 10225, 10226])),
 (array([    0,     1,     2, ..., 10229, 10230, 10231]),
  array([    3,     4,     9, ..., 10190, 10208, 10220]))]

In [None]:
#define numeric type
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

#create X, y train/test columns
X_train = train.drop('Team Win', axis = 1)
y_train = train['Team Win']

#only keep numeric columns
X_train = X_train.select_dtypes(include=numerics)
X_test = blind[X_train.columns]
y_test = blind['Team Win']



In [None]:
#fit cv
xgb_search.fit(X_train, Y_train)

#get best params
