# Modeling

## Imports

In [47]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import pickle

## Read in the training and test data

In [55]:
with open('../../02_Data/02_Processed_Data/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
    
with open('../../02_Data/02_Processed_Data/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)    

with open('../../02_Data/02_Processed_Data/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
    
with open('../../02_Data/02_Processed_Data/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)    

## Build some models

In [None]:
lr = LogisticRegression(penalty='l1', C=0.1, max_iter=200)
lr.fit(X_train, y_train)
print('Train:', lr.score(X_train,y_train))
print('Test:', lr.score(X_test,y_test))

In [None]:
lr = LogisticRegression(penalty='l1', C=0.1, max_iter=1000)
lr.fit(X_train, y_train)
print('Train:', lr.score(X_train,y_train))
print('Test:', lr.score(X_test,y_test))

In [20]:
pd.DataFrame(lr.coef_, columns=X_train.columns).T.sort_values(0, ascending=False)

Unnamed: 0,0
f1_head_significant_strikes_landed_diff_em,0.146705
f1_grappling_submissions_attempts_diff_em,0.132935
f2_f2_distance_body_strikes_landed_em,0.092586
f2_f2_clinch_head_strikes_landed_em,0.088485
f1_f1_grappling_takedowns_attempts_em,0.065554
f1_f2_distance_body_strikes_attempts_em,0.061498
f1_body_significant_strikes_attempts_diff_em,0.051938
f1_stance,0.049905
f2_f2_clinch_body_strikes_attempts_em,0.039569
f2_f1_clinch_significant_strikes_attempts_em,0.039033


In [17]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=3, min_samples_split=2)
rf.fit(X_train, y_train)
print('Train:', rf.score(X_train,y_train))
print('Test:',rf.score(X_test,y_test))

Train: 0.6875
Test: 0.6019607843137255


In [43]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [10, 50, 500],
    'min_samples_split': [2,3],
    'min_samples_leaf': [1,2]
}
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_train, y_train)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:',gs.score(X_test,y_test))

Best Score: 0.5528846153846154
Best Parameters: {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Test: 0.5901960784313726


In [44]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [1000],
    'min_samples_split': [2,3],
    'max_depth': [3,5,10]
}
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_train, y_train)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:',gs.score(X_test,y_test))

Best Score: 0.566198224852071
Best Parameters: {'max_depth': 3, 'min_samples_split': 3, 'n_estimators': 1000}
Test: 0.6078431372549019


In [15]:
pd.DataFrame(rf.feature_importances_, index=X_train.columns).sort_values(0, ascending=False)


Unnamed: 0,0
f1_significant_strikes_landed_diff_avg,0.012945
f1_head_significant_strikes_landed_avg_diff,0.011565
f2_head_significant_strikes_landed_diff_avg,0.010678
f2_head_significant_strikes_percent_avg_diff,0.009419
f2_significant_strikes_attempts_diff_avg,0.008538
f1_f2_clinch_head_strikes_percent_avg,0.008459
f2_significant_strikes_landed_diff_avg,0.008045
f1_distance_head_strikes_landed_avg_diff,0.007903
f2_distance_head_strikes_landed_diff_avg,0.007468
f1_distance_head_strikes_landed_diff_avg,0.007286


In [16]:
ada = AdaBoostClassifier(random_state=42)
ada_params = {}
gs = GridSearchCV(ada, param_grid=ada_params)
gs.fit(X_train, y_train)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:',gs.score(X_test,y_test))

Best Score: 0.5336538461538461
Best Parameters: {}
Test: 0.5529411764705883


In [45]:
%%time
gb = GradientBoostingClassifier(random_state=42)
gb_params = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.05, 0.1, 0.3],
    'max_depth': [3,5]
}
gb_gs = GridSearchCV(gb, param_grid=gb_params, verbose=2, n_jobs=3 )
gb_gs.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=  32.8s
[CV] learning_rate=0.05, max_depth=3, n_estimators=1000 ..............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=  33.3s
[CV] learning_rate=0.05, max_depth=3, n_estimators=1000 ..............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=  33.6s
[CV] learning_rate=0.05, max_depth=3, n_estimators=1000 ..............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=1000, total= 1.1min
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=1000, total= 1.1min
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning

[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed: 11.9min finished


CPU times: user 1min 34s, sys: 394 ms, total: 1min 34s
Wall time: 13min 26s


In [46]:
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:',gs.score(X_test,y_test))

Best Score: 0.566198224852071
Best Parameters: {'max_depth': 3, 'min_samples_split': 3, 'n_estimators': 1000}
Test: 0.6078431372549019
