In [0]:
!pip install scikit-learn==0.22



In [0]:
!pip install category_encoders==2.*
!pip install eli5
!pip install pandas-profiling==2.*
!pip install pdpbox
!pip install shap



In [0]:
import pandas as pd

In [0]:
import eli5
import re
import numpy as np
import category_encoders as ce
from eli5.sklearn import PermutationImportance
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

In [0]:
fights = pd.read_csv('fights.csv')

In [0]:
fights = fights.drop(columns=['blue_record_at_fight_time', 'red_record_at_fight_time'])

In [0]:
fights = fights[(fights['red_decision'] == 'W') | (fights['red_decision'] == 'L') | (fights['red_decision'] == 'D')]

In [0]:
train, test = train_test_split(fights, test_size=0.20)

In [0]:
train, val = train_test_split(train, test_size=0.20)

In [0]:
target = 'red_decision'
features = train.columns.drop([target, 'result'])

X_train = train[features]
y_train = train[target]

X_val = train[features]
y_val = train[target]

X_test = test[features]

In [0]:
X_train.head()

Unnamed: 0,date,blue,red,red_br_id,blue_br_id,title_fight,venue,red_born,red_debut,red_division,red_height,red_nationality,red_reach,red_stance,blue_born,blue_debut,blue_division,blue_height,blue_nationality,blue_reach,sex,blue_stance,red_age,blue_age,red_age_at_fight_time,red_years_active,blue_age_at_fight_time,blue_years_active
23039,2015-12-17,Roberto Santos,Cedric Vitu,342645,313172,True,"Cirque d'Hiver, Paris",1985-08-18,2005-12-17,super welter,178cm,France,180cm,southpaw,,,,,,,,,34.0,,11078 days 00:00:00.000000000,3652 days 00:00:00.000000000,,
47377,2016-07-30,Edwin Saul Garcia,Javier Cruz Alvarez,769593,761528,,"Auditorio Municipal, Cabo San Lucas",,2016-07-30,bantam,,Mexico,,,,,,,,,,,,,,0 days 00:00:00.000000000,,
24434,2007-07-06,Jose Rivera,Antonio DeMarco,260221,/52632,False,"Auditorio Benito Juarez, Los Mochis",1986-01-07,2004-06-21,welter,178cm,Mexico,180cm,southpaw,,,,,,,,,33.0,,7850 days 00:00:00.000000000,1110 days 00:00:00.000000000,,
35131,2017-06-28,Omari Uliza,Ibrahim Pazi,764188,767795,False,"Manyara Park, Manyara",,2016-07-16,welter,,Tanzania,,,,,,,,,,,,,,347 days 00:00:00.000000000,,
16736,2018-10-13,Jesus Francisco Soto*,Jose Carlos Cuevas,620121,835913,False,"Bar La Oficina, Tijuana",1987-09-29,2012-06-23,heavy,,Mexico,,,1990-03-07,2018-04-20,heavy,,Mexico,,male,,32.0,29.0,11337 days 00:00:00.000000000,2303 days 00:00:00.000000000,10447 days 00:00:00.000000000,176 days 00:00:00.000000000


In [0]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
#KNNImputer makes colab crash :/

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer()
)

X_train_transformed = pipeline.fit_transform(X_train)
X_val_transformed = pipeline.transform(X_val)
X_test_transformed = pipeline.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [0]:
from sklearn.metrics import accuracy_score

rf_y_pred = model.predict(X_val_transformed)
print(model.score(X_val_transformed, rf_y_pred))

1.0


In [0]:
permuter = PermutationImportance(
    model,
    scoring='accuracy',
    n_iter=5,
    random_state=42
)

permuter.fit(X_val_transformed, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fr

In [0]:
minimum_importance = 0
mask = permuter.feature_importances_ > minimum_importance
perm_features = X_train.columns[mask]
X_train_perm = X_train[perm_features]
X_val_perm = X_val[perm_features]
X_test_perm = X_test[perm_features]

In [0]:
from xgboost import XGBClassifier

encoder = ce.OrdinalEncoder().fit(X_train_perm)

X_train_encoded = encoder.transform(X_train_perm)
X_val_encoded = encoder.transform(X_val_perm)

xgb = XGBClassifier(n_estimators=100,
                    max_depth=5,
                    learning_rate=0.1,
                    random_state=42,
                    n_jobs=-1)

eval_set = [(X_train_encoded, y_train),
            (X_val_encoded, y_val)]

xgb.fit(X_train_encoded, y_train)

y_pred = xgb.predict(X_val_encoded)

In [0]:
from sklearn.metrics import roc_auc_score

y_pred_proba = xgb.predict_proba(X_val_encoded)
roc_auc_score(y_val, y_pred_proba, multi_class='ovo', average='macro')

0.7376930350274445

In [0]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, y_pred)  

array([[   17,   811,  2041],
       [    0, 12475, 12884],
       [    0,  4868, 32430]])

In [0]:
import pickle

filename = 'xgboost_boxing_model.sav'

pickle.dump(xgb, open(filename, 'wb'))

In [0]:
filename = 'xgboost_boxing_model.sav'

loaded_model = pickle.load(open(filename, 'rb'))

X_test_encoded = encoder.transform(X_test_perm)
y_test_predict_loaded = loaded_model.predict_proba(X_test_encoded)

print(y_test_predict_loaded[:5])

[[0.05306702 0.2710492  0.67588377]
 [0.02927866 0.26239404 0.70832735]
 [0.04731048 0.77812445 0.17456512]
 [0.04315268 0.27505428 0.68179303]
 [0.05155388 0.42949468 0.5189514 ]]


In [0]:
X_one_fight = pd.DataFrame({'date': '2019-12-19',
                            'blue': 'Stephen Plainte', 
                            'red': 'Bruno Janota', 
                            'red_br_id': 648635, 
                            'blue_br_id': '365136',
                            'title_fight': 'True',
                            'venue': 'Staples Center, Los Angeles, California, USA',
                            'red_born': '1982-02-18',
                            'red_debut': '2012-08-22',
                            'red_division': 'welter',
                            'red_height': '176cm',
                            'red_nationality': 'USA',
                            'red_reach': '67cm',
                            'red_stance': 'southpaw',
                            'blue_born': '1986-01-08',
                            'blue_debut': '2015-10-31',
                            'blue_division': 'super middle',
                            'blue_height': '181cm',
                            'blue_nationality': 'USA',
                            'blue_reach': '74cm',
                            'sex': 'male',
                            'blue_stance': 'orthodox',
                            'red_age': 37.0,
                            'blue_age': 33.0,
                            'red_age_at_fight_time': '37',
                            'red_years_active': '2646 days 00:00:00.000000000',
                            'blue_age_at_fight_time': '33',
                            'blue_years_active': '1533 days 00:00:00.000000000'},
                           index=[0])

In [0]:
X_one_fight_encoded = encoder.transform(X_one_fight)

print(loaded_model.predict_proba(X_one_fight_encoded))

[[0.04115868 0.4351267  0.5237146 ]]
