In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import joblib


In [18]:
df_original = pd.read_csv('../CSV Files/df_ufc_masters_w_reversed.csv')

In [19]:
df = df_original.copy()

In [20]:
df['finish'].value_counts()

finish
U-DEC         3556
KO/TKO        2990
SUB           1708
S-DEC          966
M-DEC           64
DQ              28
Overturned       4
Name: count, dtype: int64

In [21]:
df = df.dropna(subset=['finish'])
df = df[~df['finish'].isin(['DQ', 'Overturned'])]

In [22]:
finish_mapping = {
    'U-DEC': 'Decision',
    'S-DEC': 'Decision',
    'M-DEC': 'Decision',
    'KO/TKO': 'Finish',
    'SUB': 'Finish'
}
df['finish'] = df['finish'].map(finish_mapping)


In [23]:
df_encoded = pd.get_dummies(df['finish'], prefix='finish')
df_encoded = df_encoded.astype(int)


In [24]:
df_encoded['finish_type'] = df_encoded['finish_Finish']
df_encoded = df_encoded.drop(['finish_Decision', 'finish_Finish'], axis=1)

In [None]:
#finish_mapping = {'finish_Submission': 0,'finish_KO': 1,'finish_Decision': 2}
#df_encoded['finish_type'] = df_encoded[['finish_Submission', 'finish_KO', 'finish_Decision']].idxmax(axis=1).map(finish_mapping)

In [25]:
df_result =pd.concat([df, df_encoded], axis=1)

In [None]:
df_result['B_win_by_Decision'] = df_result['B_win_by_Decision_Split'] + df_result['B_win_by_Decision_Majority'] + df_result['B_win_by_Decision_Unanimous']
df_result['R_win_by_Decision'] = df_result['R_win_by_Decision_Split'] + df_result['R_win_by_Decision_Majority'] + df_result['R_win_by_Decision_Unanimous']

features= ['B_avg_SIG_STR_landed',
    'B_avg_SIG_STR_pct',
    'B_avg_TD_pct',
    'R_avg_SIG_STR_pct',
    'R_avg_TD_pct',
    'B_avg_SUB_ATT',
    'B_avg_TD_landed',
    'R_avg_SIG_STR_landed',
    'R_avg_SUB_ATT',
    'R_avg_TD_landed',
    'B_Height_cms',
    'B_Reach_cms',
    'R_Height_cms',
    'R_Reach_cms', 
    'R_age',
    'B_age']
#features=['B_win_by_Decision', 'R_win_by_Decision','B_win_by_KO/TKO','R_win_by_KO/TKO','B_win_by_Submission', 'R_win_by_Submission']

In [None]:
# List of columns to apply np.log
columns_to_log = ['B_avg_SIG_STR_landed', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 
                  'R_avg_SIG_STR_landed', 'R_avg_SUB_ATT', 'R_avg_TD_landed']

# Applying np.log to each column in the list
for col in columns_to_log:
    # Adding a small constant to avoid log(0) which is undefined
    df[col] = np.log(df[col] + 1e-5)

In [None]:
 #'B_win_by_Decision', 'R_win_by_Decision','B_win_by_KO/TKO','R_win_by_KO/TKO','B_win_by_Submission', 'R_win_by_Submission'

In [28]:
features = [
    'r_avg_sig_str_landed',
    'r_avg_sig_str_pct',
    'r_avg_sub_att',
    'r_avg_td_landed',
    'r_avg_td_pct',
    'r_height_cms',
    'r_reach_cms', 
    'r_age',
    'b_avg_sig_str_landed',
    'b_avg_sig_str_pct',
    'b_avg_sub_att',
    'b_avg_td_landed',
    'b_avg_td_pct',
    'b_height_cms',
    'b_reach_cms', 
    'b_age']

In [26]:
train_end_date = '2020-09-05'
test_start_date = '2020-09-06'
df_train = df_result[(df_result['date'] <= train_end_date)]
df_test = df_result[(df_result['date'] >= test_start_date)]

In [29]:
X_train =df_train[features]
y_train= df_train['finish_type']
X_test =df_test[features]
y_test= df_test['finish_type']

In [31]:
xgb_model = xgb.XGBClassifier(n_estimators=100,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    use_label_encoder=False)
xgb_model.fit(X_train, y_train)

In [32]:
y_pred_percent= xgb_model.predict_proba(X_test)
y_pred_percent

array([[0.41053265, 0.58946735],
       [0.47548503, 0.524515  ],
       [0.45316154, 0.54683846],
       ...,
       [0.43247247, 0.56752753],
       [0.45698017, 0.54301983],
       [0.53080994, 0.46919006]], dtype=float32)

In [33]:
y_pred_proba_df = pd.DataFrame(y_pred_percent, columns=['Probability_Decision', 'Probability_Finish'])

In [39]:
df_test.reset_index(drop=True, inplace=True)
y_pred_proba_df.reset_index(drop=True, inplace=True)
result_df = pd.concat([df_test, y_pred_proba_df], axis=1)

In [34]:
df_test.reset_index(drop=True, inplace=True)
y_pred_proba_df.reset_index(drop=True, inplace=True)
result_df = pd.concat([df_test, y_pred_proba_df], axis=1)

In [35]:
y_pred_test = xgb_model.predict(X_test)
y_pred_train = xgb_model.predict(X_train)

In [36]:
accuracy = accuracy_score(y_train, y_pred_train)
conf_matrix = confusion_matrix(y_train, y_pred_train)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6072
Confusion Matrix:
 [[2393 1665]
 [1565 2599]]


In [37]:
accuracy = accuracy_score(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5508
Confusion Matrix:
 [[294 234]
 [243 291]]


In [None]:
joblib.dump(xgb_model, 'finish_method.pkl')