In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the CSV once into `df_original` and leave that frame untouched;
# the cleaning steps below operate on the independent `df` copy.
df_original = pd.read_csv(filepath_or_buffer='../CSV Files/df_ufc_masters_w_reversed.csv')
df = df_original.copy(deep=True)

In [3]:
# Keep only rows whose `finish` is recorded and is neither a DQ nor an
# overturned result; both conditions are combined into one boolean mask.
finish_col = df['finish']
has_usable_finish = finish_col.notna() & ~finish_col.isin(['DQ', 'Overturned'])
df = df[has_usable_finish]

In [4]:
# Collapse the raw finish codes into three outcome classes.
finish_mapping = {
    'U-DEC': 'Decision',
    'S-DEC': 'Decision',
    'M-DEC': 'Decision',
    'KO/TKO': 'KO',
    'SUB': 'Submission'
}

# `Series.map` turns every value that is not a key of the dict into NaN, so
# running this cell a second time used to wipe the whole column (the collapsed
# labels are not keys). Values that already are one of the target labels are
# therefore kept as they are, which makes the cell safe to re-run.
# Raw codes that are not listed above still become NaN, exactly as before.
already_collapsed = df['finish'].isin(list(finish_mapping.values()))
df['finish'] = df['finish'].map(finish_mapping).where(~already_collapsed, df['finish'])

In [5]:
# One-hot encode the collapsed finish label into `finish_<label>` columns and
# cast the indicator values to integer 0/1 in the same expression.
df_encoded = pd.get_dummies(df['finish'], prefix='finish').astype(int)

In [6]:
# Integer class codes for the model target, keyed by the one-hot column name
# of each finish class (0 = Submission, 1 = KO, 2 = Decision).
finish_mapping = {'finish_Submission': 0,'finish_KO': 1,'finish_Decision': 2}

# Derive the target directly from the label column rather than from
# `idxmax` over the one-hot columns: for a row whose one-hot columns are all
# zero (missing / unmapped finish) `idxmax` returns the first column, which
# would silently label that fight as a Submission (code 0).
finish_codes_by_label = {
    column_name[len('finish_'):]: code for column_name, code in finish_mapping.items()
}
finish_type = df['finish'].map(finish_codes_by_label)

# Fail loudly instead of training on wrong or missing targets.
unlabelled = finish_type.isna()
if unlabelled.any():
    raise ValueError(
        f"{int(unlabelled.sum())} rows have a finish value without a class code: "
        f"{sorted(df.loc[unlabelled, 'finish'].dropna().astype(str).unique())}"
    )

# Same index as `df_encoded` (both come from `df`), so this aligns row by row.
df_encoded['finish_type'] = finish_type

In [7]:
# Only the integer `finish_type` target is kept from the encoding step; the
# one-hot indicator columns are discarded before joining back onto `df`.
one_hot_columns = ['finish_Decision', 'finish_Submission', 'finish_KO']
df_encoded = df_encoded.drop(columns=one_hot_columns)

# Column-wise join (aligned on the row index) producing a new frame.
df_result = pd.concat([df, df_encoded], axis='columns')

In [8]:
# Columns fed to the model. NOTE(review): the B_/R_ prefixes presumably denote
# the blue and red corner fighters -- confirm against the dataset description.
features= ['B_avg_SIG_STR_landed',
    'B_avg_SIG_STR_pct',
    'B_avg_TD_pct',
    'R_avg_SIG_STR_pct',
    'R_avg_TD_pct',
    'B_avg_SUB_ATT',
    'B_avg_TD_landed',
    'R_avg_SIG_STR_landed',
    'R_avg_SUB_ATT',
    'R_avg_TD_landed',
    'B_Height_cms',
    'B_Reach_cms',
    'R_Height_cms',
    'R_Reach_cms', 
    'R_age',
    'B_age']

# List of columns to apply np.log
columns_to_log = ['B_avg_SIG_STR_landed', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 
                  'R_avg_SIG_STR_landed', 'R_avg_SUB_ATT', 'R_avg_TD_landed']

# Small constant added before the log to avoid log(0), which is undefined.
LOG_OFFSET = 1e-5

# The transform has to land in `df_result`, because that is the frame the
# train/test split reads from. `df_result` was built with `pd.concat`, which
# returns a new object, so transforming `df` (as this cell used to do) never
# reached the model inputs. Reading from the untouched `df` and writing into
# `df_result` also keeps the cell idempotent: re-running it does not apply the
# log a second time.
df_result[columns_to_log] = np.log(df[columns_to_log] + LOG_OFFSET)

In [9]:
# Chronological hold-out: everything up to and including `train_end_date` is
# used for training, everything from `test_start_date` onwards for testing.
train_end_date = '2020-09-05'
test_start_date = '2020-09-06'

# Compare real datetimes rather than raw strings. A string comparison is only
# chronological when every value is zero-padded ISO `YYYY-MM-DD`; any other
# format (e.g. `9/5/2020`) would be split lexicographically without an error.
# NOTE(review): the on-disk format of `date` is not visible here -- confirm.
fight_dates = pd.to_datetime(df_result['date'])
df_train = df_result[fight_dates <= pd.Timestamp(train_end_date)]
df_test = df_result[fight_dates >= pd.Timestamp(test_start_date)]

In [10]:
# For each period, separate the feature matrix from the integer
# `finish_type` target.
X_train, y_train = df_train[features], df_train['finish_type']
X_test, y_test = df_test[features], df_test['finish_type']

In [11]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so no test information leaks into it.
sScaler = StandardScaler().fit(X_train)
X_train_scaled = sScaler.transform(X_train)
X_test_scaled = sScaler.transform(X_test)

In [12]:
# Small, shallow forest with a fixed seed so repeated runs give the same model.
random_forest_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    min_samples_split=5,
    random_state=42,
)
# `fit` returns the estimator, so the fitted model is also the cell's output.
random_forest_model.fit(X_train_scaled, y_train)

In [13]:
# Per-class probabilities for every test row (values in [0, 1], not percent);
# one column per class, ordered as in `random_forest_model.classes_`.
y_pred_percent = random_forest_model.predict_proba(X=X_test_scaled)
y_pred_percent

array([[0.17645835, 0.47251689, 0.35102476],
       [0.18548581, 0.35988114, 0.45463306],
       [0.18098605, 0.42915302, 0.38986093],
       ...,
       [0.20676074, 0.3491262 , 0.44411306],
       [0.17657512, 0.37370166, 0.44972322],
       [0.18106873, 0.25172297, 0.5672083 ]])

In [14]:
# Hard class predictions for both splits, produced by the same fitted model.
y_pred_test, y_pred_train = map(
    random_forest_model.predict,
    (X_test_scaled, X_train_scaled),
)

In [15]:
# Training-split metrics. In the confusion matrix, rows are the true class and
# columns the predicted class, both in sorted label order (0, 1, 2).
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred_train)
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5215
Confusion Matrix:
 [[   0  158 1388]
 [   0  536 2082]
 [   0  306 3752]]


In [16]:
# Held-out (test) split metrics. In the confusion matrix, rows are the true
# class and columns the predicted class, both in sorted label order (0, 1, 2).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5265
Confusion Matrix:
 [[  0  20 140]
 [  0  80 292]
 [  0  49 477]]
