In [7]:
import pandas as pd
import numpy as np
from datetime import datetime
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Data Preparation

## Data Loading

In [8]:
# Loads in the fighter-stats df
fighters = pd.read_csv("data/fighter-stats-threading.csv")
fighters

FileNotFoundError: [Errno 2] No such file or directory: 'data/fighter-stats-threading.csv'

## Basic Data Transformation

In [None]:
# Seperate the record into w, l, d and make three new columns
pattern = r"Record:\s(\d+)-(\d+)-(\d+)"
fighters[['win', 'loss', 'draw']] = fighters['Record'].str.extract(pattern).astype(int)
# Drop the record column
fighters = fighters.drop('Record', axis=1)

# Takes in height in ft and convert it to inches
def convert_to_inches(string):
    if pd.isna(string):
         return string
    string_list = string.split("'")
    ft = int(string_list[0].strip())
    inches = int(string_list[1].replace("\"", "").strip())
    return ft * 12 + inches
# Convert height to inches
fighters['Height(inches)'] = fighters['Height(inches)'].apply(convert_to_inches) 
# Rename column names
fighters = fighters.rename(columns={'Height(inches)': 'Height', 'Weight(lbs)': 'Weight', 'Reach(inches)': 'Reach'})

# Convert DOB column to datetime
fighters['DOB'] = pd.to_datetime(fighters['DOB'])
# Calculate age
today = datetime.today()
fighters['DOB'] = fighters['DOB'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))
# Rename DOB column to age
fighters.rename(columns={'DOB': 'Age'}, inplace=True)

# Convert percentages to float numbers
def percentages_to_float(column):
    return column.str.rstrip('%').astype(float) / 100
fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']] = fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']].apply(percentages_to_float)

## Handling Missing Data

In [None]:
# Impute missing height with weight class average height
fighters['Height'] = fighters.groupby('Weight')['Height'].transform(
    lambda x: x.fillna(x.mean())
)

# Impute missing reach with weight class average reach
fighters['Reach'] = fighters.groupby('Weight')['Reach'].transform(
    lambda x: x.fillna(x.mean())
)

# Drop rows with missing weight because it's a low percentage (2%)
fighters = fighters[~fighters['Weight'].isnull()]

# Impute missing stance value with the mode
fighters['Stance'] = fighters['Stance'].fillna('Orthodox')

# Every fight starts on the feet. If a fighter is has all the striking stats as 0,
# that most likely mean the fighter is insignificant.
# Therefore, we can just remove those fighters
columns_to_check = ['SLpM.', 'Str.Acc.', 'SApM', 'Str.Def']
fighters = fighters[(fighters['SLpM.'] > 0) & (fighters['Str.Acc.'] > 0) & (fighters['SApM'] > 0) & (fighters['Str.Def'] > 0)]
fighters.drop('Age', axis=1, inplace=True)

In [None]:
fighters

## Missing Data Edge Cases

In [None]:
# Some fighters are in their own weight because of their unique weight. We just simplay remove those 131 entries
fighters = fighters[~fighters['Reach'].isnull()]

I decided to drop the Stance column all together.
If I were to perform one-hot encoding on this categorizal column, I would create 10 additional columns just for one feature, which might cause linear independence and might oversaturate the dimensions, impacting model performance.

In [None]:
# Drop the stance column
fighters.drop('Stance', axis=1, inplace=True)

In [None]:
fighters

## Perform Join

In [None]:
# Load in the fights dataset that includes all the fight matchups
fights = pd.read_csv("data/fight-matchups.csv")
fights.head(10)

In [None]:
# Merged the tables and rename them
matchups = fights.merge(fighters, left_on = 'fighter1', right_on = 'Full Name')
matchups = matchups.merge(fighters, left_on = 'fighter2', right_on = 'Full Name')
matchups = matchups.drop(columns=['fighter1', 'fighter2', 'Full Name_x', 'Full Name_y'])
matchups.columns
matchups = matchups.rename(columns={'Height_x': 'height1', 'Weight_x': 'weight1', 'Reach_x': 'reach1', 'SLpM._x': 'slpm1', 'Str.Acc._x': 'stracc1', 
                                    'SApM_x': 'sapm1', 'Str.Def_x': 'strdef1', 'TD Avg._x': 'tdavg1', 'TD Acc._x': 'tdacc1', 'TD Def._x': 'tddef1',
                                    'Sub. Avg._x': 'subavg1', 'win_x': 'win1', 'loss_x': 'loss1', 'draw_x': 'draw1'})
matchups = matchups.rename(columns={'Height_y': 'height2', 'Weight_y': 'weight2', 'Reach_y': 'reach2', 'SLpM._y': 'slpm2', 'Str.Acc._y': 'stracc2', 
                                    'SApM_y': 'sapm2', 'Str.Def_y': 'strdef2', 'TD Avg._y': 'tdavg2', 'TD Acc._y': 'tdacc2', 'TD Def._y': 'tddef2',
                                    'Sub. Avg._y': 'subavg2', 'win_y': 'win2', 'loss_y': 'loss2', 'draw_y': 'draw2'})
display(matchups)

## Flip the dataframe

In [None]:
half = len(matchups.columns) // 2  # Number of columns to split

# The first half of the dataframe that contains only information for fighter1
matchups_first_half = matchups.iloc[:, :half]  # First half

# The second half of the dataframe that contains only information for fighter2
matchups_second_half = matchups.iloc[:, half:]  # Second half

# Flips them by putting the second half first
matchups_reversed = pd.concat([matchups_second_half, matchups_first_half], axis=1)

# Rename the columns
matchups_reversed.columns = matchups.columns
display(matchups_reversed)

## Stack the two tables on top of each other

In [None]:
matchups_total = pd.concat([matchups, matchups_reversed], ignore_index=True)
display(matchups_total)

# Step 2: Exploratory Data Analysis

In [None]:
# Create a figure with 9 potential subplots
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
ax1 = axes[0]
ax2 = axes[1]
ax3 = axes[2]

# Create a scatter plot to see if a longer reach makes you a better striker
ax1.scatter(x=matchups_total['reach1'], y=matchups_total['slpm1'])
ax1.set_xlabel('Reach')
ax1.set_ylabel('Strikes Landed Per Minute')
ax1.set_title('Reach vs Striking Ability')

# Create a overlay histogram to see the win distribution of the UFC roster
ax2.hist(matchups_total['win1'], bins=20, alpha=0.5, color='g', edgecolor='black', label='Win', density=True)
ax2.set_xlabel('Number of Wins')
ax2.set_ylabel('Density')
ax2.set_title('The Win Distribution of UFC fighters')
ax2.legend()

# Create a scatter plot to see the relationship between striking defense and strikes absorbed per minute
ax3.scatter(x=matchups_total['strdef1'], y=matchups_total['sapm1'], color='orange')
ax3.set_ylim(0, 20)
ax3.set_title('Striking Defense vs Strikes Absorbed Per Minute')
ax3.set_xlabel('Striking Defense')
ax3.set_ylabel('Strikes Absorbed Per Minute')


plt.tight_layout()
plt.show()

## Heat Map

In [None]:
# Produce the correlation matrix
correlation_matrix = matchups_total.corr()

plt.figure(figsize=(20, 20))

# Use the correlation matrix to plot the heat map
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

## Heatmap Conclusion

1. Eliminate the win and draw columns. Having all win, loss, and draw create multicollinearity. I believe that loss is more important because wins are inflated since most fighters have lots of wins before coming to the ufc.
2. Drop the weight, reach, and height columns, and use their differences instead.

# Data Pipeline

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

## Create target variable and datasets

In [None]:
# Calculate half of the number of rows of the data frame
n = len(matchups_total) // 2

# Create n number of ones and n number of zeros and put them in a series
arr = np.concatenate([np.ones(n), np.zeros(n)])

# Convert it to a Pandas Series
series = pd.Series(arr)

# Create the X and Y dataset
X = matchups_total
y = series

## Custom Transformation class

In [None]:
# Create custom data transformation class to apply the heat map conclusions
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Drop the win and draw columns
        X = X.drop(columns=['win1', 'draw1', 'win2', 'draw2'])

        # Difference in the amount of losses
        X['loss_difference'] = X['loss1'] - X['loss2']

        # Drop the loss columns
        X.drop(columns=['loss1', 'loss2'])

        # Drop the striking defense, and keep only strikes absorbed per minute column
        X = X.drop(columns=['strdef1', 'strdef2'])
        
        # Difference in height (and drop individual columns)
        X['height_difference'] = X['height1'] - X['height2']
        X = X.drop(columns=['height1', 'height2'])
        
        # Drop the weight columns
        X = X.drop(columns=['weight1', 'weight2'])
        
        # Difference in reach (and drop individual columns)
        X['reach_difference'] = X['reach1'] - X['reach2']
        X = X.drop(columns=['reach1', 'reach2'])

        # Create multiplication of different features
        X['sapm_*'] = X['sapm1'] * X['sapm2']
        X = X.drop(columns=['sapm1', 'sapm2'])

        X['tdavg_*'] = X['tdavg1'] * X['tdavg2']
        X = X.drop(columns=['tdavg1', 'tdavg2'])
        return X

## Train Test Split

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create the pipeline and param_grid for logistic regression

In [None]:
# Define the pipeline with Feature Engineering, StandardScaler, PCA, and Logistic Regression
pipeline = Pipeline([
    # ('feature_engineering', FeatureEngineer()),   # Step 1: Feature Engineering
    ('scaler', StandardScaler()),                # Step 2: Standardize the data
    ('pca', PCA()),                              # Step 3: PCA (Dimensionality Reduction)
    ('log_reg', LogisticRegression(penalty='l1', solver='liblinear'))  # Step 4: Lasso Logistic Regression (L1 penalty)
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'pca__n_components': list(range(2, 29)),           # Number of PCA components
    'log_reg__C': np.linspace(0.001, 10, 10)          # Regularization strength for Lasso Logistic Regression
}

## Create the pipeline and param_grid for XGBoost

In [None]:
# Define the pipeline with Feature Engineering, StandardScaler, PCA, and XGB
pipeline = Pipeline([
    # ('feature_engineering', FeatureEngineer()),   # Step 1: Feature Engineering
    ('scaler', StandardScaler()),                # Step 2: Standardize the data
    ('pca', PCA()),                              # Step 3: PCA (Dimensionality Reduction)
    ('xgb', xgb.XGBClassifier())                 # Step 4: XGBoost Classifier
])

# Define the parameter grid for GridSearchCV (XGBoost-specific hyperparameters)
param_grid = {
    'pca__n_components': list(range(2, 29)),                  # Number of PCA components
    'xgb__n_estimators': [50, 100],               # Number of boosting rounds
    'xgb__learning_rate': [0.01, 0.1, 0.5],      # Learning rate for XGBoost
    'xgb__max_depth': [3, 5, 7],                  # Maximum depth of trees
    'xgb__subsample': [0.8, 1.0],                 # Fraction of samples for each tree
    'xgb__colsample_bytree': [0.8, 1.0]           # Fraction of features for each tree
}

## Evaluate model with grid search CV

In [None]:
# Perform Grid Search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the grid search (this will perform the feature engineering, scaling, PCA, and logistic regression)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Output the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score (cross-validation): ", grid_search.best_score_)

# Evaluate the best model on the test set
test_score = best_model.score(X_test, y_test)
print("Test Score: ", test_score)

In [None]:
## Confusion Matrix, Precision, Recall, and F1 Score

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report

y_pred = best_model.predict(X_test)

# Creates the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nPrecision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

# Create classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [1]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

NameError: name 'Y_test' is not defined

In [29]:
ian = np.array(fighters_standardized[standardized['Full Name'] == 'Alexander Volkov']).ravel()[0: -1]
print(ian)

NameError: name 'fighters_standardized' is not defined

In [None]:
shavkat = np.array(fighters_standardized[fighters_standardized['Full Name'] == 'Ciryl Gane']).ravel()[0: -1]
print(shavkat)

In [None]:
shavkat_versus_ian = np.append(shavkat, ian)
shavkat_versus_ian = shavkat_versus_ian[np.newaxis, :]
len(shavkat_versus_ian)

In [None]:
model.predict(np.array(shavkat_versus_ian))

In [None]:
import joblib

In [None]:
joblib.dump(xgb_clf, "model.pickle")

In [None]:
# Train random forest classifier
randomForest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=20, random_state=42)
randomForest.fit(X_train, Y_train)
Y_pred = randomForest.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))