In [710]:
import pandas as pd
import numpy as np
from datetime import datetime
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [673]:
# Loads in the fighter-stats df
fighters = pd.read_csv("fighter-stats.csv")
# Drops unwanted columns
fighters = fighters.drop(columns = ['L', 'D'])
# Seperate the record into w, l, d and make three new columns
pattern = r"Record:\s(\d+)-(\d+)-(\d+)"
fighters[['win', 'loss', 'draw']] = fighters['W'].str.extract(pattern).astype(int)
# Drop the record column
fighters = fighters.drop('W', axis=1)

In [674]:
# Takes in height in ft and convert it to inches
def convert_to_inches(string):
    if pd.isna(string):
         return string
    string_list = string.split("'")
    ft = int(string_list[0].strip())
    inches = int(string_list[1].replace("\"", "").strip())
    return ft * 12 + inches

In [675]:
# Convert height to inches
fighters['Height(inches)'] = fighters['Height(inches)'].apply(convert_to_inches) 

In [676]:
# Rename column names
fighters = fighters.rename(columns={'Height(inches)': 'Height', 'Weight(lbs)': 'Weight', 'Reach(inches)': 'Reach'})

In [677]:
# Convert DOB column to datetime
fighters['DOB'] = pd.to_datetime(fighters['DOB'])

# Calculate age
today = datetime.today()
fighters['DOB'] = fighters['DOB'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

In [678]:
# Rename DOB column to age
fighters.rename(columns={'DOB': 'Age'}, inplace=True)

In [679]:
# Convert percentages to float numbers
def percentages_to_float(column):
    return column.str.rstrip('%').astype(float) / 100
fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']] = fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']].apply(percentages_to_float)

In [680]:
# Set fighters' full name as the new index
fighters.set_index('Full Name', inplace=True)

In [681]:
# Impute missing height with weight class average height
fighters['Height'] = fighters.groupby('Weight')['Height'].transform(
    lambda x: x.fillna(x.mean())
)
# Impute missing reach with weight class average reach
fighters['Reach'] = fighters.groupby('Weight')['Reach'].transform(
    lambda x: x.fillna(x.mean())
)
# Drop rows with missing weight because it's a low percentage (2%)
fighters = fighters[~fighters['Weight'].isnull()]
# Impute missing stance value with the mode
fighters['Stance'] = fighters['Stance'].fillna('Orthodox')

In [682]:
columns_to_check = ['SLpM.', 'Str.Acc.', 'SApM', 'Str.Def']
fighters = fighters[(fighters['SLpM.'] > 0) & (fighters['Str.Acc.'] > 0) & (fighters['SApM'] > 0) & (fighters['Str.Def'] > 0)]
fighters.drop('Age', axis=1, inplace=True)

In [683]:
fighters = fighters[~fighters['Reach'].isnull()]

In [684]:
full_name_column = np.array(fighters.index)

In [685]:
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the data
encoded_array = encoder.fit_transform(fighters[['Stance']])
print(encoded_array)

# Convert to a DataFrame for better readability
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(['Stance']))
encoded_df

[[0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]


Unnamed: 0,Stance_Open Stance,Stance_Orthodox,Stance_Sideways,Stance_Southpaw,Stance_Switch
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
2979,0.0,1.0,0.0,0.0,0.0
2980,0.0,0.0,0.0,0.0,1.0
2981,0.0,1.0,0.0,0.0,0.0
2982,0.0,1.0,0.0,0.0,0.0


In [686]:
fighters.drop('Stance', axis=1, inplace=True)

In [687]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
fighters_standardized = scaler.fit_transform(fighters)

# Convert back to a DataFrame for better readability
fighters_standardized = pd.DataFrame(fighters_standardized, columns=fighters.columns)
fighters_standardized

Unnamed: 0,Height,Weight,Reach,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw
0,0.304128,-0.282150,-0.076340,0.124790,-0.499181,0.286975,0.466174,-0.782651,-1.158884,0.879481,-0.507376,-1.237327,0.036055,-0.323894
1,-0.554426,-0.282150,-1.461499,-0.043097,-2.039483,0.834584,-0.527571,-0.782651,-1.158884,0.527156,-0.507376,1.802550,-0.416951,-0.323894
2,0.590312,2.918682,1.521933,-0.998317,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,-0.507376,-0.477358,2.074585,-0.323894
3,1.162682,2.889584,0.033777,0.460565,0.698831,-0.269326,0.646855,0.776815,1.540487,-1.586792,-0.507376,-1.110665,-1.322965,-0.323894
4,1.162682,0.590804,1.778267,2.046809,0.869976,0.200053,0.104812,-0.782651,-1.158884,0.815422,-0.507376,-0.857342,-1.322965,-0.323894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,1.162682,1.172774,0.781416,3.181495,0.955548,0.799816,0.104812,-0.782651,-1.158884,-1.586792,-0.507376,0.535934,0.036055,-0.323894
2980,-0.554426,-0.864119,-0.963073,-0.703067,-0.328037,-0.603976,0.285493,3.240771,1.180571,0.110772,0.597395,-0.604019,-0.869958,-0.323894
2981,-0.268242,-0.864119,-0.963073,0.124790,-0.499181,-0.890819,2.453664,-0.366793,2.440277,0.014684,-0.507376,-0.604019,-0.869958,-0.323894
2982,-1.412980,-1.155104,-1.959924,-0.946214,-1.953911,-0.404055,-0.346890,-0.439568,-0.583018,0.270920,-0.507376,-1.490650,-0.416951,-0.323894


In [688]:
fighters_standardized['Full Name'] = full_name_column
fighters_standardized.drop_duplicates(inplace=True)

In [689]:
fighters_standardized

Unnamed: 0,Height,Weight,Reach,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw,Full Name
0,0.304128,-0.282150,-0.076340,0.124790,-0.499181,0.286975,0.466174,-0.782651,-1.158884,0.879481,-0.507376,-1.237327,0.036055,-0.323894,Danny Abbadi
1,-0.554426,-0.282150,-1.461499,-0.043097,-2.039483,0.834584,-0.527571,-0.782651,-1.158884,0.527156,-0.507376,1.802550,-0.416951,-0.323894,Nariman Abbasov
2,0.590312,2.918682,1.521933,-0.998317,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,-0.507376,-0.477358,2.074585,-0.323894,David Abbott
3,1.162682,2.889584,0.033777,0.460565,0.698831,-0.269326,0.646855,0.776815,1.540487,-1.586792,-0.507376,-1.110665,-1.322965,-0.323894,Hamdy Abdelwahab
4,1.162682,0.590804,1.778267,2.046809,0.869976,0.200053,0.104812,-0.782651,-1.158884,0.815422,-0.507376,-0.857342,-1.322965,-0.323894,Mansur Abdul-Malik
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,1.162682,1.172774,0.781416,3.181495,0.955548,0.799816,0.104812,-0.782651,-1.158884,-1.586792,-0.507376,0.535934,0.036055,-0.323894,Zhang Mingyang
2980,-0.554426,-0.864119,-0.963073,-0.703067,-0.328037,-0.603976,0.285493,3.240771,1.180571,0.110772,0.597395,-0.604019,-0.869958,-0.323894,Daermisi Zhawupasi
2981,-0.268242,-0.864119,-0.963073,0.124790,-0.499181,-0.890819,2.453664,-0.366793,2.440277,0.014684,-0.507376,-0.604019,-0.869958,-0.323894,Daria Zhelezniakova
2982,-1.412980,-1.155104,-1.959924,-0.946214,-1.953911,-0.404055,-0.346890,-0.439568,-0.583018,0.270920,-0.507376,-1.490650,-0.416951,-0.323894,Yao Zhikui


In [690]:
fights = pd.read_csv("fight-matchups.csv")
fights.head(10)

Unnamed: 0,fighter1,fighter2
0,Petr Yan,Deiveson Figueiredo
1,Yan Xiaonan,Tabatha Ricci
2,Muslim Salikhov,Song Kenan
3,Gabriella Fernandes,Wang Cong
4,Carlos Ulberg,Volkan Oezdemir
5,Zhang Mingyang,Ozzy Diaz
6,SuYoung You,Baergeng Jieleyisi
7,DongHun Choi,Kiru Sahota
8,Shi Ming,Feng Xiaocan
9,Carlos Hernandez,Nyamjargal Tumendemberel


In [691]:
matchups = fights.merge(fighters_standardized, left_on = 'fighter1', right_on = 'Full Name')
fighter2 = matchups['fighter2']
matchups['Full Name'] = fighter2
matchups.drop('fighter2', axis=1, inplace=True)

In [692]:
matchups = matchups.merge(fighters_standardized, left_on = 'Full Name', right_on = 'Full Name')

In [693]:
matchups.rename(columns={'Full Name': 'fighter2'}, inplace=True)

In [694]:
matchups.drop(columns=['fighter1', 'fighter2'], axis=1, inplace=True)

In [695]:
matchups

Unnamed: 0,Height_x,Weight_x,Reach_x,SLpM._x,Str.Acc._x,SApM_x,Str.Def_x,TD Avg._x,TD Acc._x,TD Def._x,...,Str.Acc._y,SApM_y,Str.Def_y,TD Avg._y,TD Acc._y,TD Def._y,Sub. Avg._y,win_y,loss_y,draw_y
0,-0.840611,-0.864119,-1.212286,1.178427,0.869976,0.165284,0.556514,0.054263,0.604705,1.103688,...,0.869976,-0.086790,-0.256550,0.059461,0.064831,0.334979,0.597395,1.295904,-0.416951,0.973420
1,-1.412980,-1.446089,-2.209137,1.108956,0.099825,-0.134597,0.737195,-0.304415,1.468503,0.431068,...,-0.670326,0.669432,0.466174,0.641661,0.172806,1.007599,-0.428464,-0.350696,-0.643455,-0.323894
2,0.304128,0.154327,-0.464648,0.176893,0.442114,-0.369286,0.917876,-0.221243,-0.007152,0.719333,...,0.185397,0.899776,-0.075869,-0.564326,-0.079136,0.751363,-0.507376,1.042581,0.715565,-0.323894
3,-1.126796,-1.155104,-1.461499,0.223207,0.185397,0.760701,-0.256550,-0.491551,-0.511035,0.334979,...,0.185397,-0.921241,1.640600,-0.179657,0.640696,1.616160,1.307605,-0.984004,-1.096461,-0.323894
4,1.735051,1.172774,1.279841,2.388373,1.041120,0.226130,-0.166210,-0.408379,1.540487,1.071658,...,0.356542,0.204399,0.285493,-0.522740,-0.115127,0.975569,-0.428464,0.789258,0.489062,-0.323894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4787,0.017943,0.154327,0.443646,-0.743592,0.099825,-0.908203,1.008217,0.236200,2.440277,0.142802,...,-0.328037,-1.208084,-0.166210,-0.782651,-1.158884,-0.786054,1.149780,0.915919,1.621579,0.973420
4788,0.876497,1.172774,0.781416,-0.129935,1.126693,-0.964702,0.646855,0.662454,0.496730,0.270920,...,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,-0.033903,1.549227,1.848082,-0.323894
4789,0.590312,0.590804,0.532203,-0.859376,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,...,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,-0.507376,-0.477358,2.074585,-0.323894
4790,0.876497,1.172774,0.781416,-0.691489,0.698831,-0.490977,-1.069614,1.239457,0.280780,0.014684,...,1.041120,-0.699590,-0.346890,-0.574722,2.440277,0.110772,0.123922,1.169242,3.660108,0.973420


In [696]:
# Split the DataFrame into two halves (vertically)
half = len(matchups.columns) // 2  # Number of columns to split

matchups1 = matchups.iloc[:, :half]  # First half
matchups2 = matchups.iloc[:, half:]  # Second half

matchups_reversed = pd.concat([matchups2, matchups1], axis=1)
matchups_reversed

Unnamed: 0,Height_y,Weight_y,Reach_y,SLpM._y,Str.Acc._y,SApM_y,Str.Def_y,TD Avg._y,TD Acc._y,TD Def._y,...,Str.Acc._x,SApM_x,Str.Def_x,TD Avg._x,TD Acc._x,TD Def._x,Sub. Avg._x,win_x,loss_x,draw_x
0,-1.412980,-0.864119,-0.963073,-0.095200,0.869976,-0.086790,-0.256550,0.059461,0.064831,0.334979,...,0.869976,0.165284,0.556514,0.054263,0.604705,1.103688,-0.428464,0.535934,-0.190448,-0.323894
1,-2.557719,-1.446089,-2.707563,0.640030,-0.670326,0.669432,0.466174,0.641661,0.172806,1.007599,...,0.099825,-0.134597,0.737195,-0.304415,1.468503,0.431068,-0.507376,0.662596,-0.416951,-0.323894
2,0.590312,0.154327,-0.215435,0.819496,0.185397,0.899776,-0.075869,-0.564326,-0.079136,0.751363,...,0.442114,-0.369286,0.917876,-0.221243,-0.007152,0.719333,-0.507376,0.915919,-0.190448,-0.323894
3,-1.126796,-1.155104,-1.461499,1.583672,0.185397,-0.921241,1.640600,-0.179657,0.640696,1.616160,...,0.185397,0.760701,-0.256550,-0.491551,-0.511035,0.334979,-0.270639,-0.477358,-0.643455,-0.323894
4,1.162682,1.172774,0.781416,1.085800,0.356542,0.204399,0.285493,-0.522740,-0.115127,0.975569,...,1.041120,0.226130,-0.166210,-0.408379,1.540487,1.071658,-0.349551,-0.224035,-1.096461,-0.323894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4787,-0.554426,0.154327,0.443646,-1.432508,-0.328037,-1.208084,-0.166210,-0.782651,-1.158884,-0.786054,...,0.099825,-0.908203,1.008217,0.236200,2.440277,0.142802,0.281746,1.929211,0.262559,2.270734
4788,0.590312,0.590804,0.532203,-0.859376,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,...,1.126693,-0.964702,0.646855,0.662454,0.496730,0.270920,-0.112815,0.662596,1.168572,-0.323894
4789,0.590312,2.918682,1.521933,-0.998317,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,...,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,-0.033903,1.549227,1.848082,-0.323894
4790,1.448866,2.453107,0.781416,-0.714646,1.041120,-0.699590,-0.346890,-0.574722,2.440277,0.110772,...,0.698831,-0.490977,-1.069614,1.239457,0.280780,0.014684,0.123922,0.282611,0.942069,-0.323894


In [697]:
matchups_reversed.columns = matchups.columns

In [698]:
matchups['Result'] = 1
matchups_reversed['Result'] = 0

In [699]:
matchups_reversed

Unnamed: 0,Height_x,Weight_x,Reach_x,SLpM._x,Str.Acc._x,SApM_x,Str.Def_x,TD Avg._x,TD Acc._x,TD Def._x,...,SApM_y,Str.Def_y,TD Avg._y,TD Acc._y,TD Def._y,Sub. Avg._y,win_y,loss_y,draw_y,Result
0,-1.412980,-0.864119,-0.963073,-0.095200,0.869976,-0.086790,-0.256550,0.059461,0.064831,0.334979,...,0.165284,0.556514,0.054263,0.604705,1.103688,-0.428464,0.535934,-0.190448,-0.323894,0
1,-2.557719,-1.446089,-2.707563,0.640030,-0.670326,0.669432,0.466174,0.641661,0.172806,1.007599,...,-0.134597,0.737195,-0.304415,1.468503,0.431068,-0.507376,0.662596,-0.416951,-0.323894,0
2,0.590312,0.154327,-0.215435,0.819496,0.185397,0.899776,-0.075869,-0.564326,-0.079136,0.751363,...,-0.369286,0.917876,-0.221243,-0.007152,0.719333,-0.507376,0.915919,-0.190448,-0.323894,0
3,-1.126796,-1.155104,-1.461499,1.583672,0.185397,-0.921241,1.640600,-0.179657,0.640696,1.616160,...,0.760701,-0.256550,-0.491551,-0.511035,0.334979,-0.270639,-0.477358,-0.643455,-0.323894,0
4,1.162682,1.172774,0.781416,1.085800,0.356542,0.204399,0.285493,-0.522740,-0.115127,0.975569,...,0.226130,-0.166210,-0.408379,1.540487,1.071658,-0.349551,-0.224035,-1.096461,-0.323894,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4787,-0.554426,0.154327,0.443646,-1.432508,-0.328037,-1.208084,-0.166210,-0.782651,-1.158884,-0.786054,...,-0.908203,1.008217,0.236200,2.440277,0.142802,0.281746,1.929211,0.262559,2.270734,0
4788,0.590312,0.590804,0.532203,-0.859376,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,...,-0.964702,0.646855,0.662454,0.496730,0.270920,-0.112815,0.662596,1.168572,-0.323894,0
4789,0.590312,2.918682,1.521933,-0.998317,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,...,-0.399709,-0.166210,-0.252432,1.000612,0.110772,-0.033903,1.549227,1.848082,-0.323894,0
4790,1.448866,2.453107,0.781416,-0.714646,1.041120,-0.699590,-0.346890,-0.574722,2.440277,0.110772,...,-0.490977,-1.069614,1.239457,0.280780,0.014684,0.123922,0.282611,0.942069,-0.323894,0


In [700]:
matchups_total = pd.concat([matchups, matchups_reversed], ignore_index=True)

In [701]:
matchups_total

Unnamed: 0,Height_x,Weight_x,Reach_x,SLpM._x,Str.Acc._x,SApM_x,Str.Def_x,TD Avg._x,TD Acc._x,TD Def._x,...,SApM_y,Str.Def_y,TD Avg._y,TD Acc._y,TD Def._y,Sub. Avg._y,win_y,loss_y,draw_y,Result
0,-0.840611,-0.864119,-1.212286,1.178427,0.869976,0.165284,0.556514,0.054263,0.604705,1.103688,...,-0.086790,-0.256550,0.059461,0.064831,0.334979,0.597395,1.295904,-0.416951,0.973420,1
1,-1.412980,-1.446089,-2.209137,1.108956,0.099825,-0.134597,0.737195,-0.304415,1.468503,0.431068,...,0.669432,0.466174,0.641661,0.172806,1.007599,-0.428464,-0.350696,-0.643455,-0.323894,1
2,0.304128,0.154327,-0.464648,0.176893,0.442114,-0.369286,0.917876,-0.221243,-0.007152,0.719333,...,0.899776,-0.075869,-0.564326,-0.079136,0.751363,-0.507376,1.042581,0.715565,-0.323894,1
3,-1.126796,-1.155104,-1.461499,0.223207,0.185397,0.760701,-0.256550,-0.491551,-0.511035,0.334979,...,-0.921241,1.640600,-0.179657,0.640696,1.616160,1.307605,-0.984004,-1.096461,-0.323894,1
4,1.735051,1.172774,1.279841,2.388373,1.041120,0.226130,-0.166210,-0.408379,1.540487,1.071658,...,0.204399,0.285493,-0.522740,-0.115127,0.975569,-0.428464,0.789258,0.489062,-0.323894,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9579,-0.554426,0.154327,0.443646,-1.432508,-0.328037,-1.208084,-0.166210,-0.782651,-1.158884,-0.786054,...,-0.908203,1.008217,0.236200,2.440277,0.142802,0.281746,1.929211,0.262559,2.270734,0
9580,0.590312,0.590804,0.532203,-0.859376,0.185397,-0.399709,-0.166210,-0.252432,1.000612,0.110772,...,-0.964702,0.646855,0.662454,0.496730,0.270920,-0.112815,0.662596,1.168572,-0.323894,0
9581,0.590312,2.918682,1.521933,-0.998317,-1.183760,-0.086790,-1.250295,-0.226441,0.028839,0.527156,...,-0.399709,-0.166210,-0.252432,1.000612,0.110772,-0.033903,1.549227,1.848082,-0.323894,0
9582,1.448866,2.453107,0.781416,-0.714646,1.041120,-0.699590,-0.346890,-0.574722,2.440277,0.110772,...,-0.490977,-1.069614,1.239457,0.280780,0.014684,0.123922,0.282611,0.942069,-0.323894,0


In [702]:
X = matchups_total.iloc[:, 0:-1]
Y = matchups_total[['Result']]

In [703]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [704]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, Y_train)

# Predict on test data
Y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))

Accuracy: 0.6896191966614502


  y = column_or_1d(y, warn=True)


In [705]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69       958
           1       0.69      0.69      0.69       959

    accuracy                           0.69      1917
   macro avg       0.69      0.69      0.69      1917
weighted avg       0.69      0.69      0.69      1917



In [706]:
ian = np.array(fighters_standardized[fighters_standardized['Full Name'] == 'Ian Machado Garry']).ravel()[0: -1]

In [707]:
shavkat = np.array(fighters_standardized[fighters_standardized['Full Name'] == 'Shavkat Rakhmonov']).ravel()[0: -1]

In [708]:
ian_versus_shavkat = np.append(ian, shavkat)
ian_versus_shavkat = ian_versus_shavkat[np.newaxis, :]
len(ian_versus_shavkat)

1

In [709]:
model.predict(np.array(ian_versus_shavkat))



array([0])