In [47]:
import pandas as pd
import numpy as np
from datetime import datetime
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [48]:
# Loads in the fighter-stats df
fighters = pd.read_csv("data/fighter-stats-threading.csv")
fighters

Unnamed: 0,Full Name,Height(inches),Weight(lbs),Reach(inches),Stance,DOB,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,Record
0,Tom Aaron,,155.0,,,"Jul 13, 1978",0.00,0%,0.00,0%,0.00,0%,0%,0.0,Record: 5-3-0
1,Danny Abbadi,5' 11,155.0,,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.00,0%,77%,0.0,Record: 4-6-0
2,Nariman Abbasov,5' 8,155.0,66.0,Orthodox,"Feb 01, 1994",3.00,20%,5.67,46%,0.00,0%,66%,0.0,Record: 28-4-0
3,David Abbott,6' 0,265.0,,Switch,,1.35,30%,3.55,38%,1.07,33%,66%,0.0,Record: 10-15-0
4,Hamdy Abdelwahab,6' 2,264.0,72.0,Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.00,75%,0%,0.0,Record: 5-0-0 (1 NC)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4217,Dave Zitanick,,170.0,,,"Mar 05, 1980",0.00,0%,0.00,0%,0.00,0%,0%,0.0,Record: 5-7-0 (1 NC)
4218,Alex Zuniga,,145.0,,,,0.00,0%,0.00,0%,0.00,0%,0%,0.0,Record: 6-3-0
4219,George Zuniga,5' 9,185.0,,,,7.64,38%,5.45,37%,0.00,0%,100%,0.0,Record: 3-1-0
4220,Allan Zuniga,5' 7,155.0,70.0,Orthodox,"Apr 04, 1992",3.93,52%,1.80,61%,0.00,0%,57%,1.0,Record: 13-1-0


In [49]:
# Seperate the record into w, l, d and make three new columns
pattern = r"Record:\s(\d+)-(\d+)-(\d+)"
fighters[['win', 'loss', 'draw']] = fighters['Record'].str.extract(pattern).astype(int)
# Drop the record column
fighters = fighters.drop('Record', axis=1)

In [50]:
# Takes in height in ft and convert it to inches
def convert_to_inches(string):
    if pd.isna(string):
         return string
    string_list = string.split("'")
    ft = int(string_list[0].strip())
    inches = int(string_list[1].replace("\"", "").strip())
    return ft * 12 + inches

In [51]:
# Convert height to inches
fighters['Height(inches)'] = fighters['Height(inches)'].apply(convert_to_inches) 
fighters

Unnamed: 0,Full Name,Height(inches),Weight(lbs),Reach(inches),Stance,DOB,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw
0,Tom Aaron,,155.0,,,"Jul 13, 1978",0.00,0%,0.00,0%,0.00,0%,0%,0.0,5,3,0
1,Danny Abbadi,71.0,155.0,,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.00,0%,77%,0.0,4,6,0
2,Nariman Abbasov,68.0,155.0,66.0,Orthodox,"Feb 01, 1994",3.00,20%,5.67,46%,0.00,0%,66%,0.0,28,4,0
3,David Abbott,72.0,265.0,,Switch,,1.35,30%,3.55,38%,1.07,33%,66%,0.0,10,15,0
4,Hamdy Abdelwahab,74.0,264.0,72.0,Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.00,75%,0%,0.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4217,Dave Zitanick,,170.0,,,"Mar 05, 1980",0.00,0%,0.00,0%,0.00,0%,0%,0.0,5,7,0
4218,Alex Zuniga,,145.0,,,,0.00,0%,0.00,0%,0.00,0%,0%,0.0,6,3,0
4219,George Zuniga,69.0,185.0,,,,7.64,38%,5.45,37%,0.00,0%,100%,0.0,3,1,0
4220,Allan Zuniga,67.0,155.0,70.0,Orthodox,"Apr 04, 1992",3.93,52%,1.80,61%,0.00,0%,57%,1.0,13,1,0


In [52]:
# Rename column names
fighters = fighters.rename(columns={'Height(inches)': 'Height', 'Weight(lbs)': 'Weight', 'Reach(inches)': 'Reach'})

In [53]:
fighters

Unnamed: 0,Full Name,Height,Weight,Reach,Stance,DOB,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw
0,Tom Aaron,,155.0,,,"Jul 13, 1978",0.00,0%,0.00,0%,0.00,0%,0%,0.0,5,3,0
1,Danny Abbadi,71.0,155.0,,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.00,0%,77%,0.0,4,6,0
2,Nariman Abbasov,68.0,155.0,66.0,Orthodox,"Feb 01, 1994",3.00,20%,5.67,46%,0.00,0%,66%,0.0,28,4,0
3,David Abbott,72.0,265.0,,Switch,,1.35,30%,3.55,38%,1.07,33%,66%,0.0,10,15,0
4,Hamdy Abdelwahab,74.0,264.0,72.0,Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.00,75%,0%,0.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4217,Dave Zitanick,,170.0,,,"Mar 05, 1980",0.00,0%,0.00,0%,0.00,0%,0%,0.0,5,7,0
4218,Alex Zuniga,,145.0,,,,0.00,0%,0.00,0%,0.00,0%,0%,0.0,6,3,0
4219,George Zuniga,69.0,185.0,,,,7.64,38%,5.45,37%,0.00,0%,100%,0.0,3,1,0
4220,Allan Zuniga,67.0,155.0,70.0,Orthodox,"Apr 04, 1992",3.93,52%,1.80,61%,0.00,0%,57%,1.0,13,1,0


In [54]:
# Convert DOB column to datetime
fighters['DOB'] = pd.to_datetime(fighters['DOB'])

# Calculate age
today = datetime.today()
fighters['DOB'] = fighters['DOB'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

In [55]:
# Rename DOB column to age
fighters.rename(columns={'DOB': 'Age'}, inplace=True)

In [56]:
# Convert percentages to float numbers
def percentages_to_float(column):
    return column.str.rstrip('%').astype(float) / 100
fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']] = fighters[['Str.Acc.', 'Str.Def', 'TD Acc.', 'TD Def.']].apply(percentages_to_float)

In [57]:
# Set fighters' full name as the new index
fighters.set_index('Full Name', inplace=True)

In [58]:
# Impute missing height with weight class average height
fighters['Height'] = fighters.groupby('Weight')['Height'].transform(
    lambda x: x.fillna(x.mean())
)
# Impute missing reach with weight class average reach
fighters['Reach'] = fighters.groupby('Weight')['Reach'].transform(
    lambda x: x.fillna(x.mean())
)
# Drop rows with missing weight because it's a low percentage (2%)
fighters = fighters[~fighters['Weight'].isnull()]
# Impute missing stance value with the mode
fighters['Stance'] = fighters['Stance'].fillna('Orthodox')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fighters['Stance'] = fighters['Stance'].fillna('Orthodox')


In [59]:
columns_to_check = ['SLpM.', 'Str.Acc.', 'SApM', 'Str.Def']
fighters = fighters[(fighters['SLpM.'] > 0) & (fighters['Str.Acc.'] > 0) & (fighters['SApM'] > 0) & (fighters['Str.Def'] > 0)]
fighters.drop('Age', axis=1, inplace=True)

In [60]:
fighters

Unnamed: 0_level_0,Height,Weight,Reach,Stance,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw
Full Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Danny Abbadi,71.0,155.0,71.576503,Orthodox,3.29,0.38,4.41,0.57,0.00,0.00,0.77,0.0,4,6,0
Nariman Abbasov,68.0,155.0,66.000000,Orthodox,3.00,0.20,5.67,0.46,0.00,0.00,0.66,0.0,28,4,0
David Abbott,72.0,265.0,77.871795,Switch,1.35,0.30,3.55,0.38,1.07,0.33,0.66,0.0,10,15,0
Hamdy Abdelwahab,74.0,264.0,72.000000,Southpaw,3.87,0.52,3.13,0.59,3.00,0.75,0.00,0.0,5,0,0
Mansur Abdul-Malik,74.0,185.0,79.000000,Orthodox,6.61,0.54,4.21,0.53,0.00,0.00,0.75,0.0,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
James Zikic,74.0,205.0,75.846154,Orthodox,1.47,0.35,1.60,0.44,0.50,0.25,0.74,0.5,21,10,2
Cat Zingano,66.0,145.0,68.000000,Southpaw,2.57,0.61,1.63,0.47,2.77,0.65,0.42,0.8,10,4,0
George Zuniga,69.0,185.0,75.077821,Orthodox,7.64,0.38,5.45,0.37,0.00,0.00,1.00,0.0,3,1,0
Allan Zuniga,67.0,155.0,70.000000,Orthodox,3.93,0.52,1.80,0.61,0.00,0.00,0.57,1.0,13,1,0


In [61]:
fighters = fighters[~fighters['Reach'].isnull()]

In [62]:
full_name_column = np.array(fighters.index)

In [63]:
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the data
encoded_array = encoder.fit_transform(fighters[['Stance']])
print(encoded_array)

# Convert to a DataFrame for better readability
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(['Stance']))
encoded_df

[[0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]


Unnamed: 0,Stance_Open Stance,Stance_Orthodox,Stance_Sideways,Stance_Southpaw,Stance_Switch
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
3274,0.0,1.0,0.0,0.0,0.0
3275,0.0,0.0,0.0,1.0,0.0
3276,0.0,1.0,0.0,0.0,0.0
3277,0.0,1.0,0.0,0.0,0.0


In [64]:
fighters.drop('Stance', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fighters.drop('Stance', axis=1, inplace=True)


In [65]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
fighters_standardized = scaler.fit_transform(fighters)

# Convert back to a DataFrame for better readability
fighters_standardized = pd.DataFrame(fighters_standardized, columns=fighters.columns)
fighters_standardized

Unnamed: 0,Height,Weight,Reach,SLpM.,Str.Acc.,SApM,Str.Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,win,loss,draw
0,0.282398,-0.303697,-0.087593,0.132759,-0.488670,0.265847,0.499067,-0.773633,-1.159647,0.889281,-0.483597,-1.209842,0.037276,-0.327225
1,-0.570899,-0.303697,-1.480631,-0.037220,-2.006444,0.801253,-0.495895,-0.773633,-1.159647,0.538818,-0.483597,1.760264,-0.412916,-0.327225
2,0.566831,2.836891,1.485001,-1.004342,-1.163236,-0.099588,-1.219504,-0.230008,0.037894,0.538818,-0.483597,-0.467315,2.063141,-0.327225
3,1.135695,2.808340,0.018198,0.472717,0.691821,-0.278057,0.679970,0.750549,1.562036,-1.563964,-0.483597,-1.086087,-1.313301,-0.327225
4,1.135695,0.552827,1.766832,2.078726,0.860463,0.180862,0.137263,-0.773633,-1.159647,0.825560,-0.483597,-0.838579,-1.313301,-0.327225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,1.135695,1.123843,0.978986,-0.934006,-0.741632,-0.928192,-0.676797,-0.519602,-0.252419,0.793700,-0.114912,0.893983,0.937660,2.402978
3275,-1.139764,-0.589205,-0.981021,-0.289258,1.450709,-0.915445,-0.405444,0.633695,1.199145,-0.225830,0.106299,-0.467315,-0.412916,-0.327225
3276,-0.286467,0.552827,0.787053,2.682444,-0.488670,0.707769,-1.309955,-0.773633,-1.159647,1.622069,-0.483597,-1.333596,-1.088205,-0.327225
3277,-0.855331,-0.303697,-0.481411,0.507885,0.691821,-0.843207,0.860872,-0.773633,-1.159647,0.252075,0.253773,-0.096052,-1.088205,-0.327225


In [66]:
fighters_standardized['Full Name'] = full_name_column
fighters_standardized.drop_duplicates(inplace=True)

In [72]:
# Move the full name column to the beginning
column_to_move = 'Full Name'
columns = [column_to_move] + [col for col in fighters_standardized.columns if col != column_to_move]
fighters_standardized = fighters_standardized[columns]

In [76]:
from IPython.display import FileLink
fighters_standardized.to_csv('processed.csv', index=False)
FileLink('processed.csv')

In [None]:
fights = pd.read_csv("data/fight-matchups.csv")
fights.head(10)

In [None]:
matchups = fights.merge(fighters_standardized, left_on = 'fighter1', right_on = 'Full Name')
fighter2 = matchups['fighter2']
matchups['Full Name'] = fighter2
matchups.drop('fighter2', axis=1, inplace=True)

In [None]:
matchups = matchups.merge(fighters_standardized, left_on = 'Full Name', right_on = 'Full Name')

In [None]:
matchups.rename(columns={'Full Name': 'fighter2'}, inplace=True)

In [None]:
matchups.drop(columns=['fighter1', 'fighter2'], axis=1, inplace=True)

In [None]:
matchups

In [None]:
# Split the DataFrame into two halves (vertically)
half = len(matchups.columns) // 2  # Number of columns to split

matchups1 = matchups.iloc[:, :half]  # First half
matchups2 = matchups.iloc[:, half:]  # Second half

matchups_reversed = pd.concat([matchups2, matchups1], axis=1)
matchups_reversed

In [None]:
matchups_reversed.columns = matchups.columns

In [None]:
matchups['Result'] = 1
matchups_reversed['Result'] = 0

In [None]:
matchups_reversed

In [None]:
matchups_total = pd.concat([matchups, matchups_reversed], ignore_index=True)

In [None]:
matchups_total

In [None]:
X = matchups_total.iloc[:, 0:-1]
Y = matchups_total[['Result']]

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, Y_train)

# Predict on test data
Y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))

In [None]:
# Train random forest classifier
randomForest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=20, random_state=42)
randomForest.fit(X_train, Y_train)
Y_pred = randomForest.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))

In [None]:
# Train XGBoost
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_clf.fit(X_train, Y_train)
Y_pred = xgb_clf.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

In [None]:
ian = np.array(fighters_standardized[fighters_standardized['Full Name'] == 'Alexander Volkov']).ravel()[0: -1]
print(ian)

In [None]:
shavkat = np.array(fighters_standardized[fighters_standardized['Full Name'] == 'Ciryl Gane']).ravel()[0: -1]
print(shavkat)

In [None]:
shavkat_versus_ian = np.append(shavkat, ian)
shavkat_versus_ian = shavkat_versus_ian[np.newaxis, :]
len(shavkat_versus_ian)

In [None]:
model.predict(np.array(shavkat_versus_ian))

In [None]:
import joblib

In [None]:
joblib.dump(xgb_clf, "model.pickle")