https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/
https://www.youtube.com/watch?v=wElwOM88xJQ

In [1]:
import numpy as np # linear algebra
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import pickle

from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
test_set_size = 0.15

In [3]:
dataset_destination = "./ufc-master.csv"
UFC_Data = pd.read_csv(dataset_destination)
UFC_Data.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Uriah Hall,Sean Strickland,175,-210,175.0,47.619048,2021-07-31,"Las Vegas, Nevada, USA",USA,Blue,...,,5.0,5:00,1500.0,650.0,225.0,2500.0,800.0,275.0,165.0
1,Cheyanne Buys,Gloria de Paula,-145,125,68.965517,125.0,2021-07-31,"Las Vegas, Nevada, USA",USA,Red,...,Kick,1.0,1:00,60.0,100.0,200.0,800.0,1400.0,900.0,900.0
2,Niklas Stolze,Jared Gooden,-180,155,55.555556,155.0,2021-07-31,"Las Vegas, Nevada, USA",USA,Blue,...,Punch,1.0,1:08,68.0,180.0,300.0,600.0,1200.0,300.0,600.0
3,Collin Anglin,Melsik Baghdasaryan,135,-155,135.0,64.516129,2021-07-31,"Las Vegas, Nevada, USA",USA,Blue,...,Kick,2.0,1:50,410.0,240.0,250.0,1000.0,850.0,650.0,225.0
4,Bryan Barberena,Jason Witt,-265,215,37.735849,215.0,2021-07-31,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,300.0,500.0,550.0,750.0,120.0,850.0


In [4]:
df_1 = pd.DataFrame(UFC_Data, columns = ["height_dif", "reach_dif", "age_dif", "Winner"])

df_1.describe(include='all')

Unnamed: 0,height_dif,reach_dif,age_dif,Winner
count,4813.0,4813.0,4813.0,4813
unique,,,,2
top,,,,Red
freq,,,,2810
mean,0.020494,-0.25181,0.34573,
std,6.97488,9.450546,5.164193,
min,-187.96,-187.96,-17.0,
25%,-5.08,-5.08,-3.0,
50%,0.0,0.0,0.0,
75%,5.08,5.08,4.0,


In [5]:
def Categorize_Winner(df):
    df['Winner_Categorized'] = df['Winner'].astype('category')

    cat_columns = df.select_dtypes(['category']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
    return df

df_1 = Categorize_Winner(df_1)
df_1 = df_1.drop(columns = ["Winner"])
df_1.head()

Unnamed: 0,height_dif,reach_dif,age_dif,Winner_Categorized
0,2.54,-7.62,-7,0
1,5.08,10.16,0,1
2,0.0,5.08,-1,0
3,0.0,-2.54,1,0
4,-5.08,-5.08,2,0


In [6]:
#0 = Blue
#1 = Red
#Winner Categorized

In [7]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4813 entries, 0 to 4812
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   height_dif          4813 non-null   float64
 1   reach_dif           4813 non-null   float64
 2   age_dif             4813 non-null   int64  
 3   Winner_Categorized  4813 non-null   int8   
dtypes: float64(2), int64(1), int8(1)
memory usage: 117.6 KB


In [8]:
df_1.isnull().sum()

height_dif            0
reach_dif             0
age_dif               0
Winner_Categorized    0
dtype: int64

In [9]:
def Decision_Tree_Classifier(X_train, y_train):
    DT = DecisionTreeClassifier(max_depth=200)
    DT.fit(X_train, y_train)
    return DT

def K_Nearest_Neighbors(X_train, y_train):
    kNN = KNeighborsClassifier(n_neighbors=2)
    kNN.fit(X_train, y_train)
    return kNN

def logistic_regression_CV(X_train, y_train):
    logregCV = LogisticRegressionCV()
    #logregCV.fit(X_train, y_train)

    pipe = make_pipeline(StandardScaler(), logregCV)
    pipe.fit(X_train, y_train)
    return pipe

def logistic_regression_CV_non_scaled(X_train, y_train):
    logregCV = LogisticRegressionCV()
    logregCV.fit(X_train, y_train)
    return logregCV

def naive_bayes(X_train, y_train):
    GNB = GaussianNB()
    GNB.fit(X_train, y_train)
    return GNB

def random_forest(X_train, y_train):
    randfor = RandomForestClassifier(max_features="auto",
                                        n_estimators=100,
                                        max_depth=None,
                                        n_jobs=-1)
    randfor.fit(X_train, y_train)
    return randfor

def Support_Vector_Machines(X_train, y_train):
    svm = SVC(probability=True)
    svm.fit(X_train, y_train)
    return svm

In [10]:
def highest_accuracy_model(X_train, X_test, y_train, y_test):
    model_list = []
    model_list.append(Decision_Tree_Classifier(X_train, y_train))
    model_list.append(K_Nearest_Neighbors(X_train, y_train))
    model_list.append(logistic_regression_CV(X_train, y_train))
    model_list.append(logistic_regression_CV_non_scaled(X_train, y_train))
    model_list.append(naive_bayes(X_train, y_train))
    model_list.append(random_forest(X_train, y_train))
    model_list.append(Support_Vector_Machines(X_train, y_train))

    model_hi = model_list[0]
    test_acc_hi = metrics.accuracy_score(y_test, model_hi.predict(X_test))
    
    for i in range(len(model_list)):
        model = model_list[i]
        train_acc_cur = metrics.accuracy_score(y_train, model.predict(X_train))
        test_acc_cur = metrics.accuracy_score(y_test, model.predict(X_test))
        print(model, " Accuracy Score (Training Set): ", train_acc_cur)
        print(model, " Accuracy Score (Test Set): ", test_acc_cur , "\n")
        if  test_acc_cur > test_acc_hi:
            test_acc_hi = test_acc_cur
            model_hi = model
            model_name_hi = i
    
    print("\nHighest Accuracy Model (On Test Set): " , model_hi , "\nTest Set Accuracy Score: " , test_acc_hi)
    return model_hi, test_acc_hi

In [11]:
def tts(df):
    features = df.drop(["Winner_Categorized"], axis=1)
    label = df['Winner_Categorized']
    return train_test_split(features, label, test_size=test_set_size, random_state=0)

In [12]:
X_train, X_test, y_train, y_test = tts(df_1)

In [13]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  0.7868491811293082
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5069252077562327 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.6807626497188951
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.4778393351800554 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.5937423612808604
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.5858725761772853 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.5912979711561965
LogisticRegressionCV()  Accuracy Score (Test Set):  0.5761772853185596 

GaussianNB()  Accuracy Score (Training Set):  0.5688095820092887
GaussianNB()  Accuracy Score (Test Set):  0.5554016620498615 

RandomForestClassifier(n_job

In [14]:
model_hi

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])

In [15]:
acc_hi

0.5858725761772853

In [16]:
#Since we are taking mainly numerical inputs and applying them to a classification problem,
#it may be best to attempt to use ANOVA feature selection.

from sklearn.feature_selection import SelectKBest, f_classif, f_regression
import math

In [17]:
UFC_Data = Categorize_Winner(UFC_Data)

In [18]:
UFC_Data.isnull().sum()

R_fighter                0
B_fighter                0
R_odds                   0
B_odds                   0
R_ev                     0
                      ... 
r_sub_odds            1048
b_sub_odds            1060
r_ko_odds             1048
b_ko_odds             1061
Winner_Categorized       0
Length: 120, dtype: int64

In [19]:
df_2 = pd.DataFrame(UFC_Data.dropna(axis='columns')) #This line drops all columns that have missing values
df_2.isnull().sum()

R_fighter             0
B_fighter             0
R_odds                0
B_odds                0
R_ev                  0
                     ..
avg_td_dif            0
empty_arena           0
constant_1            0
better_rank           0
Winner_Categorized    0
Length: 70, dtype: int64

In [20]:
#we are attempting to use ANOVA feature selection. to do this, we need ONLY numeric columns
#the following lines drop all non-numeric columns.

non_num = []
for col in df_2:
    if df_2[col].dtypes != "float64" and df_2[col].dtypes != "int64":
        non_num.append(col)
print(non_num)

['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender', 'R_Stance', 'better_rank', 'Winner_Categorized']


In [21]:
non_num.remove("Winner_Categorized")
print(non_num)

['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender', 'R_Stance', 'better_rank']


In [22]:
df_2_X = pd.DataFrame(df_2.drop(columns = non_num+["Winner_Categorized"]))
df_2_Y = pd.DataFrame(df_2, columns = ["Winner_Categorized"])

In [23]:
df_2_X.head()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_longest_win_streak,B_losses,...,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1
0,175,-210,175.0,47.619048,5,0,4,0,4,3,...,-5,1,2.54,-7.62,-7,1.98,0.1,0.74,1,1
1,-145,125,68.965517,125.0,3,1,0,0,1,1,...,0,0,5.08,10.16,0,-0.93,1.0,-0.48,1,1
2,-180,155,55.555556,155.0,3,2,0,0,0,2,...,0,0,0.0,5.08,-1,2.75,0.0,0.03,1,1
3,135,-155,135.0,64.516129,3,0,1,0,1,0,...,0,0,0.0,-2.54,1,1.51,0.0,-2.75,1,1
4,-265,215,37.735849,215.0,3,1,0,0,1,2,...,-3,0,-5.08,-5.08,2,-2.63,0.9,6.25,1,1


In [24]:
df_2_Y.head()

Unnamed: 0,Winner_Categorized
0,0
1,1
2,0
3,0
4,0


In [25]:
fs = SelectKBest(score_func=f_regression, k=2)
features_selected = fs.fit(df_2_X, df_2_Y)
#print(features_selected.shape)

  return f(*args, **kwargs)
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


In [26]:
print(np.round(features_selected.pvalues_, 4))
print(np.round(features_selected.scores_, 3))

[0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.580e-02 2.466e-01 1.200e-03
 1.234e-01 7.338e-01 3.240e-02 1.626e-01 3.800e-03 7.853e-01 1.710e-02
 4.011e-01 2.141e-01 8.022e-01 4.758e-01 4.832e-01 4.690e-02 4.900e-03
 1.734e-01 0.000e+00 0.000e+00 8.919e-01 3.690e-02 0.000e+00 4.740e-02
 8.120e-02 2.795e-01 0.000e+00 5.627e-01 2.679e-01 9.651e-01 5.050e-02
 3.196e-01 9.366e-01 2.660e-01 7.668e-01 0.000e+00 0.000e+00 4.400e-03
 0.000e+00 1.040e-02 9.260e-02 0.000e+00 2.400e-03 8.244e-01 1.700e-02
 8.324e-01 6.100e-03 0.000e+00 0.000e+00 2.000e-04 4.220e-02 0.000e+00
 9.279e-01       nan]
[5.76009e+02 5.99661e+02 5.07375e+02 4.99659e+02 5.83100e+00 1.34300e+00
 1.04530e+01 2.37500e+00 1.16000e-01 4.58200e+00 1.95100e+00 8.39900e+00
 7.40000e-02 5.69000e+00 7.05000e-01 1.54400e+00 6.30000e-02 5.09000e-01
 4.92000e-01 3.95200e+00 7.93500e+00 1.85300e+00 1.69460e+01 2.14220e+01
 1.80000e-02 4.35700e+00 3.49070e+01 3.93400e+00 3.04100e+00 1.17000e+00
 2.24900e+01 3.35000e-01 1.22800e+00 2.00000e

In [27]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_2_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name


feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_2_X.columns), columns="F_Score"))

                  Input_Features     F_Score  P_Value
1                         B_odds  599.661490   0.0000
0                         R_odds  576.008930   0.0000
2                           R_ev  507.375300   0.0000
3                           B_ev  499.659180   0.0000
52                       age_dif   75.879507   0.0000
39                         R_age   53.707290   0.0000
40                         B_age   50.987989   0.0000
42                win_streak_dif   49.158096   0.0000
55                    avg_td_dif   39.465522   0.0000
26                      R_losses   34.906813   0.0000
51                     reach_dif   24.894919   0.0000
30       R_win_by_Decision_Split   22.490424   0.0000
23          R_current_win_streak   21.422466   0.0000
45                      loss_dif   20.274556   0.0000
22         R_current_lose_streak   16.945714   0.0000
53                   sig_str_dif   14.123544   0.0002
6           B_current_win_streak   10.453033   0.0012
46               total_round

In [28]:
fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_2_X, df_2_Y)

  return f(*args, **kwargs)
  f = msb / msw


In [29]:
#The top 2 columns will be True. (k = 2)
#the other columns will be false.

fs.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [30]:
def list_ceil(x):
    return [math.ceil(i) for i in x]

In [31]:
print("P_Values: ")
print(np.round(fs.pvalues_, 4))
print("F_Values: ")
print(fs.scores_)

P_Values: 
[0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.580e-02 2.466e-01 1.200e-03
 1.234e-01 7.338e-01 3.240e-02 1.626e-01 3.800e-03 7.853e-01 1.710e-02
 4.011e-01 2.141e-01 8.022e-01 4.758e-01 4.832e-01 4.690e-02 4.900e-03
 1.734e-01 0.000e+00 0.000e+00 8.919e-01 3.690e-02 0.000e+00 4.740e-02
 8.120e-02 2.795e-01 0.000e+00 5.627e-01 2.679e-01 9.651e-01 5.050e-02
 3.196e-01 9.366e-01 2.660e-01 7.668e-01 0.000e+00 0.000e+00 4.400e-03
 0.000e+00 1.040e-02 9.260e-02 0.000e+00 2.400e-03 8.244e-01 1.700e-02
 8.324e-01 6.100e-03 0.000e+00 0.000e+00 2.000e-04 4.220e-02 0.000e+00
 9.279e-01       nan]
F_Values: 
[5.76008930e+02 5.99661490e+02 5.07375300e+02 4.99659180e+02
 5.83127510e+00 1.34273072e+00 1.04530327e+01 2.37508369e+00
 1.15643406e-01 4.58165998e+00 1.95058289e+00 8.39865256e+00
 7.42072661e-02 5.68956221e+00 7.05107505e-01 1.54370312e+00
 6.27640795e-02 5.08647667e-01 4.91716877e-01 3.95184889e+00
 7.93516186e+00 1.85343060e+00 1.69457136e+01 2.14224655e+01
 1.84737677e-02 4.3570

In [32]:
list_ceil(np.nan_to_num(fs.scores_))

[577,
 600,
 508,
 500,
 6,
 2,
 11,
 3,
 1,
 5,
 2,
 9,
 1,
 6,
 1,
 2,
 1,
 1,
 1,
 4,
 8,
 2,
 17,
 22,
 1,
 5,
 35,
 4,
 4,
 2,
 23,
 1,
 2,
 1,
 4,
 1,
 1,
 2,
 1,
 54,
 51,
 9,
 50,
 7,
 3,
 21,
 10,
 1,
 6,
 1,
 8,
 25,
 76,
 15,
 5,
 40,
 1,
 0]

In [33]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_2_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_2_X.columns), columns="F_Score"))

                  Input_Features     F_Score  P_Value
1                         B_odds  599.661490   0.0000
0                         R_odds  576.008930   0.0000
2                           R_ev  507.375300   0.0000
3                           B_ev  499.659180   0.0000
52                       age_dif   75.879507   0.0000
39                         R_age   53.707290   0.0000
40                         B_age   50.987989   0.0000
42                win_streak_dif   49.158096   0.0000
55                    avg_td_dif   39.465522   0.0000
26                      R_losses   34.906813   0.0000
51                     reach_dif   24.894919   0.0000
30       R_win_by_Decision_Split   22.490424   0.0000
23          R_current_win_streak   21.422466   0.0000
45                      loss_dif   20.274556   0.0000
22         R_current_lose_streak   16.945714   0.0000
53                   sig_str_dif   14.123544   0.0002
6           B_current_win_streak   10.453033   0.0012
46               total_round

In [34]:
df_3 = pd.DataFrame(df_2.drop(columns = non_num))
df_3.head()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_longest_win_streak,B_losses,...,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,Winner_Categorized
0,175,-210,175.0,47.619048,5,0,4,0,4,3,...,1,2.54,-7.62,-7,1.98,0.1,0.74,1,1,0
1,-145,125,68.965517,125.0,3,1,0,0,1,1,...,0,5.08,10.16,0,-0.93,1.0,-0.48,1,1,1
2,-180,155,55.555556,155.0,3,2,0,0,0,2,...,0,0.0,5.08,-1,2.75,0.0,0.03,1,1,0
3,135,-155,135.0,64.516129,3,0,1,0,1,0,...,0,0.0,-2.54,1,1.51,0.0,-2.75,1,1,0
4,-265,215,37.735849,215.0,3,1,0,0,1,2,...,0,-5.08,-5.08,2,-2.63,0.9,6.25,1,1,0


In [35]:
#R_ev, B_ev represent the following:
#R_ev = (Probability of Red Winning) * (Payout if Red Wins) - (Probability of Blue Winning) * 100
#"Probability of Red Winning" is the author's model prediction. Since we have no
#Access to this model to recreate this stat for the new fighters, we can not use these metrics

#Looking at the ANOVA Feature Selection method, the remaining factors coorelate with a correct prediction
#most often (Highest F_Scores and P_Value < 0.05)

#We will use the remaining metrics to determine winner.


#R_odds, B_odds represent the fighting odds generated by 3rd party gambling sources (MGM, DraftKings, etc.)
#Though the F_Score is high for these objects, we would like to predict winner without that information

df_3 = df_3.drop(columns = ["R_ev", "B_ev", "R_odds", "B_odds"])
df_3.head()

Unnamed: 0,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,...,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,Winner_Categorized
0,5,0,4,0,4,3,33,0,0,2,...,1,2.54,-7.62,-7,1.98,0.1,0.74,1,1,0
1,3,1,0,0,1,1,6,0,0,0,...,0,5.08,10.16,0,-0.93,1.0,-0.48,1,1,1
2,3,2,0,0,0,2,6,0,0,0,...,0,0.0,5.08,-1,2.75,0.0,0.03,1,1,0
3,3,0,1,0,1,0,3,0,0,0,...,0,0.0,-2.54,1,1.51,0.0,-2.75,1,1,0
4,3,1,0,0,1,2,4,0,0,0,...,0,-5.08,-5.08,2,-2.63,0.9,6.25,1,1,0


In [36]:
df_3.shape

(4813, 55)

In [37]:
X_train, X_test, y_train, y_test = tts(df_3)

In [38]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  1.0
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5152354570637119 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7631385969200685
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.48476454293628807 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6184307015399658
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.6080332409972299 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6159863114153019
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6038781163434903 

GaussianNB()  Accuracy Score (Training Set):  0.5409435345881203
GaussianNB()  Accuracy Score (Test Set):  0.5221606648199446 

RandomForestClassifier(n_jobs=-1)  Accurac

In [39]:
df_3_X = pd.DataFrame(df_3.drop(columns = ["Winner_Categorized"]))
df_3_Y = pd.DataFrame(df_3, columns = ["Winner_Categorized"])

In [40]:
df_3_X.head()

Unnamed: 0,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,...,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1
0,5,0,4,0,4,3,33,0,0,2,...,-5,1,2.54,-7.62,-7,1.98,0.1,0.74,1,1
1,3,1,0,0,1,1,6,0,0,0,...,0,0,5.08,10.16,0,-0.93,1.0,-0.48,1,1
2,3,2,0,0,0,2,6,0,0,0,...,0,0,0.0,5.08,-1,2.75,0.0,0.03,1,1
3,3,0,1,0,1,0,3,0,0,0,...,0,0,0.0,-2.54,1,1.51,0.0,-2.75,1,1
4,3,1,0,0,1,2,4,0,0,0,...,-3,0,-5.08,-5.08,2,-2.63,0.9,6.25,1,1


In [41]:
df_3_Y.head()

Unnamed: 0,Winner_Categorized
0,0
1,1
2,0
3,0
4,0


In [42]:
fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_3_X, df_3_Y)

  return f(*args, **kwargs)
  f = msb / msw


fs.get_support()

print("P_Values: ")
print(np.round(fs.pvalues_, 4))
print("F_Values: ")
print(fs.scores_)

list_ceil(np.nan_to_num(fs.scores_))

In [43]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_3_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_3_X.columns), columns="F_Score"))

                  Input_Features    F_Score  P_Value
48                       age_dif  75.879507   0.0000
35                         R_age  53.707290   0.0000
36                         B_age  50.987989   0.0000
38                win_streak_dif  49.158096   0.0000
51                    avg_td_dif  39.465522   0.0000
22                      R_losses  34.906813   0.0000
47                     reach_dif  24.894919   0.0000
26       R_win_by_Decision_Split  22.490424   0.0000
19          R_current_win_streak  21.422466   0.0000
41                      loss_dif  20.274556   0.0000
18         R_current_lose_streak  16.945714   0.0000
49                   sig_str_dif  14.123544   0.0002
2           B_current_win_streak  10.453033   0.0012
42               total_round_dif   9.196557   0.0024
7            B_total_title_bouts   8.398653   0.0038
37               lose_streak_dif   8.117703   0.0044
16                   B_Reach_cms   7.935162   0.0049
46                    height_dif   7.539636   

In [44]:
top_half = feature_score.nlargest(int(len(df_3_X.columns)/2),columns="F_Score")
print(top_half)

             Input_Features    F_Score  P_Value
48                  age_dif  75.879507   0.0000
35                    R_age  53.707290   0.0000
36                    B_age  50.987989   0.0000
38           win_streak_dif  49.158096   0.0000
51               avg_td_dif  39.465522   0.0000
22                 R_losses  34.906813   0.0000
47                reach_dif  24.894919   0.0000
26  R_win_by_Decision_Split  22.490424   0.0000
19     R_current_win_streak  21.422466   0.0000
41                 loss_dif  20.274556   0.0000
18    R_current_lose_streak  16.945714   0.0000
49              sig_str_dif  14.123544   0.0002
2      B_current_win_streak  10.453033   0.0012
42          total_round_dif   9.196557   0.0024
7       B_total_title_bouts   8.398653   0.0038
37          lose_streak_dif   8.117703   0.0044
16              B_Reach_cms   7.935162   0.0049
46               height_dif   7.539636   0.0061
39   longest_win_streak_dif   6.572635   0.0104
0              no_of_rounds   5.831275  

In [45]:
bottom_half = feature_score.nsmallest(int(len(df_3_X.columns)/2),columns="F_Score")
print(bottom_half)

                  Input_Features   F_Score  P_Value
29           R_win_by_Submission  0.001913   0.9651
32                  R_Height_cms  0.006334   0.9366
52                   empty_arena  0.008185   0.9279
20                        R_draw  0.018474   0.8919
45                       sub_dif  0.044791   0.8324
43          total_title_bout_dif  0.049261   0.8244
12           B_win_by_Submission  0.062764   0.8022
8     B_win_by_Decision_Majority  0.074207   0.7853
34                  R_Weight_lbs  0.087928   0.7668
4           B_longest_win_streak  0.115643   0.7338
27   R_win_by_Decision_Unanimous  0.335135   0.5627
14                        B_wins  0.491717   0.4832
13  B_win_by_TKO_Doctor_Stoppage  0.508648   0.4758
10   B_win_by_Decision_Unanimous  0.705108   0.4011
31                        R_wins  0.990646   0.3196
25    R_win_by_Decision_Majority  1.169954   0.2795
28               R_win_by_KO/TKO  1.227757   0.2679
33                   R_Reach_cms  1.237499   0.2660
1          B

In [46]:
bottom_half_feature_names = bottom_half["Input_Features"]
df_4 = pd.DataFrame(df_3.drop(columns = list(bottom_half_feature_names)))
#print(df_4)
df_4.head()

Unnamed: 0,no_of_rounds,B_current_win_streak,B_losses,B_total_title_bouts,B_win_by_Decision_Split,B_Height_cms,B_Reach_cms,R_current_lose_streak,R_current_win_streak,R_longest_win_streak,...,total_round_dif,ko_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,constant_1,Winner_Categorized
0,5,4,3,0,2,185.42,193.04,0,4,4,...,-4,-5,2.54,-7.62,-7,1.98,0.1,0.74,1,0
1,3,0,1,0,0,165.1,170.18,1,0,1,...,0,0,5.08,10.16,0,-0.93,1.0,-0.48,1,1
2,3,0,2,0,0,182.88,195.58,1,0,0,...,3,0,0.0,5.08,-1,2.75,0.0,0.03,1,0
3,3,1,0,0,0,175.26,177.8,0,1,1,...,0,0,0.0,-2.54,1,1.51,0.0,-2.75,1,0
4,3,0,2,0,0,177.8,177.8,0,1,2,...,-24,-3,-5.08,-5.08,2,-2.63,0.9,6.25,1,0


In [47]:
df_4.shape

(4813, 28)

In [48]:
X_train, X_test, y_train, y_test = tts(df_4)

In [49]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  1.0
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5207756232686981 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7638719139574676
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.4778393351800554 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6132974822781716
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.5997229916897507 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6150085553654363
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6066481994459834 

GaussianNB()  Accuracy Score (Training Set):  0.5604986555854314
GaussianNB()  Accuracy Score (Test Set):  0.5235457063711911 

RandomForestClassifier(n_jobs=-1)  Accuracy

In [50]:
df_4_X = pd.DataFrame(df_4.drop(columns = ["Winner_Categorized"]))
df_4_Y = pd.DataFrame(df_4, columns = ["Winner_Categorized"])

In [51]:
df_4_X.head()

Unnamed: 0,no_of_rounds,B_current_win_streak,B_losses,B_total_title_bouts,B_win_by_Decision_Split,B_Height_cms,B_Reach_cms,R_current_lose_streak,R_current_win_streak,R_longest_win_streak,...,loss_dif,total_round_dif,ko_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,constant_1
0,5,4,3,0,2,185.42,193.04,0,4,4,...,-4,-4,-5,2.54,-7.62,-7,1.98,0.1,0.74,1
1,3,0,1,0,0,165.1,170.18,1,0,1,...,0,0,0,5.08,10.16,0,-0.93,1.0,-0.48,1
2,3,0,2,0,0,182.88,195.58,1,0,0,...,1,3,0,0.0,5.08,-1,2.75,0.0,0.03,1
3,3,1,0,0,0,175.26,177.8,0,1,1,...,0,0,0,0.0,-2.54,1,1.51,0.0,-2.75,1
4,3,0,2,0,0,177.8,177.8,0,1,2,...,-3,-24,-3,-5.08,-5.08,2,-2.63,0.9,6.25,1


In [52]:
df_4_Y.head()

Unnamed: 0,Winner_Categorized
0,0
1,1
2,0
3,0
4,0


In [53]:
fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_4_X, df_4_Y)

  return f(*args, **kwargs)
  f = msb / msw


In [54]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_4_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_4_X.columns), columns="F_Score"))

             Input_Features    F_Score  P_Value
22                  age_dif  75.879507   0.0000
12                    R_age  53.707290   0.0000
13                    B_age  50.987989   0.0000
15           win_streak_dif  49.158096   0.0000
25               avg_td_dif  39.465522   0.0000
10                 R_losses  34.906813   0.0000
21                reach_dif  24.894919   0.0000
11  R_win_by_Decision_Split  22.490424   0.0000
8      R_current_win_streak  21.422466   0.0000
17                 loss_dif  20.274556   0.0000
7     R_current_lose_streak  16.945714   0.0000
23              sig_str_dif  14.123544   0.0002
1      B_current_win_streak  10.453033   0.0012
18          total_round_dif   9.196557   0.0024
3       B_total_title_bouts   8.398653   0.0038
14          lose_streak_dif   8.117703   0.0044
6               B_Reach_cms   7.935162   0.0049
20               height_dif   7.539636   0.0061
16   longest_win_streak_dif   6.572635   0.0104
0              no_of_rounds   5.831275  

In [55]:
top_half = feature_score.nlargest(int(len(df_4_X.columns)/2),columns="F_Score")
print(top_half)

             Input_Features    F_Score  P_Value
22                  age_dif  75.879507   0.0000
12                    R_age  53.707290   0.0000
13                    B_age  50.987989   0.0000
15           win_streak_dif  49.158096   0.0000
25               avg_td_dif  39.465522   0.0000
10                 R_losses  34.906813   0.0000
21                reach_dif  24.894919   0.0000
11  R_win_by_Decision_Split  22.490424   0.0000
8      R_current_win_streak  21.422466   0.0000
17                 loss_dif  20.274556   0.0000
7     R_current_lose_streak  16.945714   0.0000
23              sig_str_dif  14.123544   0.0002
1      B_current_win_streak  10.453033   0.0012


In [56]:
bottom_half = feature_score.nsmallest(int(len(df_4_X.columns)/2),columns="F_Score")
print(bottom_half)

             Input_Features   F_Score  P_Value
5              B_Height_cms  3.951849   0.0469
24          avg_sub_att_dif  4.129335   0.0422
9      R_longest_win_streak  4.357039   0.0369
2                  B_losses  4.581660   0.0324
4   B_win_by_Decision_Split  5.689562   0.0171
19                   ko_dif  5.697688   0.0170
0              no_of_rounds  5.831275   0.0158
16   longest_win_streak_dif  6.572635   0.0104
20               height_dif  7.539636   0.0061
6               B_Reach_cms  7.935162   0.0049
14          lose_streak_dif  8.117703   0.0044
3       B_total_title_bouts  8.398653   0.0038
18          total_round_dif  9.196557   0.0024


In [57]:
#Here, we will do something a bit different instead of eliminating 1/2 of the low F-Score features
#age_dif is calculated by subtracting R_age from B_age.
#though R_age could be a viable feature for the model due to age and still being in
#the red corner (red corner is typically favorite in UFC)
#I would like to eliminate those specific age params and keep only the age difference

rd_4_elim = []
rd_4_elim.append('R_age')
rd_4_elim.append('B_age')

#Kinda the same concept/logic with many of the other features 
#in the set that cite specific Red or Blue corners

rd_4_elim.append('R_losses')
rd_4_elim.append('R_win_by_Decision_Split')
rd_4_elim.append('R_current_win_streak')
rd_4_elim.append('R_current_lose_streak')
rd_4_elim.append('B_current_win_streak')
rd_4_elim.append('B_total_title_bouts')
rd_4_elim.append('B_Reach_cms')
rd_4_elim.append('B_win_by_Decision_Split')
rd_4_elim.append('B_losses')
rd_4_elim.append('R_longest_win_streak')
rd_4_elim.append('constant_1')
#rd_4_elim.append('')

In [58]:
print(rd_4_elim)

['R_age', 'B_age', 'R_losses', 'R_win_by_Decision_Split', 'R_current_win_streak', 'R_current_lose_streak', 'B_current_win_streak', 'B_total_title_bouts', 'B_Reach_cms', 'B_win_by_Decision_Split', 'B_losses', 'R_longest_win_streak', 'constant_1']


In [59]:
df_5 = pd.DataFrame(df_4.drop(columns = rd_4_elim))
df_5.head()

Unnamed: 0,no_of_rounds,B_Height_cms,lose_streak_dif,win_streak_dif,longest_win_streak_dif,loss_dif,total_round_dif,ko_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,Winner_Categorized
0,5,185.42,0,0,0,-4,-4,-5,2.54,-7.62,-7,1.98,0.1,0.74,0
1,3,165.1,0,0,0,0,0,0,5.08,10.16,0,-0.93,1.0,-0.48,1
2,3,182.88,1,0,0,1,3,0,0.0,5.08,-1,2.75,0.0,0.03,0
3,3,175.26,0,0,0,0,0,0,0.0,-2.54,1,1.51,0.0,-2.75,0
4,3,177.8,1,-1,-1,-3,-24,-3,-5.08,-5.08,2,-2.63,0.9,6.25,0


In [60]:
df_5.shape

(4813, 15)

In [61]:
X_train, X_test, y_train, y_test = tts(df_5)

In [62]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  1.0
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5304709141274239 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7599608897580054
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.48753462603878117 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6086531410413102
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.6066481994459834 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6010755316548522
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6149584487534626 

GaussianNB()  Accuracy Score (Training Set):  0.5631874847225617
GaussianNB()  Accuracy Score (Test Set):  0.5277008310249307 

RandomForestClassifier(n_jobs=-1)  Accurac

In [63]:
df_5_X = pd.DataFrame(df_5.drop(columns = ["Winner_Categorized"]))
df_5_Y = pd.DataFrame(df_5, columns = ["Winner_Categorized"])

In [64]:
df_5_X.head()

Unnamed: 0,no_of_rounds,B_Height_cms,lose_streak_dif,win_streak_dif,longest_win_streak_dif,loss_dif,total_round_dif,ko_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif
0,5,185.42,0,0,0,-4,-4,-5,2.54,-7.62,-7,1.98,0.1,0.74
1,3,165.1,0,0,0,0,0,0,5.08,10.16,0,-0.93,1.0,-0.48
2,3,182.88,1,0,0,1,3,0,0.0,5.08,-1,2.75,0.0,0.03
3,3,175.26,0,0,0,0,0,0,0.0,-2.54,1,1.51,0.0,-2.75
4,3,177.8,1,-1,-1,-3,-24,-3,-5.08,-5.08,2,-2.63,0.9,6.25


In [65]:
df_5_Y.head()

Unnamed: 0,Winner_Categorized
0,0
1,1
2,0
3,0
4,0


In [66]:
fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_5_X, df_5_Y)

  return f(*args, **kwargs)


In [67]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_5_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_5_X.columns), columns="F_Score"))

            Input_Features    F_Score  P_Value
10                 age_dif  75.879507   0.0000
3           win_streak_dif  49.158096   0.0000
13              avg_td_dif  39.465522   0.0000
9                reach_dif  24.894919   0.0000
5                 loss_dif  20.274556   0.0000
11             sig_str_dif  14.123544   0.0002
6          total_round_dif   9.196557   0.0024
2          lose_streak_dif   8.117703   0.0044
8               height_dif   7.539636   0.0061
4   longest_win_streak_dif   6.572635   0.0104
0             no_of_rounds   5.831275   0.0158
7                   ko_dif   5.697688   0.0170
12         avg_sub_att_dif   4.129335   0.0422
1             B_Height_cms   3.951849   0.0469


In [68]:
top_half = feature_score.nlargest(int(len(df_5_X.columns)/2),columns="F_Score")
print(top_half)

     Input_Features    F_Score  P_Value
10          age_dif  75.879507   0.0000
3    win_streak_dif  49.158096   0.0000
13       avg_td_dif  39.465522   0.0000
9         reach_dif  24.894919   0.0000
5          loss_dif  20.274556   0.0000
11      sig_str_dif  14.123544   0.0002
6   total_round_dif   9.196557   0.0024


In [69]:
bottom_half = feature_score.nsmallest(int(len(df_5_X.columns)/2),columns="F_Score")
print(bottom_half)

            Input_Features   F_Score  P_Value
1             B_Height_cms  3.951849   0.0469
12         avg_sub_att_dif  4.129335   0.0422
7                   ko_dif  5.697688   0.0170
0             no_of_rounds  5.831275   0.0158
4   longest_win_streak_dif  6.572635   0.0104
8               height_dif  7.539636   0.0061
2          lose_streak_dif  8.117703   0.0044


In [70]:
bottom_half_feature_names = bottom_half["Input_Features"]
df_6 = pd.DataFrame(df_5.drop(columns = list(bottom_half_feature_names)))
#print(df_4)
df_6.head()

Unnamed: 0,win_streak_dif,loss_dif,total_round_dif,reach_dif,age_dif,sig_str_dif,avg_td_dif,Winner_Categorized
0,0,-4,-4,-7.62,-7,1.98,0.74,0
1,0,0,0,10.16,0,-0.93,-0.48,1
2,0,1,3,5.08,-1,2.75,0.03,0
3,0,0,0,-2.54,1,1.51,-2.75,0
4,-1,-3,-24,-5.08,2,-2.63,6.25,0


In [71]:
df_6.shape

(4813, 8)

In [72]:
X_train, X_test, y_train, y_test = tts(df_6)

In [73]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  0.9953556587631386
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5429362880886427 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7650941090197996
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.5138504155124654 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6049865558543144
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.5969529085872576 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6054754338792472
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6163434903047091 

GaussianNB()  Accuracy Score (Training Set):  0.5878758249816671
GaussianNB()  Accuracy Score (Test Set):  0.5734072022160664 

RandomForestClassifier(n_job

In [74]:
df_6_X = pd.DataFrame(df_6.drop(columns = ["Winner_Categorized"]))
df_6_Y = pd.DataFrame(df_6, columns = ["Winner_Categorized"])

In [75]:
df_6_X.head()

Unnamed: 0,win_streak_dif,loss_dif,total_round_dif,reach_dif,age_dif,sig_str_dif,avg_td_dif
0,0,-4,-4,-7.62,-7,1.98,0.74
1,0,0,0,10.16,0,-0.93,-0.48
2,0,1,3,5.08,-1,2.75,0.03
3,0,0,0,-2.54,1,1.51,-2.75
4,-1,-3,-24,-5.08,2,-2.63,6.25


In [76]:
df_6_Y.head()

Unnamed: 0,Winner_Categorized
0,0
1,1
2,0
3,0
4,0


In [77]:
fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_6_X, df_6_Y)

  return f(*args, **kwargs)


In [78]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_6_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_6_X.columns), columns="F_Score"))

    Input_Features    F_Score  P_Value
4          age_dif  75.879507   0.0000
0   win_streak_dif  49.158096   0.0000
6       avg_td_dif  39.465522   0.0000
3        reach_dif  24.894919   0.0000
1         loss_dif  20.274556   0.0000
5      sig_str_dif  14.123544   0.0002
2  total_round_dif   9.196557   0.0024


In [79]:
top_half = feature_score.nlargest(int(len(df_6_X.columns)/2),columns="F_Score")
print(top_half)

   Input_Features    F_Score  P_Value
4         age_dif  75.879507      0.0
0  win_streak_dif  49.158096      0.0
6      avg_td_dif  39.465522      0.0


In [80]:
bottom_half = feature_score.nsmallest(int(len(df_6_X.columns)/2),columns="F_Score")
print(bottom_half)

    Input_Features    F_Score  P_Value
2  total_round_dif   9.196557   0.0024
5      sig_str_dif  14.123544   0.0002
1         loss_dif  20.274556   0.0000


In [81]:
bottom_half_feature_names = bottom_half["Input_Features"]
df_7 = pd.DataFrame(df_6.drop(columns = list(bottom_half_feature_names)))
df_7.head()

Unnamed: 0,win_streak_dif,reach_dif,age_dif,avg_td_dif,Winner_Categorized
0,0,-7.62,-7,0.74,0
1,0,10.16,0,-0.48,1
2,0,5.08,-1,0.03,0
3,0,-2.54,1,-2.75,0
4,-1,-5.08,2,6.25,0


In [82]:
df_7.shape

(4813, 5)

In [83]:
X_train, X_test, y_train, y_test = tts(df_7)

In [84]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  0.972378391591298
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.53601108033241 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7589831337081399
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.5277008310249307 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6074309459789783
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.592797783933518 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6064531899291127
LogisticRegressionCV()  Accuracy Score (Test Set):  0.5914127423822715 

GaussianNB()  Accuracy Score (Training Set):  0.5939868002933268
GaussianNB()  Accuracy Score (Test Set):  0.592797783933518 

RandomForestClassifier(n_jobs=-1)

In [85]:
X_train, X_test, y_train, y_test = tts(df_6)

In [86]:
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  0.9953556587631386
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.5373961218836565 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7650941090197996
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.5138504155124654 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6049865558543144
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.5969529085872576 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6054754338792472
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6163434903047091 

GaussianNB()  Accuracy Score (Training Set):  0.5878758249816671
GaussianNB()  Accuracy Score (Test Set):  0.5734072022160664 

RandomForestClassifier(n_job

In [87]:
features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_6_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_6_X.columns), columns="F_Score"))

    Input_Features    F_Score  P_Value
4          age_dif  75.879507   0.0000
0   win_streak_dif  49.158096   0.0000
6       avg_td_dif  39.465522   0.0000
3        reach_dif  24.894919   0.0000
1         loss_dif  20.274556   0.0000
5      sig_str_dif  14.123544   0.0002
2  total_round_dif   9.196557   0.0024


In [88]:
df_6_2 = pd.DataFrame(df_6)
df_6_2["R_odds"] = UFC_Data["R_odds"]
df_6_2["B_odds"] = UFC_Data["B_odds"]
df_6_2 = df_6_2.drop(columns = ["sig_str_dif"])
df_6_2.head()

Unnamed: 0,win_streak_dif,loss_dif,total_round_dif,reach_dif,age_dif,avg_td_dif,Winner_Categorized,R_odds,B_odds
0,0,-4,-4,-7.62,-7,0.74,0,175,-210
1,0,0,0,10.16,0,-0.48,1,-145,125
2,0,1,3,5.08,-1,0.03,0,-180,155
3,0,0,0,-2.54,1,-2.75,0,135,-155
4,-1,-3,-24,-5.08,2,6.25,0,-265,215


In [89]:
X_train, X_test, y_train, y_test = tts(df_6_2)
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  1.0
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.538781163434903 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7917379613786361
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.5263157894736842 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6550965534099242
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.6301939058171745 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6514299682229284
LogisticRegressionCV()  Accuracy Score (Test Set):  0.646814404432133 

GaussianNB()  Accuracy Score (Training Set):  0.6489855780982645
GaussianNB()  Accuracy Score (Test Set):  0.6509695290858726 

RandomForestClassifier(n_jobs=-1)  Accuracy S

In [90]:
df_6_2_X = pd.DataFrame(df_6_2.drop(columns = ["Winner_Categorized"]))
df_6_2_Y = pd.DataFrame(df_6_2, columns = ["Winner_Categorized"])

fs = SelectKBest(score_func=f_classif, k=2)
features_selected = fs.fit(df_6_2_X, df_6_2_Y)

features_score = pd.DataFrame(features_selected.scores_)
features_pvalue = pd.DataFrame(np.round(features_selected.pvalues_, 4))
features = pd.DataFrame(df_6_2_X.columns)
feature_score = pd.concat([features, features_score, features_pvalue], axis = 1)
#Assign the Column Name

feature_score.columns = ["Input_Features", "F_Score", "P_Value"]
print(feature_score.nlargest(len(df_6_2_X.columns), columns="F_Score"))

    Input_Features     F_Score  P_Value
7           B_odds  599.661490   0.0000
6           R_odds  576.008930   0.0000
4          age_dif   75.879507   0.0000
0   win_streak_dif   49.158096   0.0000
5       avg_td_dif   39.465522   0.0000
3        reach_dif   24.894919   0.0000
1         loss_dif   20.274556   0.0000
2  total_round_dif    9.196557   0.0024


  return f(*args, **kwargs)


In [91]:
#Now we need to find a way to represent these values for each fighter in database.columns
#The way I am approaching doing this is:

#Creating a DataFrame that holds 
#Columns = Weight Classes
#Rows: people that fight in each specific weight class

#Creating a second DataFrame that:
#Columns = Stats of fighters, (Age, Win Streak, Average TD, Reach, Loss Count, )

In [92]:
#How do you find out which one is the person winning and which one the person is losing?
#That is my main question. For the moment, i am assuming that the former value is 
#the person losing and the latter value is the person winning

lrCV = LogisticRegressionCV()
lrCV.fit(X_train, y_train)
lrCV.predict_proba(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[0.26629612, 0.73370388],
       [0.3907097 , 0.6092903 ],
       [0.46751665, 0.53248335],
       ...,
       [0.35284363, 0.64715637],
       [0.18586451, 0.81413549],
       [0.40367494, 0.59632506]])

In [93]:
#its gonna be like this
#the model makes a prediction WITH and WITHOUT ODDS
#The model then posts the accuracy of the model on testing set with and without odds
#Ask for how much money you would consider to bet?
#If you bet this amount on red with x odds you win this much
#If you bet this amount on blue with x odds you win this much
#(Given they dont bet money)(leaves blank) if $1 was bet on Red, you would win this much
#(Given they dont bet money)(leaves blank) if $1 was bet on Blue, you would win this much
#(Given they dont bet money)(leaves blank) if $100 was bet on Red, you would win this much
#(Given they dont bet money)(leaves blank) if $100 was bet on Blue, you would win this much
#(Given they dont bet money)(leaves blank) if $1000 was bet on Red, you would win this much
#(Given they dont bet money)(leaves blank) if $1000 was bet on Blue, you would win this much
#The risk-reward factor advises you to put money on this person.

#The Harsh reality: UFC is pretty balanced. Though the model does show higher accuracy when betting
#odds are introduced, the general concept that is introduced by odds is that people believe
#X will win and that is why you are less likely to win more money given they dont.
#The general safe rule is that you should bet on person that is favorite to win
#That said, the model is biased towards believing that given F_Score of odds and
#spike of accuracy when taking those odds into account.

#I've created a model that does NOT take into account the 
#betting odds of other people. that feature taken out, the accuracy goes down a considerable amount
#this means that its pretty close to a coin flip regarding who wins, but generally
#there are characteristics that lean towards more success. see the algorithm to view those
#characteristics


In [94]:
#one thing to note:
#With df_6_2, I experimented with R_odds and B_odds.
#In order to get the LogisticRegressionCV to converge, i removed the 2 lowest F_scores from the list (sig_str_dif, total_round_dif)
#The accuracy suffered as a result (albeit not by much)
#When i intended to add the higher f_score back to the list (sig_str_dif), i accidentally added (total_round_dif) back to the list and omitted the former
#The resulting accuracy metric increased to 0.65...
#When i went back, corrected my code, the accuracy dropped down to 0.649...
#This means that just because you keep in the larger accuracy scores, does not necessarily mean
#that it is good for the model.
#Its weird, but very interesting as well.
#this means that combinations of various features might perform better than just cutting out smaller f_scores


so if odds are negative (favorite)
this is how you calculate payout:
x = betting amount

x*(100/odds)+x

if odds are positive (underdog)
this is how you calculate payout:
x = betting amount

x*(odds/100)+x

In [95]:
Mw = [] 
ws = [] 
ww = []
fw = [] 
lw = []
flyw = []
bw = []
wflyw = []
wbw = []
hw = []
lhw = []
wfw = [] 
catchw = []
for row in range (len(UFC_Data)):
    #rowIndex = UFC_Data.index[row]
    cur_weight_class = UFC_Data.weight_class[row]

    if cur_weight_class.lower() == 'middleweight': 
        Mw.append(UFC_Data.at[row, 'R_fighter'])
        Mw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'women\'s strawweight': 
        ws.append(UFC_Data.at[row, 'R_fighter'])
        ws.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'welterweight': 
        ww.append(UFC_Data.at[row, 'R_fighter'])
        ww.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'featherweight': 
        fw.append(UFC_Data.at[row, 'R_fighter'])
        fw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'lightweight': 
        lw.append(UFC_Data.at[row, 'R_fighter'])
        lw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'flyweight': 
        flyw.append(UFC_Data.at[row, 'R_fighter'])
        flyw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'bantamweight': 
        bw.append(UFC_Data.at[row, 'R_fighter'])
        bw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'women\'s flyweight': 
        wflyw.append(UFC_Data.at[row, 'R_fighter'])
        wflyw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'women\'s bantamweight': 
        wbw.append(UFC_Data.at[row, 'R_fighter'])
        wbw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'heavyweight': 
        hw.append(UFC_Data.at[row, 'R_fighter'])
        hw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'light heavyweight': 
        lhw.append(UFC_Data.at[row, 'R_fighter'])
        lhw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'women\'s featherweight': 
        wfw.append(UFC_Data.at[row, 'R_fighter'])
        wfw.append(UFC_Data.at[row, 'B_fighter'])
    elif cur_weight_class.lower() == 'catch weight': 
        catchw.append(UFC_Data.at[row, 'R_fighter'])
        catchw.append(UFC_Data.at[row, 'B_fighter'])

Mw = list(set(Mw))
ws = list(set(ws))
ww = list(set(ww))
fw = list(set(fw))
lw = list(set(lw))
flyw = list(set(flyw))
bw = list(set(bw))
wflyw = list(set(wflyw))
wbw = list(set(wbw))
hw = list(set(hw))
lhw = list(set(lhw))
wfw = list(set(wfw))
catchw = list(set(catchw))

fighter_list = Mw + ws + ww + fw + lw + flyw + bw + wflyw + wbw + hw + lhw + wfw + catchw
fighter_list = list(set(fighter_list))


In [96]:
stats = ['current_win_streak', 'losses', 'total_rounds_fought', 'Reach_cms', 'age', 'avg_TD_landed', 'sig_strikes_landed']

In [97]:
fighter_stats = pd.DataFrame(columns = stats) #colums are not the same as df_6
fighter_stats.insert(0, 'Name', fighter_list)
fighter_stats

Unnamed: 0,Name,current_win_streak,losses,total_rounds_fought,Reach_cms,age,avg_TD_landed,sig_strikes_landed
0,Victoria Leonardo,,,,,,,
1,Livinha Souza,,,,,,,
2,Shavkat Rakhmonov,,,,,,,
3,Mark Scanlon,,,,,,,
4,Alex Garcia,,,,,,,
...,...,...,...,...,...,...,...,...
1723,Amanda Lemos,,,,,,,
1724,Dennis Bermudez,,,,,,,
1725,Dooho Choi,,,,,,,
1726,Khama Worthy,,,,,,,


In [98]:
fighter_list_size = fighter_stats.shape[0]
ds_size = UFC_Data.shape[0]
for i in range(fighter_list_size):
    fighter_name = fighter_stats['Name'][i]
    for j in range(ds_size):
        if fighter_name == UFC_Data['R_fighter'][j]:
            fighter_stats['current_win_streak'][i] = UFC_Data['R_current_win_streak'][j]
            fighter_stats['losses'][i] = UFC_Data['R_losses'][j]
            fighter_stats['total_rounds_fought'][i] = UFC_Data['R_total_rounds_fought'][j]
            fighter_stats['Reach_cms'][i] = UFC_Data['R_Reach_cms'][j]
            fighter_stats['age'][i] = UFC_Data['R_age'][j]
            fighter_stats['avg_TD_landed'][i] = UFC_Data['R_avg_TD_landed'][j]
            fighter_stats['sig_strikes_landed'][i] = UFC_Data['R_avg_SIG_STR_landed'][j]
            break
        elif fighter_name == UFC_Data['B_fighter'][j]:
            fighter_stats['current_win_streak'][i] = UFC_Data['B_current_win_streak'][j]
            fighter_stats['losses'][i] = UFC_Data['B_losses'][j]
            fighter_stats['total_rounds_fought'][i] = UFC_Data['B_total_rounds_fought'][j]
            fighter_stats['Reach_cms'][i] = UFC_Data['B_Reach_cms'][j]
            fighter_stats['age'][i] = UFC_Data['B_age'][j]
            fighter_stats['avg_TD_landed'][i] = UFC_Data['B_avg_TD_landed'][j]
            fighter_stats['sig_strikes_landed'][i] = UFC_Data['B_avg_SIG_STR_landed'][j]
            break

fighter_stats


Unnamed: 0,Name,current_win_streak,losses,total_rounds_fought,Reach_cms,age,avg_TD_landed,sig_strikes_landed
0,Victoria Leonardo,1,0,2,162.56,30,4.65,4.85
1,Livinha Souza,1,1,10,160.02,29,2.4,2.06
2,Shavkat Rakhmonov,1,0,1,195.58,26,0.0,2.79
3,Mark Scanlon,0,0,0,177.8,27,,
4,Alex Garcia,0,4,22,182.88,31,3.111111,20.555556
...,...,...,...,...,...,...,...,...
1723,Amanda Lemos,3,1,7,165.1,34,1.95,6.32
1724,Dennis Bermudez,0,7,39,167.64,32,2.875,50.4375
1725,Dooho Choi,0,2,8,177.8,28,0.55,
1726,Khama Worthy,0,1,5,187.96,34,0.0,4.22


In [99]:
Mw = list(set(Mw))
ws = list(set(ws))
ww = list(set(ww))
fw = list(set(fw))
lw = list(set(lw))
flyw = list(set(flyw))
bw = list(set(bw))
wflyw = list(set(wflyw))
wbw = list(set(wbw))
hw = list(set(hw))
lhw = list(set(lhw))
wfw = list(set(wfw))
catchw = list(set(catchw))

In [100]:
largelist = len(Mw)
if largelist < len(ws): largelist = len(ws)
if largelist < len(ww): largelist = len(ww)
if largelist < len(fw): largelist = len(fw)
if largelist < len(lw): largelist = len(lw)
if largelist < len(flyw): largelist = len(flyw)
if largelist < len(bw): largelist = len(bw)
if largelist < len(wflyw): largelist = len(wflyw)
if largelist < len(wbw): largelist = len(wbw)
if largelist < len(hw): largelist = len(hw)
if largelist < len(lhw): largelist = len(lhw)
if largelist < len(wfw): largelist = len(wfw)
if largelist < len(catchw): largelist = len(catchw)

Mw_2 = [" "]*largelist
ws_2 = [" "]*largelist
ww_2 = [" "]*largelist
fw_2 = [" "]*largelist
lw_2 = [" "]*largelist
flyw_2 = [" "]*largelist
bw_2 = [" "]*largelist
wflyw_2 = [" "]*largelist
wbw_2 = [" "]*largelist
hw_2 = [" "]*largelist
lhw_2 = [" "]*largelist
wfw_2 = [" "]*largelist
catchw_2 = [" "]*largelist

for i in range(len(Mw)): Mw_2[i] = Mw[i]
for i in range(len(ws)): ws_2[i] = ws[i]
for i in range(len(ww)): ww_2[i] = ww[i]
for i in range(len(fw)): fw_2[i] = fw[i]
for i in range(len(lw)): lw_2[i] = lw[i]
for i in range(len(flyw)): flyw_2[i] = flyw[i]
for i in range(len(bw)): bw_2[i] = bw[i]
for i in range(len(wflyw)): wflyw_2[i] = wflyw[i]
for i in range(len(wbw)): wbw_2[i] = wbw[i]
for i in range(len(hw)): hw_2[i] = hw[i]
for i in range(len(lhw)): lhw_2[i] = lhw[i]
for i in range(len(wfw)): wfw_2[i] = wfw[i]
for i in range(len(catchw)): catchw_2[i] = catchw[i]

fighter_classes = pd.DataFrame(data = {
    'middleweight': Mw_2, 
    'womens_strawweight': ws_2, 
    'welterweight': ww_2, 
    'featherweight': fw_2, 
    'lightweight': lw_2, 
    'flyweight': flyw_2, 
    'bantamweight': bw_2, 
    'womens_flyweight': wflyw_2, 
    'womens_bantamweight': wbw_2, 
    'heavyweight': hw_2, 
    'light_heavyweight': lhw_2, 
    'womens_featherweight': wfw_2, 
    'catch_weight': catchw_2
    })
fighter_classes

Unnamed: 0,middleweight,womens_strawweight,welterweight,featherweight,lightweight,flyweight,bantamweight,womens_flyweight,womens_bantamweight,heavyweight,light_heavyweight,womens_featherweight,catch_weight
0,Tom Breese,Ericka Almeida,Tom Breese,Terrion Ware,Nasrat Haqparast,Elias Garcia,Terrion Ware,Jessica Eye,Irene Aldana,Randy Couture,Randy Couture,Felicia Spencer,Sean Strickland
1,Papy Abedi,Alexa Grasso,Jake Ellenberger,Mirsad Bektic,Paul Sass,Darren Uyenoyama,Sean O'Malley,Alexa Grasso,Aspen Ladd,Fabio Maldonado,Gadzhimurad Antigulov,Macy Chiasson,Fabio Maldonado
2,Deron Winn,Valerie Letourneau,Shavkat Rakhmonov,Zubaira Tukhugov,Chris Saunders,Ulysses Gomez,Jin Soo Son,Jennifer Maia,Elizabeth Phillips,Antonio Silva,Fabio Maldonado,Leah Letson,Clay Collard
3,Michael Bisping,Livinha Souza,Cezar Ferreira,Matt Bessette,Aaron Wilkinson,Jon Delos Reyes,Darren Uyenoyama,Victoria Leonardo,Lucie Pudilova,Damian Grabowski,James Irvin,Norma Dumont,Deron Winn
4,Bartosz Fabinski,Marina Rodriguez,Diego Sanchez,Ricardo Ramos,Joshua Culibao,Amir Albazi,Ken Stone,Valerie Letourneau,Jessica Eye,Junior Dos Santos,Karl Roberson,Megan Anderson,Nik Lentz
...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,,,Benson Henderson,,Devonte Smith,,,,,,,,
368,,,Yoshihiro Akiyama,,Benson Henderson,,,,,,,,
369,,,Court McGee,,Vagner Rocha,,,,,,,,
370,,,Brian Melancon,,Katsunori Kikuno,,,,,,,,


In [101]:
df_6 = df_6.drop(['R_odds', 'B_odds'], axis = 'columns')

In [102]:
X_train, X_test, y_train, y_test = tts(df_6)
model_hi, acc_hi = highest_accuracy_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier(max_depth=200)  Accuracy Score (Training Set):  0.9953556587631386
DecisionTreeClassifier(max_depth=200)  Accuracy Score (Test Set):  0.538781163434903 

KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Training Set):  0.7650941090197996
KNeighborsClassifier(n_neighbors=2)  Accuracy Score (Test Set):  0.5138504155124654 

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Training Set):  0.6049865558543144
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregressioncv', LogisticRegressionCV())])  Accuracy Score (Test Set):  0.5969529085872576 

LogisticRegressionCV()  Accuracy Score (Training Set):  0.6054754338792472
LogisticRegressionCV()  Accuracy Score (Test Set):  0.6163434903047091 

GaussianNB()  Accuracy Score (Training Set):  0.5878758249816671
GaussianNB()  Accuracy Score (Test Set):  0.5734072022160664 

RandomForestClassifier(n_jobs

In [103]:
weight_classes = UFC_Data.weight_class.unique()

In [104]:
data = {
    "model": model_hi,
    "weight_classes": weight_classes,
    "fighter_stats": fighter_stats,
    "fighter_list": fighter_list,
    "fighter_classes": fighter_classes,
    "model_acc": acc_hi,
}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)
