# Predicting UFC Winners

## Overview

## Business Problem

## Data Understanding

In [180]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [159]:
# Importing everything needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, ConfusionMatrixDisplay, 
recall_score, confusion_matrix, precision_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [125]:
# Read in the data and display a preview
df = pd.read_csv('../data/ufc-master.csv')
df.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0


In [126]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4896 entries, 0 to 4895
Data columns (total 119 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   R_fighter                     object 
 1   B_fighter                     object 
 2   R_odds                        float64
 3   B_odds                        int64  
 4   R_ev                          float64
 5   B_ev                          float64
 6   date                          object 
 7   location                      object 
 8   country                       object 
 9   Winner                        object 
 10  title_bout                    bool   
 11  weight_class                  object 
 12  gender                        object 
 13  no_of_rounds                  int64  
 14  B_current_lose_streak         int64  
 15  B_current_win_streak          int64  
 16  B_draw                        int64  
 17  B_avg_SIG_STR_landed          float64
 18  B_avg_SIG_STR_pct          

## Data Preparation

I want to change date column to a datetime object but they are in different formats

In [127]:
df['date']

0       2021-10-02
1       2021-10-02
2       2021-10-02
3       2021-10-02
4       2021-10-02
           ...    
4891     3/21/2010
4892     3/21/2010
4893     3/21/2010
4894     3/21/2010
4895     3/21/2010
Name: date, Length: 4896, dtype: object

In [128]:
# Changing all / to - in date column
df['date'] = df['date'].str.replace('/', '-')

The first 58 dates are in a different format. They have year month day instead of month day year.

In [129]:
df['date'][:59]

0     2021-10-02
1     2021-10-02
2     2021-10-02
3     2021-10-02
4     2021-10-02
5     2021-10-02
6     2021-10-02
7     2021-10-02
8     2021-10-02
9     2021-10-02
10    2021-10-02
11    2021-09-25
12    2021-09-25
13    2021-09-25
14    2021-09-25
15    2021-09-25
16    2021-09-25
17    2021-09-25
18    2021-09-25
19    2021-09-25
20    2021-09-25
21    2021-09-25
22    2021-09-25
23    2021-09-25
24    2021-09-18
25    2021-09-18
26    2021-09-18
27    2021-09-18
28    2021-09-18
29    2021-09-18
30    2021-09-18
31    2021-09-18
32    2021-09-18
33    2021-09-18
34    2021-09-18
35    2021-09-18
36    2021-09-18
37    2021-09-04
38    2021-09-04
39    2021-09-04
40    2021-09-04
41    2021-09-04
42    2021-09-04
43    2021-09-04
44    2021-09-04
45    2021-09-04
46    2021-08-28
47    2021-08-28
48    2021-08-28
49    2021-08-28
50    2021-08-28
51    2021-08-28
52    2021-08-28
53    2021-08-28
54    2021-08-28
55    2021-08-28
56    2021-08-28
57    2021-08-28
58     8-21-20

Moving the year to end to have all dates in same format

In [130]:
df['date'][0][8:]

'02'

In [131]:
# Only want to change the first 58 values
for x in range(58):
    # Accessing each value in date column
    string = df.loc[x, 'date']
    # Slicing to get day, month, and year separated
    day = string[8:]
    month = string[5:7]
    year = string[:4]
    # Moving components around to get month-day-year format
    new_format = month + '-' + day + '-' + year
    # Update the DataFrame
    df.loc[x, 'date'] = new_format

In [132]:
df['date'][:59]

0     10-02-2021
1     10-02-2021
2     10-02-2021
3     10-02-2021
4     10-02-2021
5     10-02-2021
6     10-02-2021
7     10-02-2021
8     10-02-2021
9     10-02-2021
10    10-02-2021
11    09-25-2021
12    09-25-2021
13    09-25-2021
14    09-25-2021
15    09-25-2021
16    09-25-2021
17    09-25-2021
18    09-25-2021
19    09-25-2021
20    09-25-2021
21    09-25-2021
22    09-25-2021
23    09-25-2021
24    09-18-2021
25    09-18-2021
26    09-18-2021
27    09-18-2021
28    09-18-2021
29    09-18-2021
30    09-18-2021
31    09-18-2021
32    09-18-2021
33    09-18-2021
34    09-18-2021
35    09-18-2021
36    09-18-2021
37    09-04-2021
38    09-04-2021
39    09-04-2021
40    09-04-2021
41    09-04-2021
42    09-04-2021
43    09-04-2021
44    09-04-2021
45    09-04-2021
46    08-28-2021
47    08-28-2021
48    08-28-2021
49    08-28-2021
50    08-28-2021
51    08-28-2021
52    08-28-2021
53    08-28-2021
54    08-28-2021
55    08-28-2021
56    08-28-2021
57    08-28-2021
58     8-21-20

In [133]:
# Converting column to datetime from object
df['date'] = pd.to_datetime(df['date'], format='%m-%d-%Y')

In [134]:
df['date']

0      2021-10-02
1      2021-10-02
2      2021-10-02
3      2021-10-02
4      2021-10-02
          ...    
4891   2010-03-21
4892   2010-03-21
4893   2010-03-21
4894   2010-03-21
4895   2010-03-21
Name: date, Length: 4896, dtype: datetime64[ns]

In [135]:
df.describe()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,...,B_Flyweight_rank,B_Pound-for-Pound_rank,finish_round,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
count,4895.0,4896.0,4895.0,4896.0,4896.0,4896.0,4896.0,4896.0,3966.0,4131.0,...,95.0,35.0,4274.0,4274.0,4093.0,4077.0,3847.0,3835.0,3847.0,3834.0
mean,-117.640449,66.030637,94.827397,167.083323,3.181985,0.477941,0.875408,0.010621,26.308553,0.444741,...,8.473684,9.485714,2.408049,652.313758,294.064745,416.544027,843.010138,1064.543155,514.231869,647.257173
std,268.881452,247.803928,82.843409,136.944643,0.571515,0.769386,1.311379,0.108333,20.935885,0.121332,...,4.259763,4.300283,0.996643,357.911423,230.583958,306.571299,550.126761,627.285034,413.622768,458.846643
min,-1700.0,-1200.0,5.882353,8.333333,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,1.0,5.0,-440.0,-200.0,-370.0,-1250.0,-550.0,-275.0
25%,-255.0,-145.0,39.215686,68.965517,3.0,0.0,0.0,0.0,5.61,0.3875,...,5.0,5.0,1.0,297.0,167.0,225.0,435.0,590.0,240.0,325.0
50%,-150.0,130.0,66.666667,130.0,3.0,0.0,0.0,0.0,24.759615,0.45,...,8.0,10.0,3.0,900.0,250.0,349.0,720.0,975.0,435.0,548.5
75%,126.5,220.0,126.5,220.0,3.0,1.0,1.0,0.0,39.075,0.51,...,12.0,13.5,3.0,900.0,400.0,525.0,1200.0,1400.0,700.0,880.75
max,775.0,1300.0,775.0,1300.0,5.0,6.0,12.0,2.0,154.0,1.0,...,15.0,15.0,5.0,1500.0,2200.0,2600.0,4665.0,4785.0,2675.0,3200.0


In [136]:
df.isna().sum()

R_fighter        0
B_fighter        0
R_odds           1
B_odds           0
R_ev             1
              ... 
b_dec_odds     819
r_sub_odds    1049
b_sub_odds    1061
r_ko_odds     1049
b_ko_odds     1062
Length: 119, dtype: int64

In [137]:
nulls = df.isna().sum()
nulls.sort_values(ascending=False)[:30]

B_Women's Featherweight_rank    4896
R_Women's Featherweight_rank    4889
B_Pound-for-Pound_rank          4861
B_Women's Flyweight_rank        4852
R_Women's Flyweight_rank        4837
B_Women's Strawweight_rank      4835
B_Women's Bantamweight_rank     4818
B_Bantamweight_rank             4811
B_Lightweight_rank              4809
B_Welterweight_rank             4807
B_Featherweight_rank            4806
B_Light Heavyweight_rank        4803
B_Flyweight_rank                4801
B_Middleweight_rank             4794
R_Women's Strawweight_rank      4792
B_Heavyweight_rank              4786
R_Women's Bantamweight_rank     4778
R_Featherweight_rank            4763
R_Middleweight_rank             4762
R_Bantamweight_rank             4759
R_Lightweight_rank              4757
R_Welterweight_rank             4756
R_Light Heavyweight_rank        4755
R_Heavyweight_rank              4754
R_Flyweight_rank                4754
R_Pound-for-Pound_rank          4730
B_match_weightclass_rank        4019
R

In [138]:
columns_to_drop = list(nulls.sort_values(ascending=False)[:29].index)
columns_to_drop

["B_Women's Featherweight_rank",
 "R_Women's Featherweight_rank",
 'B_Pound-for-Pound_rank',
 "B_Women's Flyweight_rank",
 "R_Women's Flyweight_rank",
 "B_Women's Strawweight_rank",
 "B_Women's Bantamweight_rank",
 'B_Bantamweight_rank',
 'B_Lightweight_rank',
 'B_Welterweight_rank',
 'B_Featherweight_rank',
 'B_Light Heavyweight_rank',
 'B_Flyweight_rank',
 'B_Middleweight_rank',
 "R_Women's Strawweight_rank",
 'B_Heavyweight_rank',
 "R_Women's Bantamweight_rank",
 'R_Featherweight_rank',
 'R_Middleweight_rank',
 'R_Bantamweight_rank',
 'R_Lightweight_rank',
 'R_Welterweight_rank',
 'R_Light Heavyweight_rank',
 'R_Heavyweight_rank',
 'R_Flyweight_rank',
 'R_Pound-for-Pound_rank',
 'B_match_weightclass_rank',
 'R_match_weightclass_rank',
 'finish_details']

In [139]:
df.drop(columns_to_drop, axis=1, inplace=True)

In [140]:
nulls2 = df.isna().sum()
nulls2.sort_values(ascending=False)

b_ko_odds                      1062
b_sub_odds                     1061
r_ko_odds                      1049
r_sub_odds                     1049
B_avg_SIG_STR_landed            930
                               ... 
R_win_by_Decision_Split           0
R_win_by_Decision_Unanimous       0
R_win_by_KO/TKO                   0
R_win_by_Submission               0
R_fighter                         0
Length: 90, dtype: int64

In [141]:
clean_df = df.dropna()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615 entries, 0 to 4286
Data columns (total 90 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   R_fighter                     2615 non-null   object        
 1   B_fighter                     2615 non-null   object        
 2   R_odds                        2615 non-null   float64       
 3   B_odds                        2615 non-null   int64         
 4   R_ev                          2615 non-null   float64       
 5   B_ev                          2615 non-null   float64       
 6   date                          2615 non-null   datetime64[ns]
 7   location                      2615 non-null   object        
 8   country                       2615 non-null   object        
 9   Winner                        2615 non-null   object        
 10  title_bout                    2615 non-null   bool          
 11  weight_class                  

In [142]:
# Creating a column that has the name of the underdog by looking at the odds columns
clean_df['underdog'] = clean_df.apply(
    lambda row: row['R_fighter'] if row['R_odds'] > row['B_odds'] else row['B_fighter'], axis=1)
# Creating a column that has the name of the favored fighter by looking at the odds columns
clean_df['favored'] = clean_df.apply(
    lambda row: row['R_fighter'] if row['R_odds'] < row['B_odds'] else row['B_fighter'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['underdog'] = clean_df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['favored'] = clean_df.apply(


In [143]:
clean_df['underdog']

0        Johnny Walker
1        Alex Oliveira
2       Misha Cirkunov
3         Mike Breeden
4         Jared Gordon
             ...      
4275     Igor Pokrajac
4283         Nate Diaz
4284     Josh Koscheck
4285      Alan Belcher
4286     Lavar Johnson
Name: underdog, Length: 2615, dtype: object

In [144]:
clean_df['Winner']

0        Red
1       Blue
2       Blue
3        Red
4       Blue
        ... 
4275     Red
4283     Red
4284    Blue
4285    Blue
4286    Blue
Name: Winner, Length: 2615, dtype: object

In [145]:
# Changing Red winners to either underdog or favored
clean_df.loc[(clean_df['Winner'] == 'Red') & (clean_df['R_fighter'] == clean_df['underdog']), 'Winner'] = 'underdog'
clean_df.loc[(clean_df['Winner'] == 'Red') & (clean_df['R_fighter'] != clean_df['underdog']), 'Winner'] = 'favored'
# Changing Blue winners to either underdog or favored
clean_df.loc[(clean_df['Winner'] == 'Blue') & (clean_df['B_fighter'] == clean_df['underdog']), 'Winner'] = 'underdog'
clean_df.loc[(clean_df['Winner'] == 'Blue') & (clean_df['B_fighter'] != clean_df['underdog']), 'Winner'] = 'favored'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [146]:
clean_df['Winner'].value_counts(normalize=True)

favored     0.640918
underdog    0.359082
Name: Winner, dtype: float64

Wanted to compare this to the whole dataset before dropping nulls just to make sure it was close to the same amount of winners in each category.

In [147]:
df['R_odds'].dropna(inplace=True)
df['B_odds'].dropna(inplace=True)
df['R_fighter'].dropna(inplace=True)
df['B_fighter'].dropna(inplace=True)

# Creating a column that has the name of the underdog by looking at the odds columns
df['underdog'] = df.apply(
    lambda row: row['R_fighter'] if row['R_odds'] > row['B_odds'] else row['B_fighter'], axis=1)
# Creating a column that has the name of the favored fighter by looking at the odds columns
df['favored'] = df.apply(
    lambda row: row['R_fighter'] if row['R_odds'] < row['B_odds'] else row['B_fighter'], axis=1)

# Changing Red winners to either underdog or favored
df.loc[(df['Winner'] == 'Red') & (df['R_fighter'] == df['underdog']), 'Winner'] = 'underdog'
df.loc[(df['Winner'] == 'Red') & (df['R_fighter'] != df['underdog']), 'Winner'] = 'favored'
# Changing Blue winners to either underdog or favored
df.loc[(df['Winner'] == 'Blue') & (df['B_fighter'] == df['underdog']), 'Winner'] = 'underdog'
df.loc[(df['Winner'] == 'Blue') & (df['B_fighter'] != df['underdog']), 'Winner'] = 'favored'

df['Winner'].value_counts(normalize=True)

favored     0.652982
underdog    0.347018
Name: Winner, dtype: float64

In [149]:
clean_df.columns

Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'date',
       'location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
  

In [197]:
correlation = clean_df.copy()
correlation['Winner'] = correlation['Winner'].map({'favored': 0, 'underdog': 1})
correlation.corr()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,finish_round,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
R_odds,1.0,-0.969685,0.820451,-0.93646,0.137722,-0.196095,-0.15793,-0.027647,0.024328,0.01085,0.038014,0.108025,0.001856,0.061827,0.070142,0.020461,-0.116079,-0.07555,-0.041624,0.006505,-0.091063,-0.031058,-0.006231,-0.017103,-0.024957,-0.038141,0.075254,0.091279,0.040339,0.154609,-0.230309,0.019911,-0.078028,-0.149505,0.012893,-0.142352,-0.090321,-0.115931,0.268107,0.112649,-0.145552,0.010142,0.117939,0.014614,0.024388,0.002703,0.017938,0.044176,0.025823,-0.006776,0.023663,0.258094,-0.231561,0.12143,0.242379,0.137611,-0.078328,0.229784,-0.166773,0.105532,-0.032887,-0.014456,0.068832,0.116274,0.304149,0.114318,-0.00809,0.150323,-0.000702,,-0.036744,-0.019585,0.356382,-0.67555,0.325443,-0.392195,0.508602,-0.601188
B_odds,-0.969685,1.0,-0.878302,0.892607,-0.120407,0.17693,0.153233,0.036696,-0.036244,-0.006083,-0.038942,-0.107301,0.001029,-0.074257,-0.076881,-0.016464,0.129913,0.083375,0.050828,0.000988,0.083494,0.031495,0.023634,0.022456,0.029467,0.046714,-0.074261,-0.089303,-0.037035,-0.158784,0.213624,-0.016656,0.086494,0.143693,-0.006313,0.134691,0.087562,0.108433,-0.270953,-0.119088,0.128178,-0.010822,-0.117411,-0.016035,-0.022997,-0.008838,-0.015182,-0.046776,-0.022311,0.00838,-0.019204,-0.268371,0.243356,-0.130898,-0.234805,-0.126359,0.088527,-0.23796,0.179119,-0.084254,0.046116,0.023354,-0.072458,-0.116346,-0.316723,-0.123448,0.005439,-0.153969,-0.010033,,0.035713,0.019816,-0.395788,0.640625,-0.339169,0.382441,-0.531754,0.586239
R_ev,0.820451,-0.878302,1.0,-0.639877,0.029997,-0.117862,-0.117817,-0.043298,0.046354,-0.008758,0.067365,0.094799,-0.005313,0.087414,0.071848,-0.001902,-0.14674,-0.102077,-0.058228,-0.003337,-0.078972,-0.033251,-0.054075,-0.029309,-0.021866,-0.06829,0.055117,0.072469,0.015707,0.140729,-0.159438,0.007906,-0.042687,-0.119993,-0.015292,-0.099965,-0.08218,-0.08703,0.244972,0.113722,-0.063802,0.008101,0.112918,0.020478,0.013437,0.008285,0.029361,0.041364,-0.001931,-0.023257,0.001133,0.259234,-0.233015,0.120831,0.189341,0.088125,-0.102113,0.22941,-0.189063,0.021769,-0.063051,-0.027793,0.080211,0.114647,0.300107,0.110216,0.007222,0.138331,-0.02816,,-0.053342,-0.048055,0.463185,-0.465919,0.345663,-0.303867,0.532764,-0.476239
B_ev,-0.93646,0.892607,-0.639877,1.0,-0.177689,0.216062,0.171694,0.010249,-0.008402,-0.024734,-0.010176,-0.099775,0.001214,-0.045714,-0.058339,-0.021866,0.088281,0.054085,0.033669,-0.009994,0.079467,0.029774,-0.006884,0.001601,0.034693,0.022528,-0.075813,-0.089008,-0.046221,-0.142301,0.241583,-0.032139,0.10464,0.142461,-0.019672,0.148668,0.086757,0.123325,-0.2534,-0.103415,0.178151,-0.022725,-0.108102,-0.008682,-0.026894,-0.000295,-0.012869,-0.040411,-0.035954,-0.004484,-0.031637,-0.227486,0.204755,-0.100889,-0.241813,-0.146476,0.060734,-0.207463,0.140853,-0.139752,0.02473,0.001388,-0.055136,-0.098147,-0.276278,-0.111532,0.01543,-0.142948,-0.040005,,0.019564,-0.002365,-0.267006,0.715771,-0.281434,0.397608,-0.445321,0.596485
Winner,0.137722,-0.120407,0.029997,-0.177689,1.0,-0.032849,0.004986,0.003296,0.008674,0.030653,1.1e-05,0.03648,0.033685,-0.02091,-0.006411,0.000722,-0.039958,-0.030937,-0.015461,-0.01119,-0.045436,-0.039186,0.010597,0.007996,-0.021922,-0.016911,0.019938,0.029455,-0.005084,0.002439,-0.026853,-0.012498,-0.026714,-0.009305,0.037469,-0.027552,0.010355,-0.020759,0.009583,-0.010394,-0.025404,0.014306,-0.01179,-0.020528,-0.000797,-0.00318,0.001786,-0.009506,0.005029,0.006965,-0.009078,0.042518,-0.032825,0.0037,0.032384,0.022053,-0.005329,0.032422,-0.015018,0.013399,0.007408,0.008344,0.020828,0.03135,0.052902,0.025826,-0.001931,0.004608,-0.022126,,0.001386,-0.000242,0.035015,-0.098816,-0.004819,-0.057663,0.058712,-0.118863
title_bout,-0.196095,0.17693,-0.117862,0.216062,-0.032849,1.0,0.633835,-0.11343,0.284542,0.037935,0.08623,0.049516,-0.001302,0.039177,0.034071,0.296054,-0.008342,0.17032,0.28608,0.051342,0.03905,0.189384,0.223308,0.057743,0.011644,0.219432,-0.048433,-0.040441,-0.008267,-0.155774,0.54623,-0.011451,0.16508,0.093936,-0.003593,0.111301,0.081469,0.371483,-0.122699,0.14769,0.464153,0.013831,0.060531,0.18828,0.181137,0.070072,-0.033926,0.208244,-0.055379,-0.018861,-0.009739,-0.00361,0.063091,-0.032686,-0.324532,-0.122943,-0.016642,-0.065003,-0.002586,-0.242604,0.015659,-0.016687,0.011109,-0.024421,-0.047145,-0.071236,0.001656,-0.05235,-0.0405,,0.239782,0.19796,0.032659,0.266988,-0.027241,0.144181,-0.151452,0.032418
no_of_rounds,-0.15793,0.153233,-0.117817,0.171694,0.004986,0.633835,1.0,-0.116146,0.305234,0.038543,0.083929,0.075734,-0.008248,0.006159,0.024334,0.392788,0.080229,0.268398,0.30466,0.074162,0.082055,0.237663,0.346758,0.128052,0.07119,0.338855,0.060332,0.072716,0.115994,-0.147655,0.421595,-0.020124,0.16017,0.105246,-0.029122,0.056281,0.053023,0.426778,-0.001075,0.257538,0.441037,0.051545,0.059986,0.2518,0.331169,0.103618,-0.00884,0.334828,0.075182,0.09842,0.122272,0.086376,0.137234,-0.038989,-0.187876,-0.094927,-0.039088,-0.06351,-0.02745,-0.210413,-0.02377,0.005811,-0.022697,-0.02488,-0.047133,-0.068847,0.01518,-0.036639,-0.034343,,0.254469,0.195777,0.149323,0.307702,0.023813,0.154251,-0.228356,-0.061498
B_current_lose_streak,-0.027647,0.036696,-0.043298,0.010249,0.003296,-0.11343,-0.116146,1.0,-0.545956,-0.029832,-0.067847,-0.097281,-0.056474,-0.087752,-0.060019,-0.080167,0.35119,0.118171,0.089748,-0.013515,0.018764,-0.064604,-0.018972,-0.032351,0.05648,-0.030556,-0.00506,-0.008683,0.002879,0.306985,-0.243917,0.029597,-0.073034,-0.053215,-0.013197,-0.052503,-0.071951,-0.14442,0.094513,-0.038269,-0.086127,-0.038301,0.021617,-0.107854,-0.080713,-0.056189,-0.03379,-0.102276,0.012369,-0.006116,0.015441,0.003645,0.145741,-0.377066,-0.161728,0.078513,0.076368,-0.100079,0.130856,0.132507,0.07081,0.02335,-0.024793,0.001106,-0.083916,0.001112,-0.033346,-0.026898,0.047076,,-0.041671,-0.03826,-0.031598,-0.013366,-0.005925,-0.002111,-0.032987,0.012078
B_current_win_streak,0.024328,-0.036244,0.046354,-0.008402,0.008674,0.284542,0.305234,-0.545956,1.0,0.008914,0.125507,0.154482,0.050789,0.132035,0.08832,0.439398,-0.158148,0.123449,0.027634,0.042425,0.095726,0.271761,0.21586,0.135666,0.020422,0.285616,0.055408,0.065586,0.047604,-0.256474,0.341022,-0.004738,0.135651,0.062827,-0.006158,0.08157,0.070508,0.334891,-0.005336,0.226043,0.26335,0.099052,0.040893,0.277264,0.232727,0.135942,0.035142,0.298526,0.028266,0.058342,0.040317,0.077153,-0.02727,0.15403,0.399087,0.04087,-0.049423,0.077788,-0.114031,-0.220146,-0.04698,-0.01548,0.037452,0.015292,0.057352,-0.002588,0.04328,0.038588,-0.080794,,0.115369,0.097914,0.043466,0.020481,0.055512,0.048009,-0.004079,-0.064514
B_draw,0.01085,-0.006083,-0.008758,-0.024734,0.030653,0.037935,0.038543,-0.029832,0.008914,1.0,-0.10822,-0.016323,0.001773,0.007868,0.027104,0.035194,0.011648,0.06573,0.022571,-0.012637,0.038134,0.061146,-0.016333,0.018112,-0.017559,0.028907,-0.055741,-0.056895,-0.05463,-0.005411,-0.00113,0.10576,-0.115938,-0.021528,0.01187,0.023825,0.041102,0.046842,0.037003,0.051812,0.028573,0.022011,0.042975,0.065932,0.016766,0.021369,-0.020852,0.051968,-0.064428,-0.048623,-0.054226,0.009106,0.021829,2.2e-05,0.009381,-0.017382,-0.026986,-0.057502,0.00389,-0.011896,-0.029059,-0.00473,0.013774,-0.010614,-0.028081,0.001236,-0.007399,-0.011592,0.157824,,0.048539,0.05109,-0.043415,-0.044818,0.022883,-0.003688,0.035701,0.00475


In [169]:
clean_df['no_of_rounds'].value_counts()

3    2294
5     320
Name: no_of_rounds, dtype: int64

Dropped this row because there is no such thing as a 4 round fight. They are either 3 or 5 rounds long.

In [163]:
# Dropping this row
clean_df.drop(clean_df.loc[clean_df['no_of_rounds'] == 4].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [175]:
clean_df['B_win_by_KO/TKO'].value_counts()

0     1204
1      633
2      339
3      156
4      112
5       56
6       34
7       24
8       20
9       18
11       8
10       8
12       2
Name: B_win_by_KO/TKO, dtype: int64

In [154]:
# Setting all wanted features equal to X
X = clean_df.drop(['Winner', 'R_fighter', 'B_fighter', 'R_ev', 'B_ev', 'location', 'B_avg_SIG_STR_landed', 
                   'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct', 
                  'B_total_title_bouts', ''], axis=1)
# Setting target equal to y
y = clean_df['Winner']
# Performing the train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=333)

In [None]:
num_feats = ['R_odds', 'B_odds', 'date', 'B_current_lose_streak', 'B_current_win_streak', 'B_draw', 'B_longest_win_streak',
            'B_losses', 'B_total_rounds_fought', 'B_win_by_Decision_Majority', 'B_win_by_Decision_Split', 
             'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission']
ordinal_feats = ['']
categorical feats = ['country', 'title_bout', 'weight_class', 'gender', 'no_of_rounds', ]

## Modeling

## Evaluation

A dummy model would pick the underdog every time and be correct about 36% of the time. This also displays an expected yet slight class imbalance, nothing dramatic.

In [156]:
y_train.value_counts(normalize=True)

favored     0.63998
underdog    0.36002
Name: Winner, dtype: float64

In [None]:
# Creating a Pipeline for a decision tree model
tree_steps = [(scaler)]