# 08 Assemble Training Data

- The purpose of this notebook is to aggregate all of the processed data into a single training set
- We will use the training stub as the basis for the dataframe and merge the other prepped data onto it
- Data that will be appended to it:
    - Static Fighter Stats (including info from fighter page info)
    - Historical Fight Averages

## Imports

In [67]:
import pandas as pd
import numpy as np
from scipy import stats

## Functions

In [68]:
def check_nulls(df):
    """Count missing values per column, reporting only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Null count for each column of ``df`` that contains at least one null,
        indexed by column name.
    """
    # Boolean mask over columns: True where the column holds one or more nulls
    has_nulls = df.isnull().sum() != 0
    affected = df.loc[:, has_nulls]
    return affected.isnull().sum()

## Pull in Training Stub

In [69]:
# Load the training stub; the CSV's first column is used as the index
train = pd.read_csv(
    filepath_or_buffer='../../02_Data/02_Processed_Data/train_stub.csv',
    index_col=0,
)

## Get Fighter_Static_Stats

In [70]:
# Load the static per-fighter stats; the CSV's first column is used as the index.
# NOTE(review): the merged frame later lists 'f1_Unnamed: 0' / 'f2_Unnamed: 0', so this CSV
# appears to carry an extra saved-index column -- confirm and drop it upstream.
fighter_static_stats = pd.read_csv(
    filepath_or_buffer='../../02_Data/02_Processed_Data/fighter_static_stats.csv',
    index_col=0,
)

## Get Historical Fight Averages

In [71]:
# Load the historical fight averages; the CSV's first column is used as the index
hist_avg = pd.read_csv(
    filepath_or_buffer='../../02_Data/02_Processed_Data/historical_avgs.csv',
    index_col=0,
)

## Append static fighter stats and historical fight averages to stub

In [73]:
def _with_prefix(frame, prefix, keep=()):
    """Return a copy of ``frame`` with ``prefix`` prepended to every column name not in ``keep``.

    Columns are renamed by name rather than by assigning a positional list to
    ``.columns``, so the labels stay correct whatever the column order of the
    source CSV is.
    """
    return frame.rename(columns=lambda col: col if col in keep else prefix + col)


# --- Static fighter stats, merged once per corner ---
# Every column is prefixed, so the stats' `fighterid` column becomes the join key
# (`f1_fighterid` / `f2_fighterid`). Columns that already exist in `train` under the
# prefixed name receive pandas' default `_x` / `_y` suffixes.
f1_to_merge = _with_prefix(fighter_static_stats, 'f1_')
train = train.merge(f1_to_merge, how='left', on='f1_fighterid')

f2_to_merge = _with_prefix(fighter_static_stats, 'f2_')
train = train.merge(f2_to_merge, how='left', on='f2_fighterid')


# --- Historical averages, merged once per corner ---
# NOTE(review): presumably hist_avg holds one row per fighter per fight, with the fighter id
# stored under `f1_fighterid` for either corner -- verify against the notebook that builds it.
hist_keys = ['eventid', 'fightid', 'f1_fighterid']

# Fighter 1: keep the join keys and `date` unprefixed, prefix everything else with 'f1_'
hist_avg_f1 = _with_prefix(hist_avg, 'f1_', keep=hist_keys + ['date'])

# Inner join: fights without a historical-average row for fighter 1 are dropped
train = train.merge(hist_avg_f1, how='inner', on=hist_keys)

# Fighter 2: `date` is dropped (it was already merged above), the id column is re-labelled so
# the same rows can be matched on fighter 2, and everything else is prefixed with 'f2_'
hist_avg_f2 = (
    _with_prefix(hist_avg.drop(columns=['date']), 'f2_', keep=hist_keys)
    .rename(columns={'f1_fighterid': 'f2_fighterid'})
)

# Inner join: fights without a historical-average row for fighter 2 are dropped
train = train.merge(hist_avg_f2, how='inner', on=['eventid', 'fightid', 'f2_fighterid'])

# drop columns w/o outcome
#train = train.dropna(axis=0, how='any')

In [74]:
# (rows, columns) of the assembled frame after the merges above
train.shape

(3214, 454)

In [75]:
# NOTE(review): this cell is a hard-coded tuple literal, not a computation; it simply echoes
# the value and will not change if the data does.
(3214, 454)

(3214, 454)

In [76]:
# Null counts for the columns of the merged frame that contain at least one null
check_nulls(train)

f1_outcome     34
f1_reach       17
f1_height_y     1
f2_reach       17
f2_height_y     1
dtype: int64

In [78]:
# Columns carrying pandas' `_x` merge suffix, i.e. names that collided during the merges.
# Matching on the suffix (rather than on any 'x' in the name) avoids false hits from
# columns that merely contain the letter 'x'.
[col for col in train.columns if col.endswith('_x')]

['f1_height_x', 'f1_weight_x', 'f2_height_x', 'f2_weight_x']

In [79]:
# Every column whose name contains 'heigh', in frame order
[name for name in train.columns if 'heigh' in name]

['f1_height_x', 'f2_height_x', 'f1_height_y', 'f2_height_y']

## I have values for height and weight from 2 sources
- Let's see what is different

In [96]:
# Full list of column names in the merged frame
train.columns.tolist()

['eventid',
 'fightid',
 'f1_fullname',
 'f2_fullname',
 'f1_fighterid',
 'f2_fighterid',
 'f1_outcome',
 'f1_slug',
 'f2_slug',
 'f1_height_x',
 'f1_weight_x',
 'f1_stance',
 'f2_height_x',
 'f2_weight_x',
 'f2_stance',
 'f1_Unnamed: 0',
 'f1_reach',
 'f1_weight_y',
 'f1_height_y',
 'f2_Unnamed: 0',
 'f2_reach',
 'f2_weight_y',
 'f2_height_y',
 'date',
 'f1_f1_body_significant_strikes_attempts_em',
 'f1_f1_body_significant_strikes_landed_em',
 'f1_f1_body_total_strikes_attempts_em',
 'f1_f1_body_total_strikes_landed_em',
 'f1_f1_clinch_body_strikes_attempts_em',
 'f1_f1_clinch_body_strikes_landed_em',
 'f1_f1_clinch_head_strikes_attempts_em',
 'f1_f1_clinch_head_strikes_landed_em',
 'f1_f1_clinch_leg_strikes_attempts_em',
 'f1_f1_clinch_leg_strikes_landed_em',
 'f1_f1_clinch_significant_strikes_attempts_em',
 'f1_f1_clinch_significant_strikes_landed_em',
 'f1_f1_clinch_total_strikes_attempts_em',
 'f1_f1_clinch_total_strikes_landed_em',
 'f1_f1_distance_body_strikes_attempts_em',
 'f1

In [98]:
# Show the two fighter-1 weight columns first so they can be compared side by side, followed
# by every other column exactly once (the previous selection repeated both weight columns),
# and preview the first rows instead of rendering the whole frame.
weight_cols = ['f1_weight_x', 'f1_weight_y']
train[weight_cols + [col for col in train.columns if col not in weight_cols]].head(10)

Unnamed: 0,f1_weight_x,f1_weight_y,eventid,fightid,f1_fullname,f2_fullname,f1_fighterid,f2_fighterid,f1_outcome,f1_slug,...,f2_total_strikes_percent_diff_em,f2_significant_strikes_percent_diff_em,f2_clinch_significant_strikes_percent_diff_em,f2_distance_body_strikes_percent_diff_em,f2_distance_leg_strikes_percent_diff_em,f2_clinch_leg_strikes_percent_diff_em,f2_distance_strikes_percent_diff_em,f2_clinch_head_strikes_percent_diff_em,f2_distance_head_strikes_percent_diff_em,f2_clinch_body_strikes_percent_diff_em
0,205,205.0,644,4531,Anthony Perosh,Ryan Bader,175,503,Loss,anthonyperosh,...,-0.142857,-0.177570,-0.294118,0.000000,0.000000,1.000000,-0.075630,1.000000,-0.075630,-1.000000
1,170,170.0,701,5191,Brandon Thatch,Benson Henderson,1331,1066,Loss,brandonthatch,...,0.033836,0.058221,0.015409,-0.131298,0.199837,0.600000,0.016903,-0.018182,-0.162089,0.166667
2,145,145.0,701,5126,Cole Miller,Max Holloway,522,1936,Loss,colemiller,...,0.150662,0.138108,0.113008,0.116013,0.095316,0.000000,0.105778,0.105494,0.129372,0.200000
3,170,170.0,701,5192,Kiichi Kunimoto,Neil Magny,2131,2047,Loss,kiichikunimoto,...,0.009283,0.050124,0.319231,-0.067327,-0.258916,0.285714,0.032183,0.425325,0.190398,0.336538
4,205,185.0,701,5193,Patrick Walsh,Dan Kelly,2283,2324,Loss,patrickwalsh,...,0.045501,0.020408,-0.166667,0.000000,0.000000,0.000000,0.010513,-0.166667,-0.055901,0.000000
5,155,155.0,701,5194,Kevin Lee,Michel Prazeres,2157,2074,Win,kevinlee,...,0.168854,0.105557,-0.056522,-0.416667,0.000000,0.000000,0.069763,0.500000,0.193856,0.071429
6,125,125.0,701,5128,Chris Kelades,Ray Borg,2332,2217,Loss,chriskelades,...,0.085226,0.435967,-0.240385,0.000000,0.000000,-0.500000,0.537190,-0.166667,0.607843,-0.500000
7,155,155.0,701,5195,Rodrigo de Lima,Efrain Escudero,2257,542,Loss,rodrigodelima,...,0.037037,0.073631,0.411765,0.250000,0.000000,0.000000,0.227378,1.000000,0.320228,-0.333333
8,145,145.0,701,5270,Jim Alers,Chas Skelly,2202,2214,Loss,jimalers,...,-0.162911,-0.155084,0.369458,-0.666667,0.000000,0.333333,-0.401827,0.370370,-0.399582,-0.222222
9,125,125.0,701,5127,Timothy Elliott,Zach Makovsky,1970,2152,Loss,timelliott,...,-0.031645,0.053921,-0.487179,0.111111,0.677419,-0.333333,0.099199,-1.000000,0.092835,-0.666667


In [94]:
# Build the display order: the two fighter-1 weight columns first, then every column whose
# name does not contain 'weight'. List concatenation is used because `list.extend` mutates
# in place and returns None, which previously left `cols_to_show` set to None.
cols_to_show = ['f1_weight_x', 'f1_weight_y'] + [col for col in train.columns if 'weight' not in col]
#train[cols_to_show]
cols_to_show

In [101]:
# Rows where the two fighter-1 weight columns disagree (a null on either side counts as a
# mismatch, as it did with the row-wise comparison). The comparison is vectorized instead of
# a per-row `apply`, and the two weight columns lead the view without being repeated.
weight_cols = ['f1_weight_x', 'f1_weight_y']
weight_mismatch = train['f1_weight_x'] != train['f1_weight_y']
train.loc[weight_mismatch, weight_cols + [col for col in train.columns if col not in weight_cols]].head()

Unnamed: 0,f1_weight_x,f1_weight_y,eventid,fightid,f1_fullname,f2_fullname,f1_fighterid,f2_fighterid,f1_outcome,f1_slug,...,f2_total_strikes_percent_diff_em,f2_significant_strikes_percent_diff_em,f2_clinch_significant_strikes_percent_diff_em,f2_distance_body_strikes_percent_diff_em,f2_distance_leg_strikes_percent_diff_em,f2_clinch_leg_strikes_percent_diff_em,f2_distance_strikes_percent_diff_em,f2_clinch_head_strikes_percent_diff_em,f2_distance_head_strikes_percent_diff_em,f2_clinch_body_strikes_percent_diff_em
4,205,185.0,701,5193,Patrick Walsh,Dan Kelly,2283,2324,Loss,patrickwalsh,...,0.045501,0.020408,-0.166667,0.0,0.0,0.0,0.010513,-0.166667,-0.055901,0.0
20,155,170.0,869,7155,Chad Laprise,Vicente Luque,2222,2588,Loss,chadlaprise,...,0.118147,0.092867,0.252026,0.108075,-0.08775,0.171429,0.052793,-0.170068,-0.078212,0.008403
21,155,170.0,869,7195,Michel Prazeres,Zak Cummings,2074,1422,Win,michelprazeres,...,-0.216969,-0.2518,0.260033,-0.244818,0.186143,0.625,-0.290281,0.006579,-0.342595,-0.013889
42,155,170.0,717,5436,Yancy Medeiros,Dustin Poirier,1481,1595,Loss,yancymedeiros,...,0.180799,0.185059,0.044451,0.000902,0.127273,-0.133333,0.227766,0.229487,0.193634,-0.011765
47,170,185.0,717,5410,Omari Akhmedov,Brian Ebersole,2104,1209,Win,omariakhmedov,...,-0.032695,-0.041684,0.327815,-0.049296,-0.218391,0.461538,-0.089057,0.481481,-0.085157,0.030303


In [103]:
# Rows where the two fighter-1 height columns disagree (a null on either side counts as a
# mismatch, as it did with the row-wise comparison). The comparison is vectorized instead of
# a per-row `apply`, and the two height columns lead the view without being repeated.
height_cols = ['f1_height_x', 'f1_height_y']
height_mismatch = train['f1_height_x'] != train['f1_height_y']
train.loc[height_mismatch, height_cols + [col for col in train.columns if col not in height_cols]]

Unnamed: 0,f1_height_x,f1_height_y,eventid,fightid,f1_fullname,f2_fullname,f1_fighterid,f2_fighterid,f1_outcome,f1_slug,...,f2_total_strikes_percent_diff_em,f2_significant_strikes_percent_diff_em,f2_clinch_significant_strikes_percent_diff_em,f2_distance_body_strikes_percent_diff_em,f2_distance_leg_strikes_percent_diff_em,f2_clinch_leg_strikes_percent_diff_em,f2_distance_strikes_percent_diff_em,f2_clinch_head_strikes_percent_diff_em,f2_distance_head_strikes_percent_diff_em,f2_clinch_body_strikes_percent_diff_em
24,71,73.0,869,7212,Humberto Bandenay,Gabriel Benitez,2960,2360,Loss,humbertobandenay,...,0.129619,0.159548,-0.351214,0.191144,0.049718,-0.333333,0.184637,-0.291317,0.111206,-0.166667
31,73,71.0,740,5699,George Roop,Takeya Mizugaki,454,1051,Loss,georgeroop,...,-0.152856,-0.199400,-0.061704,-0.295463,0.000000,-0.181818,-0.173780,-0.216393,-0.072039,0.000000
35,68,69.0,652,4604,Jeremy Stephens,Darren Elkins,545,1486,Win,jeremystephens,...,-0.135217,-0.148148,-0.333333,-0.232877,-1.000000,0.000000,-0.262369,-0.500000,-0.146919,-0.200000
117,68,69.0,795,6343,Magomed Mustafaev,Kevin Lee,2529,2157,Loss,magomedmustafaev,...,-0.040832,-0.070165,0.207261,0.151197,-0.461111,-0.125000,-0.173092,0.158694,-0.147686,0.109706
120,63,65.0,795,6376,Milana Dudieva,Marion Reneau,2235,2376,Loss,milanadudieva,...,0.115764,0.089892,-0.071780,0.085473,0.055323,0.000000,0.140245,0.207113,0.140723,-0.255814
127,76,74.0,845,6954,Mark Godbeer,Walt Harris,2827,2106,,markgodbeer,...,-0.030356,-0.008465,0.276931,-0.058294,0.035693,0.000000,-0.040608,0.335743,-0.096686,0.017857
141,66,67.0,865,7211,Ricky Simon,Merab Dvalishvili,2971,3020,Win,rickysimon,...,0.003356,-0.157450,-0.070339,-0.304348,-0.230769,0.000000,-0.132353,0.100917,-0.011259,-0.056911
154,78,79.0,808,6523,Travis Browne,Derrick Lewis,1515,2082,Loss,travisbrowne,...,-0.101068,0.017632,0.261318,0.333333,-0.366667,-0.100000,-0.019982,0.383448,0.496654,0.165714
170,68,69.0,721,5418,Jeremy Stephens,Dennis Bermudez,545,1907,Win,jeremystephens,...,-0.005499,-0.051242,-0.060540,-0.060606,0.219388,0.537234,-0.063877,0.231325,-0.214971,0.125000
194,66,64.0,776,6147,Matheus Nicolau,John Moraga,2578,2007,Win,matheusnicolau,...,-0.100831,-0.125325,0.183742,-0.129527,-0.112252,0.357143,-0.152349,0.200000,-0.148536,0.226013


In [None]:
# NOTE(review): unexecuted scratch cell -- `df` is not defined anywhere in the visible
# notebook, so this raises NameError on a fresh Restart & Run All. Confirm whether it should
# be adapted to `train` or removed.
# Per row: yields x['one'] when x['two'] <= x['one'] <= x['three'], otherwise an empty string.
df.apply(lambda x : x['one'] if x['one'] >= x['two'] and x['one'] <= x['three'] else "", axis=1)

## Okay, the missing data isn't too bad now...
- 34 outcomes => look up
- 17 reach => impute with height
- 1 alt height => check other side


## Export the training data set

In [11]:
# Persist the assembled training set (the index is written as the first CSV column)
train.to_csv(path_or_buf='../../02_Data/02_Processed_Data/train.csv')

## There are some more transformations I need to do to the dataset...
- **TODO:** compute differentials between f1 and f2 features (approach still to be decided)