# 03 Transform V2 Fight Details to add Percentages and Differentials
- Calculate historical average of fight stats for each fight
- Everything is from the perspective of the F1 fighter
- Stats for the F1 fighter are basically an "offensive" rating for the F1 fighter
- Stats for the F2 fighter are basically an inverse "defensive" rating for the F1 fighter
- We also want to look at the difference in the stats for each fight, and then historical average difference

## Imports

In [57]:
import pandas as pd
import re

## Pull in V2_Fight_Details CSV

In [58]:
df = pd.read_csv('../../02_Data/02_Processed_Data/V2_Fight_Details.csv',index_col=0)

#### Notes on this initial set of data:
- No nulls currently since I dropped them all already
- No time in position details since they had nulls
- Only 1 side.  Not flipped and appended yet

## Append a flipped (F1/F2 swapped) copy of every fight before the transformations

In [59]:
# Setup Columns Again
# Partition the columns by the fighter they describe (case-insensitive substring
# match on 'f1' / 'f2'); everything else is treated as fight-level metadata.
F1_Columns = [col for col in df.columns if 'f1' in col.lower()]
F2_Columns = [col for col in df.columns if 'f2' in col.lower()]
Other_Columns = [col for col in df.columns if not 'f2' in col.lower() and not 'f1' in col.lower()]

# The swap below pairs F1 and F2 columns purely by position, so the two lists
# must at least be the same length; otherwise values would land under the
# wrong column names without any error.
if len(F1_Columns) != len(F2_Columns):
    raise ValueError(
        f'Cannot flip fighters: {len(F1_Columns)} F1 columns vs '
        f'{len(F2_Columns)} F2 columns'
    )
# NOTE(review): the positional pairing also assumes the F1 and F2 columns appear
# in the same relative order in the CSV, and that no perspective-dependent
# column lacks an f1/f2 marker (such a column would not be flipped) - confirm.

Ordered_Columns = Other_Columns + F1_Columns + F2_Columns
Flipped_Columns = Other_Columns + F2_Columns + F1_Columns

# Put Columns in Order
df = df[Ordered_Columns]

# Create Flipped df: select the columns in swapped order, then relabel them so
# the former F2 values sit under the F1 names (and vice versa).
flipped_df = df[Flipped_Columns].copy()
flipped_df.columns = Ordered_Columns

# Concatenate df and flipped_df
df = pd.concat([df, flipped_df])

# Reset the index (drop=True discards the old index whatever its name is)
df = df.reset_index(drop=True)

## Setup Column References for Easy Access  *Do not delete*

In [60]:
# These columns are used in the transformations
# Column groups are selected by case-insensitive substring match on the column name.
F1_Columns = [col for col in df.columns if 'f1' in col.lower()]
F2_Columns = [col for col in df.columns if 'f2' in col.lower()]
Other_Columns = [col for col in df.columns if not 'f2' in col.lower() and not 'f1' in col.lower()]

# NOTE(review): the knock-down name is hard-coded in mixed case. In this cell it
# only feeds the set difference below, so a casing mismatch with df.columns
# would pass silently - confirm it matches the real column name.
F1_Strikes = [col for col in F1_Columns if 'strikes' in col.lower()] + ['F1_Knock_Down_Landed']
F1_Grappling = [col for col in F1_Columns if 'grappling' in col.lower()]
F1_TIP = [col for col in F1_Columns if 'tip' in col.lower()]
F1_Identification = ['F1_FighterID','F1_Name'] # Expected contents only; overwritten on the next line so newly added columns are picked up
# Identification = every F1 column that is not a strike, grappling or TIP column.
# The list comes from a set, so its order is arbitrary.
F1_Identification = list(set(F1_Columns) - set(F1_Strikes) - set(F1_Grappling) - set(F1_TIP))

# Same grouping for the F2 columns.
F2_Strikes = [col for col in F2_Columns if 'strikes' in col.lower()] + ['F2_Knock_Down_Landed']
F2_Grappling = [col for col in F2_Columns if 'grappling' in col.lower()]
F2_TIP = [col for col in F2_Columns if 'tip' in col.lower()]
F2_Identification = ['F2_FighterID','F2_Name'] # Expected contents only; overwritten on the next line
F2_Identification = list(set(F2_Columns) - set(F2_Strikes) - set(F2_Grappling) - set(F2_TIP))

## Perform Transformation:  Calculate success percent for every pair of columns ending in "_landed" and "_attempts"

In [61]:
def Perform_calc_success_percent(df):
    """Add a ``<root>_percent`` column for every landed/attempts column pair.

    The column roots come from ``get_attempts_landed_columns`` and each
    percentage is computed by ``calc_success_percent``. Column counts before
    and after are printed as a quick sanity check.

    Args:
        df: DataFrame holding the ``*_landed`` / ``*_attempts`` columns.

    Returns:
        A ``(df, new_columns)`` tuple, where ``new_columns`` lists the columns
        that were not present before the call, in the order they appear in
        ``df``.
    """
    Both_Landed_Attempts, _, _ = get_attempts_landed_columns(df)
    columns_before = set(df.columns)
    num_columns_before = df.shape[1]
    print(f'num columns before: {num_columns_before}')
    # The roots arrive as a set; sorting makes the order in which the new
    # columns are appended (and so the exported column order) reproducible.
    for col in sorted(Both_Landed_Attempts):
        df = calc_success_percent(df, col)
    num_columns_after = df.shape[1]
    print(f"# Columns Added: {num_columns_after-num_columns_before}")
    print(f'num columns after: {num_columns_after}')
    # Keep the DataFrame's own column order instead of a set's arbitrary one.
    new_columns = [col for col in df.columns if col not in columns_before]
    return df, new_columns

In [62]:
def calc_success_percent(df, col_root):
    """Add ``<col_root>_percent`` = landed / attempts, with 0 where attempts is 0.

    Args:
        df: DataFrame containing ``<col_root>_landed`` and
            ``<col_root>_attempts``. It is modified in place.
        col_root: Column name without the ``_landed`` / ``_attempts`` suffix.

    Returns:
        The same DataFrame object, with the new percent column added.

    Raises:
        KeyError: If either source column is missing.
    """
    attempts = col_root + '_attempts'
    landed = col_root + '_landed'
    success_percent = col_root + '_percent'

    attempted = df[attempts]
    # Vectorised division: pandas yields inf/NaN for x/0 instead of raising,
    # and those positions are then overwritten with 0. Unlike a row-wise
    # apply, this also works on an empty frame.
    ratio = df[landed] / attempted
    df[success_percent] = ratio.where(attempted != 0, 0)

    return df

#calc_success_percent(df, 'F1_Body_Significant_Strikes')

In [63]:
def get_attempts_landed_columns(df):
    """Group column roots by whether they have ``_attempts`` and/or ``_landed``.

    A column counts only when its name *ends* with the suffix (compared
    case-insensitively); the root is the name with that suffix removed, in its
    original casing. Anchoring to the end keeps derived columns such as
    ``..._landed_diff`` from being mistaken for raw stat columns.

    Note: ``calc_success_percent`` rebuilds column names by appending the
    lower-case suffixes, so it only finds the pair when the real suffixes are
    lower-case.

    Args:
        df: DataFrame whose string column names are inspected.

    Returns:
        A tuple of three sets of roots:
        ``(Both_Landed_Attempts, Attempts_Only, Landed_Only)``.
    """
    attempts_suffix = '_attempts'
    landed_suffix = '_landed'
    attempts_roots = set()
    landed_roots = set()
    for column in df.columns:
        lowered = column.lower()
        if lowered.endswith(attempts_suffix):
            attempts_roots.add(column[:-len(attempts_suffix)])
        elif lowered.endswith(landed_suffix):
            landed_roots.add(column[:-len(landed_suffix)])
    Attempts_Only = attempts_roots - landed_roots
    Landed_Only = landed_roots - attempts_roots
    Both_Landed_Attempts = landed_roots & attempts_roots
    return Both_Landed_Attempts, Attempts_Only, Landed_Only

### Run the Function

In [64]:
df, Landed_Percent = Perform_calc_success_percent(df)

num columns before: 107
# Columns Added: 46
num columns after: 153


## Perform Transformation:  Calculate In Fight Differentials

In [65]:
f1_calc_columns = list(set(F1_Columns) - set(F1_Identification) - set(F1_TIP))

In [66]:
# I also need to get the percent columns
# At this point the df already has the percent columns... so just grab them from the df directly
percent_cols = [col[3:] for col in df.columns if 'percent' in col and 'f1' in col]

In [67]:
def Perform_calc_fight_stat_differential(df):
    """Add a ``<root>_diff`` column for every F1 stat and F1 percent column.

    The stat roots are the module-level ``F1_Columns`` minus
    ``F1_Identification`` and ``F1_TIP``; the percent roots are read from
    ``df`` itself. Each root has its first three characters (the ``f1_``
    prefix) removed before being passed to ``calc_fight_stat_differential``.
    Column counts before and after are printed as a sanity check.

    Args:
        df: DataFrame with matching ``f1_<root>`` / ``f2_<root>`` columns.

    Returns:
        A ``(df, new_columns)`` tuple, where ``new_columns`` lists the columns
        that were not present before the call, in the order they appear in
        ``df``.
    """
    # Percent columns are taken straight from df so they are included even if
    # F1_Columns was built before they existed.
    percent_cols = [col[3:] for col in df.columns if 'percent' in col and 'f1' in col]
    excluded = set(F1_Identification) | set(F1_TIP)
    # Walk F1_Columns in order (rather than via set arithmetic) so the diff
    # columns are created in a reproducible order.
    for_calc_columns = [col[3:] for col in F1_Columns if col not in excluded]  # Remove the F1_ part
    for_calc_columns.extend(percent_cols)
    # Drop duplicate roots while keeping first-seen order.
    for_calc_columns = list(dict.fromkeys(for_calc_columns))
    columns_before = set(df.columns)
    num_columns_before = df.shape[1]
    print(f'num columns before: {num_columns_before}')
    for col in for_calc_columns:
        df = calc_fight_stat_differential(df, col)
    num_columns_after = df.shape[1]
    print(f"# Columns Added: {num_columns_after-num_columns_before}")
    print(f'num columns after: {num_columns_after}')
    new_columns = [col for col in df.columns if col not in columns_before]
    return df, new_columns

In [68]:
def calc_fight_stat_differential(df, col_root):
    """Add ``<col_root>_diff`` = (f1 - f2) / (f1 + f2), with 0 where f1 + f2 is 0.

    Args:
        df: DataFrame containing ``f1_<col_root>`` and ``f2_<col_root>``.
            It is modified in place.
        col_root: Stat name without the ``f1_`` / ``f2_`` prefix.

    Returns:
        The same DataFrame object, with the new diff column added.

    Raises:
        KeyError: If either source column is missing.
    """
    f1 = 'f1_' + col_root
    f2 = 'f2_' + col_root
    diff = col_root + '_diff'
    total = df[f1] + df[f2]
    # Vectorised: division by a zero total gives inf/NaN rather than raising,
    # and those positions are then overwritten with 0.
    df[diff] = ((df[f1] - df[f2]) / total).where(total != 0, 0)
    return df

# calc_fight_stat_differential(df, col_root)

### Run the Function

In [69]:
df, stat_diff = Perform_calc_fight_stat_differential(df)

num columns before: 153
# Columns Added: 71
num columns after: 224


## This dataset is currently missing the date.  Join it from the Events_df

In [72]:
# Load the scraped events table and keep only the event id and date, renamed
# to the lower-case names used for the join.
path = '../../02_Data/01_Raw_Scraped_Data/Events/events_df.csv'

events_df = (
    pd.read_csv(path, index_col=0)
    .loc[:, ['EventId', 'Date']]
    .rename(index=str, columns={'EventId': 'eventid', 'Date': 'date'})
)

In [73]:
df = df.merge(events_df, on='eventid')

In [74]:
df.shape

(4774, 225)

## Processing for V2 Df is done.  Export it

In [75]:
df.to_csv('../../02_Data/02_Processed_Data/V2_Processed.csv')

In [44]:
!ls ../../02_Data/02_Processed_Data/

V1_DF.csv                       df_ems.csv
V1_DF_w_flipped.csv             fighter_page_info.csv
V2_Fight_Details.csv            fighter_static_stats.csv
V2_Fight_Details_Munged0711.csv train.csv
V2_Processed.csv                train_stub.csv


In [67]:
# Training Stub Data
train_stub = ['eventid','fightid','f1_fullname','f2_fullname','f1_fighterid','f2_fighterid','f1_outcome']
train = V1_df[train_stub].reset_index(drop=True)

# Columns of df_ems that identify a row and therefore keep their own name;
# every other column is a stat that gets a fighter prefix.
ems_keys = ['eventid','fightid','f1_fighterid','date']

# First prefix 'f1_' to all fighter 1 stats. Renaming by column name (not by
# position) keeps the labels right whatever order df_ems stores its columns in.
df_ems_f1 = df_ems.rename(columns=lambda col: col if col in ems_keys else 'f1_' + col)

# Merge F1 Expanding Means
train = train.merge(df_ems_f1, on=['eventid','fightid','f1_fighterid'])

# Setup the fighter 2 view: same rows, but the fighter id becomes
# 'f2_fighterid', stats get an 'f2_' prefix, and 'date' is dropped because the
# F1 merge already brought it in.
f2_names = {col: 'f2_' + col for col in df_ems.columns if col not in ems_keys}
f2_names['f1_fighterid'] = 'f2_fighterid'
df_ems_f2 = df_ems.drop(columns=['date']).rename(columns=f2_names)

# Merge em for fighter 2
train = train.merge(df_ems_f2, on=['eventid','fightid','f2_fighterid'])

# Drop every row that has a missing value in any column
train = train.dropna(axis=0, how='any')

# Check empty columns:
check_nulls(train)

Series([], dtype: float64)

# Need to clean up data a bit more before I can model
- Label from win/loss to 1/0

In [69]:
train.f1_outcome.value_counts()

Win     1590
Loss    1590
Name: f1_outcome, dtype: int64

In [71]:
train['outcome'] = train.f1_outcome.map(lambda x: 1 if x=='Win' else 0)

# Export Train dataset for some EDA

In [72]:
train.to_csv('../../02_Data/02_Processed_Data/train.csv')

# Now the data is "prepped" and I can try to train test split it

In [74]:
unique_ids = set(train.f1_fighterid)

In [77]:
unique_ids = list(set(train.f1_fighterid))

# Hold out each fighter's most recent fights as test data:
#   more than 6 fights -> the latest 2 go to test
#   more than 3 fights -> the latest 1 goes to test
#   otherwise          -> everything stays in train
# Slices are collected and concatenated once at the end; groupby visits the
# fighters in sorted id order, so the resulting row order is reproducible.
# NOTE(review): 'date' is sorted as stored; if it is a string column that is
# not in ISO format, "most recent" will be lexical rather than chronological - confirm.
# NOTE(review): if train holds both perspectives of each fight, a fight held
# out for one fighter can still appear in train_full as the opponent's row - confirm.
test_parts = []
train_parts = []
for _, sub_df in train.groupby('f1_fighterid'):
    sub_df = sub_df.sort_values('date', ascending=False)
    n_fights = sub_df.shape[0]
    if n_fights > 6:
        n_test = 2
    elif n_fights > 3:
        n_test = 1
    else:
        n_test = 0
    if n_test:
        test_parts.append(sub_df[:n_test])
    train_parts.append(sub_df[n_test:])

# Fall back to an empty frame with train's columns when nothing was collected.
test_full = pd.concat(test_parts, axis=0) if test_parts else train[0:0]
train_full = pd.concat(train_parts, axis=0) if train_parts else train[0:0]

In [78]:
 len(test_full)/len(train_full)

0.18745332337565349

# Split out X and Y

In [79]:
# Separate the target from the candidate features of the training rows.
label = 'outcome'
features = [name for name in train_full.columns if name != label]
y_train = train_full[label]

# Keep only non-object (numeric) columns, then remove the id columns, which
# are identifiers rather than predictive features.
X_train = (
    train_full[features]
    .select_dtypes(exclude='object')
    .drop(columns=['eventid','fightid','f1_fighterid','f2_fighterid'])
)

In [80]:
X_train.shape

(2678, 384)

In [81]:
# Separate the target from the candidate features of the held-out rows.
label = 'outcome'
features = [name for name in test_full.columns if name != label]
y_test = test_full[label]

# Keep only non-object (numeric) columns, then remove the id columns, which
# are identifiers rather than predictive features.
X_test = (
    test_full[features]
    .select_dtypes(exclude='object')
    .drop(columns=['eventid','fightid','f1_fighterid','f2_fighterid'])
)

## Let's build a quick and dirty model and see what happens

In [87]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

In [83]:
# Baseline: logistic regression with default settings; report accuracy on the
# training rows and on the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
for _tag, _X, _y in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    print(_tag, lr.score(_X, _y))

Train: 0.6631814787154593
Test: 0.5677290836653387


In [84]:
# Random forest baseline. The seed is fixed, like the boosting models in this
# notebook, so the reported scores are reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print('Train:', rf.score(X_train,y_train))
print('Test:',rf.score(X_test,y_test))

Train: 0.9873039581777446
Test: 0.547808764940239


In [85]:
# AdaBoost with an empty parameter grid: GridSearchCV then only cross-validates
# the default configuration and refits it on all of X_train.
ada_params = {}
ada = AdaBoostClassifier(random_state=42)
gs = GridSearchCV(ada, param_grid=ada_params).fit(X_train, y_train)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:', gs.score(X_test, y_test))

Best Score: 0.5418222554144885
Best Parameters: {}
Test: 0.5597609561752988


### GradientBoosting with GridSearch

In [976]:
%%time
gb = GradientBoostingClassifier(random_state=42)
gb_params = {
    'learning_rate': [0.05, 0.1, 0.3],
    'max_depth': [3,4,5]
}
gb_gs = GridSearchCV(gb, param_grid=gb_params, verbose=2, n_jobs=3 )
gb_gs.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] .................. learning_rate=0.05, max_depth=3, total=   3.0s
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] .................. learning_rate=0.05, max_depth=3, total=   3.1s
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] .................. learning_rate=0.05, max_depth=3, total=   3.2s
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] .................. learning_rate=0.05, max_depth=4, total=   4.7s
[CV] learning_rate=0.05, max_depth=5 .................................
[CV] .................. learning_rate=0.05, max_depth=4, total=   4.6s
[CV] learning_rate=0.05, max_depth=5 .................................
[CV] ............

[Parallel(n_jobs=3)]: Done  27 out of  27 | elapsed:   43.7s finished


CPU times: user 7.1 s, sys: 99.8 ms, total: 7.2 s
Wall time: 50.5 s


In [977]:
print('Best Params:', gb_gs.best_params_)
print('Train:', gb_gs.score(X_train,y_train))
print('Test:', gb_gs.score(X_test,y_test))

Best Params: {'learning_rate': 0.05, 'max_depth': 4}
Train: 0.9213528932355338
Test: 0.5772727272727273


### AdaBoost with GridSearch

In [89]:
%%time
ada = AdaBoostClassifier(random_state=42)
ada_params = {
    'n_estimators':[50,100],
    'learning_rate': [0.5, 1.0]
}
ada_gs = GridSearchCV(ada, param_grid=ada_params, verbose=2, n_jobs=3)
ada_gs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] ............... learning_rate=0.5, n_estimators=50, total=   2.1s
[CV] ............... learning_rate=0.5, n_estimators=50, total=   2.1s
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] ............... learning_rate=0.5, n_estimators=50, total=   2.1s
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] .............. learning_rate=0.5, n_estimators=100, total=   4.2s
[CV] learning_rate=1.0, n_estimators=50 ..............................
[CV] .............. learning_rate=0.5, n_estimators=100, total=   4.3s
[CV] learning_rate=1.0, n_estimators=50 ..............................
[CV] ............

[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:   13.1s finished


CPU times: user 3.31 s, sys: 67.1 ms, total: 3.38 s
Wall time: 16.3 s


In [90]:
print('Best Score:', ada_gs.best_score_)
print('Best Parameters:', ada_gs.best_params_)
print('Train:',ada_gs.score(X_train,y_train))
print('Test:',ada_gs.score(X_test,y_test))

Best Score: 0.5418222554144885
Best Parameters: {'learning_rate': 1.0, 'n_estimators': 50}
Train: 0.6747572815533981
Test: 0.5597609561752988
