# 02 Process V2 JSON to Fight Details

In [243]:
# This cell sits above the notebook's imports cell, so pandas is imported here
# as well; otherwise a fresh kernel fails with NameError on `pd`.
import pandas as pd

# Show up to 70 rows when a DataFrame/Series is displayed.
pd.options.display.max_rows = 70

## Imports

In [244]:
import os
import json
import pandas as pd
import re
import numpy as np

## Functions

## TODO:
- Rescrape the data, including the "Ground" stats
- Every key is being converted to lowercase (e.g. "Knock down" vs. "Knock Down")

In [245]:
def Process_All_V2_Fight_Details(rootdir, max_files=300000, progress_every=100):
    """Walk ``rootdir`` and turn every fight JSON file into a flat dictionary.

    Each ``.json`` file is loaded and passed to ``Process_One_V2_Fight_Details``.
    When that result has 30 keys or fewer, the file is re-processed with
    ``Other_Process_One_V2_Fight_Details`` and that result is kept instead.

    Parameters
    ----------
    rootdir : str
        Directory that is walked recursively with ``os.walk``.
    max_files : int, optional
        Safety cap. It is checked after each directory has been fully
        processed, so the walk stops once at least this many files were read.
    progress_every : int, optional
        Print a progress line every this many files; 0 or None disables it.

    Returns
    -------
    list
        One entry per processed file, in the order ``os.walk`` yields them.
    """
    list_V2_fight_dicts = []
    counter = 1
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Skip anything that is not a JSON file (e.g. OS metadata files),
            # which would otherwise make json.load raise and abort the run.
            if not file.lower().endswith('.json'):
                continue

            filepath = os.path.join(subdir, file)
            with open(filepath) as f:
                jsn = json.load(f)

            v2_fight_dict = Process_One_V2_Fight_Details(jsn)
            if len(v2_fight_dict) <= 30:
                # Too few keys came back: fall back to the other parser.
                v2_fight_dict = Other_Process_One_V2_Fight_Details(jsn)
            list_V2_fight_dicts.append(v2_fight_dict)

            if progress_every and counter % progress_every == 0:
                print(f"{counter}: {file}")

            counter += 1

        if counter > max_files:
            break

    return list_V2_fight_dicts

In [246]:
def Other_Process_One_V2_Fight_Details(jsn):
    """Placeholder that ignores ``jsn`` and returns None.

    A full implementation with the same name is defined further down in this
    notebook and replaces this stub once that cell has been run.
    """
    return None

In [247]:
def Process_One_V2_Fight_Details(jsn):
    """Take the parsed JSON of one fight and return a flat, processed dictionary.

    The JSON is flattened with pandas' ``json_normalize``, unwanted columns are
    dropped, the remaining column names are cleaned, and row 0 is returned as a
    dict whose keys are all lowercase.

    Parameters
    ----------
    jsn : dict
        Parsed JSON for a single fight.

    Returns
    -------
    dict
        Lowercased clean column name -> value.
    """
    # pandas >= 1.0 exposes json_normalize at the top level and deprecates the
    # pandas.io.json alias; fall back to the alias only on older versions.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    df = normalize(jsn)

    # Mask Event Round columns to avoid dropping them
    df = df.rename(columns = {'FMLiveFeed.CurrentRound':'FMLiveFeed.CurrentRnd',
                              'FMLiveFeed.CurrentRoundTime': 'FMLiveFeed.CurrentRndTime',
                              'FMLiveFeed.MaxRounds':'FMLiveFeed.MaxRnds'})

    # Drop every column whose name contains one of these words (case-insensitive).
    list_words_to_drop = ['punch','kick','roundstats','type']
    list_columns_to_keep = remove_columns(df.columns, list_words_to_drop)
    df = df[list_columns_to_keep]

    # Clean up column names
    df.columns = cleanse_column_names(df.columns)

    # Create Dictionary from the row labelled 0
    final_dict = df.T.to_dict()[0]

    # Convert all keys to lowercase
    final_dict = {k.lower():v for k, v in final_dict.items()}

    return final_dict

In [248]:
def remove_columns(list_columns, list_words_to_drop):
    """Return the columns whose lowercased name contains none of the drop words.

    The drop words are compared as given (they are not lowercased here), and
    the original column order is preserved.
    """
    def _is_wanted(name):
        lowered = name.lower()
        for drop_word in list_words_to_drop:
            if drop_word in lowered:
                return False
        return True

    return [column for column in list_columns if _is_wanted(column)]

In [249]:
def cleanse_column_names(list_columns_to_keep):
#   Take in "dirty" list of column names, return "clean" list of column names
    list_clean_column_names = [cleanse_column_name(column) for column in list_columns_to_keep]
    return list_clean_column_names

In [250]:
def cleanse_column_name(column):
    """Take one flattened column name and return the cleaned-up name.

    The substitutions are applied in order, and the order matters: the
    dot-based patterns must run before periods are turned into underscores.
    Patterns are raw strings so that ``\\.`` is not an invalid escape sequence.

    NOTE(review): in 'FMLiveFeed.', '.Blue.', '.Red.' and 'FightStats.' the
    dot is an unescaped regex wildcard (matches any character). That is kept
    as-is so the output is unchanged; confirm whether a literal dot was meant.
    """
    substitutions = (
        (r' ', '_'),                                   # spaces -> underscores
        (r'FMLiveFeed.', ''),                          # get rid of FMLiveFeed
        (r'.Blue.', '.F1.'),                           # Blue -> F1
        (r'.Red.', '.F2.'),                            # Red -> F2
        (r'FightStats.', ''),                          # get rid of FightStats
        (r'\.Strikes\.', '.'),                         # get rid of the Strikes level
        (r'Fighters\.', ''),                           # get rid of the Fighters level
        (r'\.', '_'),                                  # remaining periods -> underscores
        (r'Leg_Total_Strikes', 'Legs_Total_Strikes'),  # fix column naming error for V2a
    )
    for pattern, replacement in substitutions:
        column = re.sub(pattern, replacement, column)
    return column

In [251]:
def Other_Process_One_V2_Fight_Details(jsn):
    """Build the flat fight dictionary for the alternative V2 JSON layout.

    Reads the Strikes, Grappling and TIP lists for both fighters from
    ``jsn['FMLiveFeed']['FightCard']['Fight'][2]['FightStats']``, parses them
    into one dictionary (F1 first, then F2), adds the event-level keys and the
    fighter IDs/names, and returns the result with all keys lowercased.
    """
    fight_stats = jsn['FMLiveFeed']['FightCard']['Fight'][2]['FightStats']

    # Pull all six lists out first so a malformed file fails before any parsing.
    per_fighter = []
    for position, fighter_string in enumerate(('F1', 'F2')):
        fighter_sections = fight_stats[position]['Fighter']
        per_fighter.append((
            fighter_string,
            fighter_sections[0]['Strikes'],
            fighter_sections[1]['Grappling'],
            fighter_sections[2]['TIP'],
        ))

    # Parse in a fixed order: strikes, grappling, then time-in-position.
    clean_dict = {}
    for fighter_string, list_strikes, list_grappling, list_tip in per_fighter:
        clean_dict = parse_list_strikes_tip(clean_dict, list_strikes, fighter_string)
        clean_dict = parse_list_grappling(clean_dict, list_grappling, fighter_string)
        clean_dict = parse_list_strikes_tip(clean_dict, list_tip, fighter_string)

    clean_dict = get_other_keys(clean_dict, jsn)
    clean_dict = add_IDs_and_names(clean_dict, jsn)

    # Convert all keys to lowercase
    return {key.lower(): value for key, value in clean_dict.items()}

In [252]:
def parse_list_grappling(clean_dict, list_dicts, fighter_string):
    """Copy grappling counts from ``list_dicts`` into ``clean_dict`` and return it.

    Keys are built as ``<fighter_string>_Grappling_<Name>_<Attempts|Landed>``,
    with spaces in the stripped name replaced by underscores. A 'Success'
    value is stored under the ``_Landed`` key and therefore overrides 'Landed'
    when both are present. Entries whose name mentions kick or punch are
    skipped (per the original author these are always null).
    """
    # (field in the source dict, suffix of the output key), in processing order
    field_to_suffix = (('Attempts', 'Attempts'), ('Landed', 'Landed'), ('Success', 'Landed'))

    for entry in list_dicts:
        raw_name = entry['Name']
        lowered = raw_name.lower()
        if 'kick' in lowered or 'punch' in lowered:
            continue

        key_prefix = f"{fighter_string}_Grappling_{re.sub(' ','_',raw_name.strip())}"
        for source_field, suffix in field_to_suffix:
            if source_field in entry:
                clean_dict[f"{key_prefix}_{suffix}"] = try_int(entry[source_field])

    return clean_dict

In [253]:
def parse_list_strikes_tip(clean_dict, list_dicts, fighter_string):
    """Copy strike counts and time-in-position values into ``clean_dict``.

    For every entry in ``list_dicts`` (skipping names that mention kick or
    punch, which per the original author are always null):

    * 'Attempts' -> ``<fighter_string>_<Name>_Attempts``
    * 'Landed' and 'Success' -> ``<fighter_string>_<Name>_Landed``
      ('Success' is applied last and wins when both are present)
    * 'Time' -> ``<fighter_string>_TIP_<Name>`` with periods turned into
      underscores; the value is stored unchanged.

    Count values go through ``try_int``. Spaces in the stripped name become
    underscores. The same ``clean_dict`` object is mutated and returned.
    """
    # (field in the source dict, suffix of the output key), in processing order
    count_fields = (('Attempts', 'Attempts'), ('Landed', 'Landed'), ('Success', 'Landed'))

    for one_dict in list_dicts:
        lowered_name = one_dict['Name'].lower()
        if 'kick' in lowered_name or 'punch' in lowered_name:
            continue

        stat_name = re.sub(' ', '_', one_dict['Name'].strip())

        for source_field, suffix in count_fields:
            if source_field in one_dict:
                clean_dict[f"{fighter_string}_{stat_name}_{suffix}"] = try_int(one_dict[source_field])

        if 'Time' in one_dict:
            # Raw-string pattern: a plain '\.' literal is an invalid escape sequence.
            tip_key = re.sub(r'\.', '_', f"{fighter_string}_TIP_{stat_name}")
            clean_dict[tip_key] = one_dict['Time']

    return clean_dict

In [254]:
def get_other_keys(clean_dict, jsn):
    """Add the event-level fields of ``jsn`` to ``clean_dict`` and return it.

    The JSON is flattened with ``flatten_json``; keys mentioning RoundStats,
    FightStats or Fighters are discarded, and every remaining key found in the
    alias table below is stored in ``clean_dict`` under its alias.

    NOTE(review): 'Missing1' looks like a placeholder with no matching source
    field, which would leave CurrentRndTime unset for these files -- confirm.
    """
    column_aliases = {
        'FMLiveFeed_FightCard_Accolade': 'Accolade',
        'FMLiveFeed_FightCard_CurRound': 'CurrentRnd',
        'Missing1': 'CurrentRndTime',
        'FMLiveFeed_EventID': 'EventID',
        'FMLiveFeed_FightCard_FightID': 'FightID',
        'FMLiveFeed_FightCard_TotalNumOfRounds': 'MaxRnds',
        'FMLiveFeed_FightCard_Referee': 'Referee',
        'FMLiveFeed_FightCard_Status': 'Status',
        'Timestamp': 'Timestamp',
        'FMLiveFeed_FightCard_WeightClass': 'WeightClass',
    }
    excluded_markers = ('RoundStats', 'FightStats', 'Fighters')

    # Flatten the file and keep only the keys that mention no excluded marker.
    event_fields = {
        flat_key: flat_value
        for flat_key, flat_value in flatten_json(jsn).items()
        if not any(marker in flat_key for marker in excluded_markers)
    }

    # Copy the recognised fields across under their clean names.
    for flat_key, flat_value in event_fields.items():
        alias = column_aliases.get(flat_key)
        if alias is not None:
            clean_dict[alias] = flat_value

    return clean_dict

In [255]:
def add_IDs_and_names(clean_dict, jsn):
    """Add F1/F2 fighter IDs and names to ``clean_dict`` and return it.

    Names are looked up by FighterID in ``Fight[0]['Fighters']``; the IDs for
    F1 and F2 come from entries 0 and 1 of ``Fight[2]['FightStats']``.
    A KeyError is raised if a FightStats ID has no entry in the Fighters list.
    """
    fight_sections = jsn['FMLiveFeed']['FightCard']['Fight']
    names_by_id = {
        fighter['FighterID']: fighter['Name']
        for fighter in fight_sections[0]['Fighters']
    }

    fight_stats = fight_sections[2]['FightStats']
    labelled_ids = [
        ('F1', fight_stats[0]['FighterID']),
        ('F2', fight_stats[1]['FighterID']),
    ]

    # Add FighterIDs first, then Names, keeping the original key order.
    for label, fighter_id in labelled_ids:
        clean_dict[f"{label}_FighterID"] = fighter_id
    for label, fighter_id in labelled_ids:
        clean_dict[f"{label}_Name"] = names_by_id[fighter_id]

    return clean_dict

In [256]:
def flatten_json(y):
    """Flatten nested dicts/lists into a single dict with '_'-joined keys.

    Dict keys and list indices along the path are joined with underscores,
    e.g. ``{'a': [{'b': 1}]}`` becomes ``{'a_0_b': 1}``. Empty dicts and lists
    contribute nothing; a bare scalar is stored under the empty-string key.
    """
    flattened = {}

    def _walk(node, prefix):
        kind = type(node)
        if kind is dict:
            children = node.items()
        elif kind is list:
            children = ((str(position), item) for position, item in enumerate(node))
        else:
            # Leaf value: drop the trailing underscore from the accumulated path.
            flattened[prefix[:-1]] = node
            return
        for label, child in children:
            _walk(child, prefix + label + '_')

    _walk(y, '')
    return flattened

In [257]:
def check_nulls(df):
    """Return the null count per column, only for columns with at least one null."""
    null_mask = df.isnull()
    has_nulls = null_mask.sum() != 0
    return null_mask.loc[:, has_nulls].sum()

In [258]:
def try_int(value):
    """Return ``int(value)`` when the conversion works, otherwise ``value`` unchanged.

    Only conversion failures are absorbed (ValueError for text such as '' or
    '3.5', TypeError for None and other unsupported types, OverflowError for
    infinite floats). Anything else, including KeyboardInterrupt, propagates.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value

## Call my Functions and Test the Code

In [259]:
# Root folder of the scraped V2 fight JSON files (relative to this notebook).
rootdir = '../../02_Data/01_Raw_Scraped_Data/V2_Jsons/'

In [260]:
# Process every file under rootdir; prints a progress line every 100 files.
list_V2_fight_dicts = Process_All_V2_Fight_Details(rootdir)

100: fight_5967.json
200: fight_5762.json
300: fight_6688.json
400: fight_4490.json
500: fight_4996.json
600: fight_5109.json
700: fight_4537.json
800: fight_4764.json
900: fight_7021.json
1000: fight_6100.json
1100: fight_6187.json
1200: fight_5367.json
1300: fight_5548.json
1400: fight_5614.json
1500: fight_6634.json
1600: fight_6930.json
1700: fight_4995.json
1800: fight_4391.json
1900: fight_6399.json
2000: fight_7153.json
2100: fight_5030.json
2200: fight_5289.json
2300: fight_5428.json


In [263]:
# One row per fight dictionary; the dictionary keys become the columns.
df = pd.DataFrame(list_V2_fight_dicts)

In [264]:
df.shape

(2387, 140)

In [265]:
check_nulls(df)

currentrndtime    969
dtype: int64

In [230]:
df.dtypes

accolade                                  object
currentrnd                                object
currentrndtime                            object
eventid                                   object
f1_body_significant_strikes_attempts      object
f1_body_significant_strikes_landed        object
f1_body_total_strikes_attempts            object
f1_body_total_strikes_landed              object
f1_clinch_body_strikes_attempts           object
f1_clinch_body_strikes_landed             object
f1_clinch_head_strikes_attempts           object
f1_clinch_head_strikes_landed             object
f1_clinch_leg_strikes_attempts            object
f1_clinch_leg_strikes_landed              object
f1_clinch_significant_strikes_attempts    object
f1_clinch_significant_strikes_landed      object
f1_clinch_total_strikes_attempts          object
f1_clinch_total_strikes_landed            object
f1_distance_body_strikes_attempts         object
f1_distance_body_strikes_landed           object
f1_distance_head_str

In [235]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
accolade,,UFC Lightweight Title,,,,,,,,UFC Women's Strawweight Title,...,,,,,,,,,,
currentrnd,1,5,3,2,3,2,3,2,2,2,...,3,1,3,3,3,3,3,1,3,2
currentrndtime,,,,,,,,,,,...,05:00,02:47,05:00,05:00,05:00,05:00,05:00,02:18,05:00,01:50
eventid,704,704,704,704,704,704,704,704,704,704,...,825,825,825,825,825,825,825,825,825,825
f1_body_significant_strikes_attempts,1,37,33,16,9,10,29,4,13,4,...,16,11,7,22,8,26,4,3,20,1
f1_body_significant_strikes_landed,0,27,24,8,7,6,21,3,11,3,...,9,9,6,13,6,21,3,2,13,1
f1_body_total_strikes_attempts,1,56,33,16,12,10,44,4,14,4,...,17,11,12,22,15,27,10,3,20,1
f1_body_total_strikes_landed,0,45,24,8,10,6,36,3,12,3,...,10,9,11,13,12,22,9,2,13,1
f1_clinch_body_strikes_attempts,0,5,2,1,5,0,14,0,1,2,...,1,7,3,7,2,3,0,1,2,1
f1_clinch_body_strikes_landed,0,5,2,1,4,0,10,0,1,1,...,1,6,3,6,2,3,0,1,2,1


In [236]:
df.f1_grappling_reversals_landed.astype('int64')

ValueError: invalid literal for int() with base 10: ''

In [268]:
df.shape

(2387, 140)

In [266]:
# Treat empty strings as missing values. Rebinding instead of inplace=True
# keeps this cell safe to re-run and avoids mutating the frame behind the
# back of cells that already displayed it.
df = df.replace('', np.nan)

In [267]:
check_nulls(df)

accolade                            2269
currentrndtime                       969
f1_grappling_reversals_landed        852
f1_tip_back_control_time             211
f1_tip_clinch_time                   211
f1_tip_control_time                  211
f1_tip_distance_time                 211
f1_tip_ground_control_time           211
f1_tip_ground_time                   211
f1_tip_guard_control_time            211
f1_tip_half_guard_control_time       211
f1_tip_misc__ground_control_time     211
f1_tip_mount_control_time            211
f1_tip_neutral_time                  211
f1_tip_side_control_time             211
f1_tip_standing_time                 211
f2_grappling_reversals_landed        852
f2_tip_back_control_time             211
f2_tip_clinch_time                   211
f2_tip_control_time                  211
f2_tip_distance_time                 211
f2_tip_ground_control_time           211
f2_tip_ground_time                   211
f2_tip_guard_control_time            211
f2_tip_half_guar

In [32]:
df[df['F2_TIP_Misc._Ground_Control_Time'].notnull()]

Unnamed: 0,Accolade,CurrentRnd,CurrentRndTime,EventID,FightID,MaxRnds,Referee,Status,Timestamp,WeightClass,...,F2_TIP_Ground_Time,F2_TIP_Guard_Control_Time,F2_TIP_Half_Guard_Control_Time,F2_TIP_Misc._Ground_Control_Time,F2_TIP_Mount_Control_Time,F2_TIP_Neutral_Time,F2_TIP_Side_Control_Time,F2_TIP_Standing_Time,F2_Total_Strikes_Attempts,F2_Total_Strikes_Landed
0,,1,,704,5268,3,Herb Dean,official,02:26:26 06/28/2016,Lightweight,...,,,,,,,,,40,15
16,,1,,703,5202,3,Camila Albuquerque,official,01:42:46 11/17/2016,Women's Bantamweight,...,,,,,,,,,20,9
22,,3,,703,5181,3,Wernei Cardoso,official,05:30:02 11/30/2015,Welterweight,...,,,,,,,,,169,46
54,,3,,766,5962,3,Steve Perceval,official,08:00:55 02/13/2017,Welterweight,...,,,,,,,,,250,196
62,,2,,759,5930,3,Herb Dean,official,07:02:07 09/10/2016,Bantamweight,...,,,,,,,,,75,19
75,,2,,761,5872,3,Gary Copeland,official,04:28:42 09/19/2016,Heavyweight,...,,,,,,,,,94,38
99,,3,,768,5967,3,Chris Tognoni,official,06:28:31 04/18/2017,Lightweight,...,,,,,,,,,120,61
105,,3,,768,5959,3,John McCarthy,official,04:02:48 12/10/2016,Light Heavyweight,...,,,,,,,,,100,32
112,,2,,750,5777,3,John Hosegood,official,10:01:12 04/05/2016,Heavyweight,...,,,,,,,,,40,33
121,,1,,750,5733,3,Jorge Alonso,official,04:28:41 09/19/2016,Welterweight,...,,,,,,,,,26,12


## Setup Column Order: [Other Columns] [F1 Columns] [F2 Columns]

In [192]:
# Group the columns by whether their lowercased name contains 'f1' or 'f2';
# everything else is an event-level ("other") column.
F1_Columns = list(filter(lambda name: 'f1' in name.lower(), df.columns))
F2_Columns = list(filter(lambda name: 'f2' in name.lower(), df.columns))
Other_Columns = list(filter(lambda name: 'f2' not in name.lower() and 'f1' not in name.lower(), df.columns))

# Final layout: [Other Columns] [F1 Columns] [F2 Columns]
Ordered_Columns = [*Other_Columns, *F1_Columns, *F2_Columns]
df = df[Ordered_Columns]

## Setup subsets of columns for easy access

In [191]:
# Subsets of the F1 columns, selected by a keyword in the lowercased name.
# NOTE(review): 'F1_Knock_Down_Landed' is mixed-case, while the processing
# functions above lowercase every key -- confirm it matches a real column.
F1_Strikes = [col for col in F1_Columns if 'strikes' in col.lower()] + ['F1_Knock_Down_Landed']
F1_Grappling = [col for col in F1_Columns if 'grappling' in col.lower()]
F1_TIP = [col for col in F1_Columns if 'tip' in col.lower()]

# Identification columns = every F1 column not classified above (a set, so
# unordered). The earlier hard-coded ['F1_FighterID','F1_Name'] list was
# overwritten immediately by this line and has been dropped, as has the
# duplicated F1_TIP statement.
F1_Identification = set(F1_Columns) - set(F1_Strikes) - set(F1_Grappling) - set(F1_TIP)



## Export Data

In [54]:
# Write the processed fight details to CSV. With pandas' default index=True
# the row index is written as an unnamed first column.
df.to_csv('../../02_Data/02_Processed_Data/V2_Fight_Details.csv')

# Look at 1 JSON

In [None]:
rootdir = '../../02_Data/01_Raw_Scraped_Data/V2_Jsons/'



In [3]:
!ls ../../02_Data/01_Raw_Scraped_Data/V2_Jsons/event_738/fight_5619.json

../../02_Data/01_Raw_Scraped_Data/V2_Jsons/event_738/fight_5619.json


In [91]:
import json
import pandas as pd

In [92]:
# Load a single fight file so one parsed JSON can be inspected by hand.
filepath = '../../02_Data/01_Raw_Scraped_Data/V2_Jsons/event_738/fight_5619.json'
with open(filepath) as f:
    jsn = json.load(f)

In [105]:
final_dict = Process_One_V2_Fight_Details(jsn)

{k.lower():v for k, v in final_dict.items()}

{'accolade': '',
 'currentrnd': '3',
 'currentrndtime': '05:00',
 'eventid': '738',
 'f1_body_significant_strikes_attempts': '16',
 'f1_body_significant_strikes_landed': '13',
 'f1_body_total_strikes_attempts': '22',
 'f1_body_total_strikes_landed': '19',
 'f1_clinch_body_strikes_attempts': '4',
 'f1_clinch_body_strikes_landed': '4',
 'f1_clinch_head_strikes_attempts': '8',
 'f1_clinch_head_strikes_landed': '3',
 'f1_clinch_leg_strikes_attempts': '1',
 'f1_clinch_leg_strikes_landed': '1',
 'f1_clinch_significant_strikes_attempts': '13',
 'f1_clinch_significant_strikes_landed': '8',
 'f1_clinch_total_strikes_attempts': '21',
 'f1_clinch_total_strikes_landed': '15',
 'f1_distance_body_strikes_attempts': '12',
 'f1_distance_body_strikes_landed': '9',
 'f1_distance_head_strikes_attempts': '74',
 'f1_distance_head_strikes_landed': '26',
 'f1_distance_leg_strikes_attempts': '14',
 'f1_distance_leg_strikes_landed': '12',
 'f1_distance_strikes_attempts': '100',
 'f1_distance_strikes_landed': '

In [None]:
{k.lower(): v for k, v in alphabet.items()}

In [None]:
alphlower = {k.lower(): v for k, v in alphabet.iteritems()}

# Try to fix the First V2 Process JSON

Known problems:
- First, put in the "Other Process" function
- Rescrape the data, including the "Ground" stats
- Fix the "leg" vs. "legs" naming error
- Every key is being converted to lowercase (e.g. "Knock down" vs. "Knock Down")

In [None]:
F1_Leg_Total_Strikes_Attempts
F1_Leg_Total_Strikes_Landed

In [60]:
# Group the columns by whether their lowercased name contains 'f1' or 'f2';
# everything else is an event-level ("other") column.
F1_Columns = list(filter(lambda name: 'f1' in name.lower(), df.columns))
F2_Columns = list(filter(lambda name: 'f2' in name.lower(), df.columns))
Other_Columns = list(filter(lambda name: 'f2' not in name.lower() and 'f1' not in name.lower(), df.columns))

In [61]:
F1_Columns

['F1_Body_Significant_Strikes_Attempts',
 'F1_Body_Significant_Strikes_Landed',
 'F1_Body_Total_Strikes_Attempts',
 'F1_Body_Total_Strikes_Landed',
 'F1_Clinch_Body_Strikes_Attempts',
 'F1_Clinch_Body_Strikes_Landed',
 'F1_Clinch_Head_Strikes_Attempts',
 'F1_Clinch_Head_Strikes_Landed',
 'F1_Clinch_Leg_Strikes_Attempts',
 'F1_Clinch_Leg_Strikes_Landed',
 'F1_Clinch_Significant_Strikes_Attempts',
 'F1_Clinch_Significant_Strikes_Landed',
 'F1_Clinch_Total_Strikes_Attempts',
 'F1_Clinch_Total_Strikes_Landed',
 'F1_Distance_Body_Strikes_Attempts',
 'F1_Distance_Body_Strikes_Landed',
 'F1_Distance_Head_Strikes_Attempts',
 'F1_Distance_Head_Strikes_Landed',
 'F1_Distance_Leg_Strikes_Attempts',
 'F1_Distance_Leg_Strikes_Landed',
 'F1_Distance_Strikes_Attempts',
 'F1_Distance_Strikes_Landed',
 'F1_FighterID',
 'F1_Grappling_Reversals_Landed',
 'F1_Grappling_Standups_Landed',
 'F1_Grappling_Submissions_Attempts',
 'F1_Grappling_Takedowns_Attempts',
 'F1_Grappling_Takedowns_Landed',
 'F1_Ground_