# 02 Process V2 Json to V2_DF

### The purpose of this notebook is to process the V2 JSON files into a single V2 DataFrame
- The JSON files come in 2 different formats. I created 2 separate functions, one per format, and both write their results into the same list of dictionaries.

## Imports

In [3]:
import os
import json
import pandas as pd
import re
import numpy as np

In [4]:
# Show up to 70 rows when pandas renders a frame or series in this notebook.
pd.set_option('display.max_rows', 70)

## Functions

In [5]:
def Process_All_V2_Fight_Details(rootdir):
    """Walk ``rootdir`` and turn every file found into one fight dictionary.

    Each file is loaded with ``json.load`` and passed to
    ``Process_One_V2_Fight_Details``. When that result has 30 keys or fewer,
    the file is re-processed with ``Other_Process_One_V2_Fight_Details`` and
    that result is kept instead.

    A progress line is printed for every 100th file. After each directory the
    walk stops if 300000 or more files have been processed so far.

    Returns the list of fight dictionaries in the order the files were visited.
    """
    fight_records = []
    files_seen = 0

    for folder, _subfolders, filenames in os.walk(rootdir):
        for file in filenames:
            files_seen += 1

            with open(folder + os.sep + file) as handle:
                jsn = json.load(handle)

            record = Process_One_V2_Fight_Details(jsn)
            if len(record) <= 30:
                # Short result: fall back to the parser for the second format.
                record = Other_Process_One_V2_Fight_Details(jsn)
            fight_records.append(record)

            if files_seen % 100 == 0:
                print(f"{files_seen}: {file}")

        # Limit is only checked between directories, never mid-directory.
        if files_seen >= 300000:
            break

    return fight_records

In [7]:
def Process_One_V2_Fight_Details(jsn):
    """Flatten one fight's parsed JSON into a single dictionary with lowercase keys.

    Parameters
    ----------
    jsn : the parsed JSON content of one fight file.

    Returns
    -------
    dict
        The first row of the normalized frame. Columns whose name contains
        'punch', 'kick', 'roundstats' or 'type' (case-insensitive) are removed,
        the remaining names are cleaned by ``cleanse_column_names`` and then
        lowercased.
    """
    # Prefer the top-level pandas.json_normalize; the pandas.io.json alias has
    # been deprecated since pandas 1.0 and is not exported by recent releases.
    # Fall back to the old location only for installs that predate it.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    df = normalize(jsn)

    # Rename the event-level round columns (original intent: keep them from
    # being dropped by the word filter below). The 'Rnd' spelling is what ends
    # up in the output keys.
    df = df.rename(columns = {'FMLiveFeed.CurrentRound':'FMLiveFeed.CurrentRnd',
                              'FMLiveFeed.CurrentRoundTime': 'FMLiveFeed.CurrentRndTime',
                              'FMLiveFeed.MaxRounds':'FMLiveFeed.MaxRnds'})

    list_words_to_drop = ['punch','kick','roundstats','type']
    list_columns_to_keep = remove_columns(df.columns, list_words_to_drop)
    df = df[list_columns_to_keep]

    # Clean up column names
    df.columns = cleanse_column_names(df.columns)

    # Row label 0 of the normalized frame becomes the dictionary for this fight
    final_dict = df.T.to_dict()[0]

    # Convert all keys to lowercase
    final_dict = {k.lower():v for k, v in final_dict.items()}

    return final_dict

In [8]:
def remove_columns(list_columns, list_words_to_drop):
    """Return the columns whose lowercased name contains none of the drop words.

    The drop words are compared as given (they are not lowercased here), so
    they should already be lowercase. Input order is preserved.
    """
    return [
        column
        for column in list_columns
        if all(drop_word not in column.lower() for drop_word in list_words_to_drop)
    ]

In [9]:
def cleanse_column_names(list_columns_to_keep):
    """Apply ``cleanse_column_name`` to every name and return the results as a list."""
    return list(map(cleanse_column_name, list_columns_to_keep))

In [10]:
def cleanse_column_name(column):
    """Return a cleaned-up version of one flattened column name.

    The substitutions run in the order listed below; order matters because the
    final steps turn the '.' separators into '_' and the earlier patterns rely
    on those separators still being present.

    All patterns are raw strings with the dots escaped, so '.' only matches a
    literal period (unescaped, it would match any character and could swallow
    neighbouring letters).
    """
    substitutions = (
        (r' ', '_'),                                    # spaces -> underscores
        (r'FMLiveFeed\.', ''),                          # drop the FMLiveFeed prefix
        (r'\.Blue\.', '.F1.'),                          # Blue corner -> F1
        (r'\.Red\.', '.F2.'),                           # Red corner -> F2
        (r'FightStats\.', ''),                          # drop FightStats
        (r'\.Strikes\.', '.'),                          # drop the Strikes level
        (r'Fighters\.', ''),                            # drop the Fighters level
        (r'\.', '_'),                                   # periods -> underscores
        (r'Leg_Total_Strikes', 'Legs_Total_Strikes'),   # fix column naming error for V2a
    )
    for pattern, replacement in substitutions:
        column = re.sub(pattern, replacement, column)
    return column

In [11]:
def Other_Process_One_V2_Fight_Details(jsn):
    """Build the lowercase-keyed fight dictionary from the second JSON layout.

    Reads the stats from fixed positions: ``Fight[2]['FightStats']`` holds one
    entry per fighter (index 0 -> 'F1', index 1 -> 'F2'), and each entry's
    ``'Fighter'`` list holds Strikes, Grappling and TIP at indexes 0, 1 and 2.
    """
    fight_stats = jsn['FMLiveFeed']['FightCard']['Fight'][2]['FightStats']

    # Pull every stat list out first so a malformed file fails before any
    # parsing starts.
    per_fighter = []
    for label, stats_index in (('F1', 0), ('F2', 1)):
        sections = fight_stats[stats_index]['Fighter']
        per_fighter.append((label,
                            sections[0]['Strikes'],
                            sections[1]['Grappling'],
                            sections[2]['TIP']))

    clean_dict = {}
    for label, strikes, grappling, tip in per_fighter:
        clean_dict = parse_list_strikes_tip(clean_dict, strikes, label)
        clean_dict = parse_list_grappling(clean_dict, grappling, label)
        clean_dict = parse_list_strikes_tip(clean_dict, tip, label)

    # Event-level fields, then fighter IDs and names.
    clean_dict = get_other_keys(clean_dict, jsn)
    clean_dict = add_IDs_and_names(clean_dict, jsn)

    # Convert all keys to lowercase
    return {key.lower(): value for key, value in clean_dict.items()}

In [12]:
def parse_list_grappling(clean_dict, list_dicts, fighter_string):
    """Copy grappling stats from ``list_dicts`` into ``clean_dict`` and return it.

    Entries whose 'Name' contains 'kick' or 'punch' (case-insensitive) are
    skipped. For the rest, the keys written are
    ``{fighter_string}_Grappling_{Name}_Attempts`` and ``..._Landed``, with the
    name stripped and its spaces replaced by underscores. Values go through
    ``try_int``. ``clean_dict`` is mutated in place.
    """
    # (source field, suffix of the output key). 'Success' is stored under the
    # '_Landed' key and is applied last, so it wins when both are present.
    field_suffixes = (('Attempts', 'Attempts'), ('Landed', 'Landed'), ('Success', 'Landed'))

    for stat in list_dicts:
        lowered_name = stat['Name'].lower()
        # Ignore any dictionary for kicks and punches (since always null)
        if 'kick' in lowered_name or 'punch' in lowered_name:
            continue

        for field, suffix in field_suffixes:
            if field in stat:
                stat_name = re.sub(' ', '_', stat['Name'].strip())
                clean_dict[f"{fighter_string}_Grappling_{stat_name}_{suffix}"] = try_int(stat[field])

    return clean_dict

In [13]:
def parse_list_strikes_tip(clean_dict, list_dicts, fighter_string):
    """Copy strike or TIP stats from ``list_dicts`` into ``clean_dict`` and return it.

    Entries whose 'Name' contains 'kick' or 'punch' (case-insensitive) are
    skipped. For the rest:

    * 'Attempts' -> ``{fighter_string}_{Name}_Attempts`` (via ``try_int``)
    * 'Landed' / 'Success' -> ``{fighter_string}_{Name}_Landed`` (via
      ``try_int``; 'Success' is applied last and wins when both are present)
    * 'Time' -> ``{fighter_string}_TIP_{Name}`` with periods in the whole key
      replaced by underscores; the value is stored unchanged.

    In every key the name is stripped and its spaces replaced by underscores.
    ``clean_dict`` is mutated in place.
    """
    # (source field, suffix of the output key)
    field_suffixes = (('Attempts', 'Attempts'), ('Landed', 'Landed'), ('Success', 'Landed'))

    for stat in list_dicts:
        lowered_name = stat['Name'].lower()
        # Ignore any dictionary for kicks and punches (since always null)
        if 'kick' in lowered_name or 'punch' in lowered_name:
            continue

        for field, suffix in field_suffixes:
            if field in stat:
                stat_name = re.sub(' ', '_', stat['Name'].strip())
                clean_dict[f"{fighter_string}_{stat_name}_{suffix}"] = try_int(stat[field])

        if 'Time' in stat:
            tip_key = f"{fighter_string}_TIP_{re.sub(' ', '_', stat['Name'].strip())}"
            # Raw string: '\.' in a plain literal is an invalid escape sequence.
            tip_key = re.sub(r'\.', '_', tip_key)
            clean_dict[tip_key] = stat['Time']

    return clean_dict

In [14]:
def get_other_keys(clean_dict, jsn):
    """Copy selected event-level fields from ``jsn`` into ``clean_dict`` and return it.

    ``jsn`` is flattened with ``flatten_json``. Flattened keys containing
    'RoundStats', 'FightStats' or 'Fighters' are ignored. Any remaining key
    found in the mapping below is stored under its mapped name.
    ``clean_dict`` is mutated in place.
    """
    # Flattened source key -> name used in the clean dictionary.
    # NOTE(review): 'Missing1' looks like a placeholder for a field with no
    # known source key - confirm.
    source_to_clean_key = {'FMLiveFeed_FightCard_Accolade':'Accolade',
     'FMLiveFeed_FightCard_CurRound':'CurrentRnd',
     'Missing1':'CurrentRndTime',
     'FMLiveFeed_EventID':'EventID',
     'FMLiveFeed_FightCard_FightID':'FightID',
     'FMLiveFeed_FightCard_TotalNumOfRounds':'MaxRnds',
     'FMLiveFeed_FightCard_Referee':'Referee',
     'FMLiveFeed_FightCard_Status':'Status',
     'Timestamp':'Timestamp',
     'FMLiveFeed_FightCard_WeightClass':'WeightClass'}

    unwanted_fragments = ('RoundStats', 'FightStats', 'Fighters')

    for flat_key, value in flatten_json(jsn).items():
        # Skip the per-round / per-fighter statistics entirely.
        if any(fragment in flat_key for fragment in unwanted_fragments):
            continue
        if flat_key in source_to_clean_key:
            clean_dict[source_to_clean_key[flat_key]] = value

    return clean_dict

In [15]:
def add_IDs_and_names(clean_dict, jsn):
    """Add both fighters' IDs and names to ``clean_dict`` and return it.

    Names are looked up by 'FighterID' in ``Fight[0]['Fighters']``. The IDs
    come from ``Fight[2]['FightStats']``, where entry 0 is stored as F1 and
    entry 1 as F2. ``clean_dict`` is mutated in place.
    """
    fight_sections = jsn['FMLiveFeed']['FightCard']['Fight']

    names_by_id = {entry['FighterID']: entry['Name']
                   for entry in fight_sections[0]['Fighters']}

    stats = fight_sections[2]['FightStats']
    first_id = stats[0]['FighterID']
    second_id = stats[1]['FighterID']

    # IDs first, then names, so key order matches the original layout.
    clean_dict['F1_FighterID'] = first_id
    clean_dict['F2_FighterID'] = second_id
    clean_dict['F1_Name'] = names_by_id[first_id]
    clean_dict['F2_Name'] = names_by_id[second_id]

    return clean_dict

In [16]:
def flatten_json(y):
    """Flatten nested dicts and lists into one dict with '_'-joined keys.

    Dict keys and list positions along the path are joined with underscores,
    e.g. ``{'a': [{'b': 1}]}`` becomes ``{'a_0_b': 1}``. Only exact ``dict``
    and ``list`` instances are descended into; anything else is a leaf value.
    Empty dicts and lists contribute nothing.
    """
    flat = {}

    def walk(node, prefix):
        node_type = type(node)
        if node_type is dict:
            for key, child in node.items():
                walk(child, prefix + key + '_')
        elif node_type is list:
            for position, child in enumerate(node):
                walk(child, prefix + str(position) + '_')
        else:
            # Leaf: drop the trailing underscore added by the parent level.
            flat[prefix[:-1]] = node

    walk(y, '')
    return flat

In [17]:
def check_nulls(df):
    """Return the null count of every column in ``df`` that has at least one null."""
    has_nulls = df.isnull().sum() != 0
    columns_with_nulls = df.loc[:, has_nulls]
    return columns_with_nulls.isnull().sum()

In [18]:
def try_int(value):
    """Return ``int(value)`` when the conversion succeeds, otherwise ``value`` unchanged.

    Only conversion failures are absorbed (``TypeError``, ``ValueError``,
    ``OverflowError``). Other exceptions, including ``KeyboardInterrupt``,
    propagate instead of being silently swallowed.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value

## Call my Functions and Test the Code

In [19]:
# Directory that is walked recursively for the raw V2 fight JSON files
# (relative to the notebook's working directory).
rootdir = '../../02_Data/01_Raw_Scraped_Data/V2_Jsons/'

In [20]:
# Parse every file under rootdir into one dictionary per fight; a progress
# line is printed for every 100th file.
list_V2_fight_dicts = Process_All_V2_Fight_Details(rootdir)

100: fight_5967.json
200: fight_5762.json
300: fight_6688.json
400: fight_4490.json
500: fight_4996.json
600: fight_5109.json
700: fight_4537.json
800: fight_4764.json
900: fight_7021.json
1000: fight_6100.json
1100: fight_6187.json
1200: fight_5367.json
1300: fight_5548.json
1400: fight_5614.json
1500: fight_6634.json
1600: fight_6930.json
1700: fight_4995.json
1800: fight_4391.json
1900: fight_6399.json
2000: fight_7153.json
2100: fight_5030.json
2200: fight_5289.json
2300: fight_5428.json


## Convert list of dicts to a dataframe and replace '' with nan

In [71]:
# One row per fight; empty strings are treated as missing values.
df = pd.DataFrame(list_V2_fight_dicts).replace('', np.nan)

## Setup Column Order: [Other Columns] [F1 Columns] [F2 Columns]

In [72]:
# Split the columns by fighter marker (matched anywhere in the lowercased name).
F1_Columns = [name for name in df.columns if 'f1' in name.lower()]
F2_Columns = [name for name in df.columns if 'f2' in name.lower()]
Other_Columns = [name for name in df.columns
                 if not ('f1' in name.lower() or 'f2' in name.lower())]

# Event-level columns first, then fighter 1, then fighter 2.
Ordered_Columns = Other_Columns + F1_Columns + F2_Columns
df = df[Ordered_Columns]

## Setup subsets of columns for easy access

In [90]:
# Strike columns, plus the knock-down column whose name has no 'strikes' in it.
F1_Strikes = [name for name in F1_Columns if 'strikes' in name.lower()]
F1_Strikes.append('f1_knock_down_landed')

F1_Grappling = [name for name in F1_Columns if 'grappling' in name.lower()]
F1_TIP = [name for name in F1_Columns if 'tip' in name.lower()]

# Whatever is left over (a set, so unordered) is treated as identification columns.
F1_Identification = set(F1_Columns).difference(F1_Strikes, F1_Grappling, F1_TIP)

## Convert fighter stat columns to float

In [68]:
# Every per-fighter column is a candidate for the float cast, except the two
# fighter ID columns, which are left as they are.
columns_to_convert_float = [*F1_Columns, *F2_Columns]
columns_to_convert_float.remove('f1_fighterid')
columns_to_convert_float.remove('f2_fighterid')

In [74]:
# Best-effort cast of the per-fighter columns to float. Columns holding values
# that cannot be parsed as numbers (e.g. the fighter name columns) are left
# unchanged on purpose.
for col in columns_to_convert_float:
    try:
        df[col] = df[col].astype('float')
    except (ValueError, TypeError):
        # Only conversion failures are skipped; anything else (including a
        # keyboard interrupt) is allowed to surface.
        continue

## Drop all columns that contain any nulls for now

In [None]:
# Keep only the columns that have a value for every fight.
df = df.dropna(axis='columns', how='any')

In [83]:
# (rows, columns) of the current frame.
df.shape

(2387, 107)

In [93]:
# List the current column names, one per line.
for col in df.columns:
    print(col)

currentrnd
eventid
fightid
maxrnds
timestamp
f1_body_significant_strikes_attempts
f1_body_significant_strikes_landed
f1_body_total_strikes_attempts
f1_body_total_strikes_landed
f1_clinch_body_strikes_attempts
f1_clinch_body_strikes_landed
f1_clinch_head_strikes_attempts
f1_clinch_head_strikes_landed
f1_clinch_leg_strikes_attempts
f1_clinch_leg_strikes_landed
f1_clinch_significant_strikes_attempts
f1_clinch_significant_strikes_landed
f1_clinch_total_strikes_attempts
f1_clinch_total_strikes_landed
f1_distance_body_strikes_attempts
f1_distance_body_strikes_landed
f1_distance_head_strikes_attempts
f1_distance_head_strikes_landed
f1_distance_leg_strikes_attempts
f1_distance_leg_strikes_landed
f1_distance_strikes_attempts
f1_distance_strikes_landed
f1_fighterid
f1_grappling_standups_landed
f1_grappling_submissions_attempts
f1_grappling_takedowns_attempts
f1_grappling_takedowns_landed
f1_ground_body_strikes_attempts
f1_ground_body_strikes_landed
f1_ground_head_strikes_attempts
f1_ground_head_

In [81]:
# Sanity check: columns that still contain nulls, with their null counts.
check_nulls(df)

Series([], dtype: float64)

## Export Data

In [84]:
# Write the processed fight table to CSV.
# NOTE(review): no index= argument is passed, so pandas' default applies -
# presumably the row index is written as an extra first column; confirm that
# whatever reads this file expects it.
df.to_csv('../../02_Data/02_Processed_Data/V2_Fight_Details.csv')