## Solution for jsonflatten
#### Joyjit Chowdhury

In [158]:
import json
import os
from glob import glob

import pandas as pd

In [None]:
def flatten(element, keyname='', flat_json=None):
    """Recursively flatten a parsed JSON value into a single-level dict.

    Nested dict keys are joined from root to leaf with "_" (e.g.
    ``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``). Lists are not descended
    into: the whole list is stored as a JSON string under the key built so
    far. Any other value is stored unchanged.

    Parameters
    ----------
    element : dict, list or scalar
        The node currently being visited.
    keyname : str
        The "_"-joined path from the root to ``element``; empty at the root.
    flat_json : dict, optional
        Accumulator that receives the flattened entries. It is mutated in
        place. When omitted, a new empty dict is created for this call.

    Returns
    -------
    dict
        The accumulator (``flat_json`` itself when one was supplied).
    """
    # A None sentinel instead of a `{}` default: a dict default is created
    # once at definition time and would be shared by every call, leaking
    # keys from one flattened document into the next.
    if flat_json is None:
        flat_json = {}

    if isinstance(element, dict):
        for key, value in element.items():
            # At the root there is no prefix yet, so the key is used as-is
            child_key = (keyname + '_' + key) if len(keyname) > 0 else key
            flat_json = flatten(value, child_key, flat_json)
    elif isinstance(element, list):
        # Lists are kept as one column holding their JSON text
        flat_json[keyname] = json.dumps(element)
    else:
        flat_json[keyname] = element

    return flat_json

In [155]:
def make_flat_json(raw_json):
    """Flatten one parsed JSON document into a single-level dict.

    Driver for the recursive ``flatten`` function.

    Parameters
    ----------
    raw_json : dict
        The raw json, already parsed into Python objects.

    Returns
    -------
    dict
        A new dict holding the flattened entries of ``raw_json``.
    """
    # Start from the root (empty key prefix) and hand flatten a fresh
    # accumulator explicitly, so the results of separate calls never share
    # state through flatten's default argument.
    return flatten(raw_json, '', {})

In [159]:
# Main code: walk the data directory, flatten every json file found and
# collect one record per file.
# Transaction IDs are assumed to be the name of the folder containing the json file.


txn_list = []

# Search the directory tree recursively for json files. glob returns paths in
# arbitrary order, so sort them to make the row order reproducible between runs.
for jsonfile in sorted(glob('data' + '/**/*.json', recursive=True)):
    # the json file's containing folder is the transaction id
    txn_id = os.path.basename(os.path.dirname(jsonfile))
    # the transaction id is the first column of each record
    txn_record = {"transactionid" : txn_id}
    # read the json file as UTF-8 rather than the platform's locale encoding
    with open(jsonfile, "r", encoding="utf-8") as f:
        data = json.load(f)
    # flatten the document and add its columns to the record
    flat_data = make_flat_json(data)
    txn_record.update(flat_data)
    txn_list.append(txn_record)

# convert the final txn list to a dataframe for validation
df = pd.DataFrame(txn_list)
df


Unnamed: 0,transactionid,@available_sources,@http_status_code,@persons_count,@search_id,@visible_sources,available_data_basic_addresses,available_data_basic_dobs,available_data_basic_emails,available_data_basic_genders,...,person_urls,person_user_ids,person_usernames,query_addresses,query_emails,query_names,query_phones,available_data_basic_educations,available_data_premium_educations,person_gender_@inferred
0,transactionid1,119,200,1,17091205373321818383183,119,3,1,3,1,...,"[{""@category"": ""personal_profiles"", ""@domain"":...","[{""content"": ""10003@hi5""}, {""content"": ""5022@f...","[{""content"": ""ldt""}]","[{""city"": ""Brooklyn"", ""country"": ""US"", ""displa...","[{""@type"": ""personal"", ""address"": ""rog@gmail.c...","[{""display"": ""Jon Doe"", ""first"": ""John"", ""last...","[{""country_code"": 1, ""display"": ""555-555-9377""...",2,3,True
1,transactionid2,119,200,1,1709120537405176077288555043,119,3,1,3,1,...,"[{""@category"": ""personal_profiles"", ""@domain"":...","[{""content"": ""1003@hi5""}, {""content"": ""6767022...","[{""content"": ""aaaaaaadt""}]","[{""city"": ""Brooklyn"", ""country"": ""US"", ""displa...","[{""@type"": ""personal"", ""address"": ""rog@gmail.c...","[{""display"": ""Jon Doe"", ""first"": ""John"", ""last...","[{""country_code"": 1, ""display"": ""555-555-9377""...",2,3,True
2,transactionid3,59,200,1,1709120538398406071621808023086950504,59,5,1,2,1,...,"[{""@category"": ""professional_and_business"", ""@...","[{""content"": ""4a@linkedin""}, {""content"": ""#5@l...","[{""content"": ""aaaaaaadt""}]","[{""city"": ""Brooklyn"", ""country"": ""US"", ""displa...","[{""@type"": ""personal"", ""address"": ""rog@gmail.c...","[{""display"": ""Jon Doe"", ""first"": ""John"", ""last...","[{""country_code"": 1, ""display"": ""555-555-9377""...",2,3,True


In [157]:
# Write the flattened transactions to a csv file, leaving out the DataFrame index
df.to_csv(
    path_or_buf='Transactions_Flattened.csv',
    index=False,
)