# parse_logfile

This dict is the result of serialising (then cleaning) a logfile which is dumped when the assertion on line ~700 fails:
```
assert target.name not in self.converter._function_definitions, target.name
```

Look at this dict and parse how the three relevant objects are filled (really, two are objects and one is a call history).

In [None]:
import json


from typed_python import SerializationContext

In [None]:
SERIALIZED_INPUT = '/home/wgrant/Dev/typed_python/demos/python_s_slow_u_slice_u_3/logfile_20221019.bytes'
# SERIALIZED_INPUT = '/home/wgrant/Dev/typed_python/demos/logfile.bytes'

In [None]:
# Read the dumped logfile as raw bytes and deserialize it with a default
# typed_python SerializationContext; the result is the dict analysed below.
with open(SERIALIZED_INPUT, 'rb') as flines:
    input_data = SerializationContext().deserialize(flines.read())

In [None]:
# PATH_TO_INPUT = '/home/wgrant/Dev/typed_python/demos/failure_dict.json'
# with open(PATH_TO_INPUT) as flines:
#     input_data = json.load(flines)

In [None]:
for key, value in input_data.items():
    print(key, len(value))

In [None]:
import pandas as pd 

In [None]:
pd.set_option('display.max_rows', 2000)

## History

In [None]:
def parse_dict_to_dataframe(input_dict: dict) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Flatten the logged call history into a time-ordered dataframe of names.

    Args:
        input_dict: maps each history key (e.g. ``namedCallTargetToLLVM``) to a
            list of entries. Each entry is unpacked as
            ``(timestamp, thread, data)``. An entry that cannot be unpacked at
            all (a bare scalar) is kept as a timestamp with no thread and no
            data -- NOTE(review): confirm this is the intended meaning of such
            entries in the logfile.

    Returns:
        ``(parsed_df, input_df)``. ``input_df`` holds one row per history entry
        with columns ``timestamp``, ``thread``, ``key`` and the raw ``data``,
        sorted by timestamp. ``parsed_df`` replaces ``data`` with a ``name``
        column (plus ``namedCallTargetToLLVM.external``) and has one row per
        name after list-valued ``data`` has been exploded.
    """
    input_data_squared = []
    for key, value in input_dict.items():
        for x in value:
            try:
                timestamp, thread, data = x
            except TypeError:
                # x is not iterable: treat it as a lone timestamp. The previous
                # fallback unpacked two values into three names and so always
                # raised ValueError.
                timestamp, thread, data = x, None, None
            input_data_squared.append((pd.to_datetime(timestamp, unit='s'), thread, key, data))
    input_df = (
        pd.DataFrame(input_data_squared, columns=['timestamp', 'thread', 'key', 'data'])
        .sort_values(by='timestamp')
        .reset_index(drop=True)
    )

    # extract the callTarget data: expand each dict into columns and put the
    # result back on the index of the rows it came from so the merge lines up
    call_target_rows = input_df.query('key=="namedCallTargetToLLVM"')
    callTarget_data = pd.json_normalize(call_target_rows.data).set_index(call_target_rows.index)
    # columns are renamed positionally -- assumes the dicts yield exactly two
    # fields, the name first; TODO confirm against the logger
    callTarget_data.columns = ['name', 'namedCallTargetToLLVM.external']
    parsed_df = pd.merge(input_df, callTarget_data, how='left', left_index=True, right_index=True)
    # blank the dicts so the explode below does not expand them
    parsed_df.loc[parsed_df.key=="namedCallTargetToLLVM", 'data'] = None
    # set the name field as data for function_definitions
    parsed_df.loc[parsed_df.key=="function_definitions", 'name'] = parsed_df.loc[parsed_df.key=="function_definitions", :].data
    parsed_df.loc[parsed_df.key=="external_function_references", 'name'] = parsed_df.loc[parsed_df.key=="external_function_references", :].data
    # blow up the externallyDefinedFunctionTypes
    parsed_df = parsed_df.explode('data')
    parsed_df.loc[parsed_df.key=="externallyDefinedFunctionTypes", 'name'] = parsed_df.loc[parsed_df.key=="externallyDefinedFunctionTypes", :].data
    # grab entryPoint names (taken from input_df, whose data column has not been exploded)
    parsed_df.loc[parsed_df.key=="entryPoint", 'name'] = input_df.loc[input_df.key=='entryPoint', 'data'].apply(lambda x: x['name'])

    parsed_df = parsed_df.drop(columns=['data']).reset_index(drop=True)
    return parsed_df, input_df

In [None]:
df, input_df = parse_dict_to_dataframe(input_data['history'])

In [None]:
import numpy as np

In [None]:
def bold(row):
    """Row-wise styling callback for ``Styler.apply(..., axis=1)``.

    Returns a list with one entry per cell of ``row``: the ``#1b9e77``
    background when the row's ``name`` is non-null and contains ``decref_str``,
    the ``#d95f02`` background when ``name`` equals ``CLEAR``, and ``None``
    otherwise.
    """
    name = row['name']
    if pd.notnull(name) and 'decref_str' in name:
        css = 'background-color: #1b9e77'
    elif name == 'CLEAR':
        css = 'background-color: #d95f02'
    else:
        css = None
    # the same declaration is repeated for every cell so the whole row is coloured
    return [css] * len(row)

In [None]:
df.style.apply(bold, axis=1)

In [None]:
df[df.name.notnull() & df.name.str.contains('decref_str')]

In [None]:
repeated_names = df[df.key.isin(['function_definitions', 'externallyDefinedFunctionTypes'])].name.value_counts()

In [None]:
repeated_names = repeated_names[repeated_names > 1].index

In [None]:
repeated_names

In [None]:
df[df.name.isin(repeated_names)]

## End State

In [None]:
def parse_dict_to_end_state_dataframe(input_dict):
    """Build a dataframe with one row per name recorded in ``input_dict['end_state']``.

    Every value except the one under ``namedCallTargetToLLVM`` (which is
    skipped) is unpacked as ``(timestamp, thread, function_names)``. The result
    has columns ``timestamp``, ``thread``, ``key`` and ``name`` and is sorted by
    ``key`` then ``name``; the index is not reset.
    """
    skipped_key = 'namedCallTargetToLLVM'
    records = []
    for key, value in input_dict['end_state'].items():
        if key != skipped_key:
            timestamp, thread, function_names = value
            records.extend(
                (pd.to_datetime(timestamp, unit='s'), thread, key, name)
                for name in function_names
            )
    frame = pd.DataFrame(records, columns=['timestamp', 'thread', 'key', 'name'])
    return frame.sort_values(by=['key', 'name'])

In [None]:
parse_dict_to_end_state_dataframe(input_data).style.apply(bold,axis=1) 

In [None]:
input_data['end_state']['namedCallTargetToLLVM']

## Compiler Stuff

In [None]:
for key, value in input_data.items():
    print(key, len(value))

In [None]:
def parse_dict_to_compiler_cache_dataframe(input_dict):
    """Flatten ``input_dict['compiler_cache']`` into a timestamp-sorted dataframe.

    Each key maps to an iterable of ``(timestamp, thread, function_name)``
    entries; every entry becomes one row with columns ``timestamp``, ``thread``,
    ``key`` and ``name``. The index is not reset after sorting.
    """
    records = [
        (pd.to_datetime(timestamp, unit='s'), thread, key, function_name)
        for key, entries in input_dict['compiler_cache'].items()
        for timestamp, thread, function_name in entries
    ]
    frame = pd.DataFrame(records, columns=['timestamp', 'thread', 'key', 'name'])
    return frame.sort_values(by='timestamp')

In [None]:
compiler_df =parse_dict_to_compiler_cache_dataframe(input_data)

In [None]:
compiler_df[compiler_df.name.str.contains('decref_str')]

In [None]:
compiler_df.style.apply(bold,axis=1)

## Bug Test


In [None]:
def parse_dict_to_bug_test_dataframe(input_dict):
    """Flatten ``input_dict['bug_test']`` into a dataframe with one row per name.

    Each entry is a ``(timestamp, thread, mapping)`` triple whose mapping goes
    from a key to an iterable of function names. The result has columns
    ``timestamp``, ``thread``, ``key`` and ``name``, is sorted by timestamp then
    key, and carries a fresh 0..n-1 index.
    """
    records = [
        (pd.to_datetime(timestamp, unit='s'), thread, key, function_name)
        for timestamp, thread, names_by_key in input_dict['bug_test']
        for key, function_names in names_by_key.items()
        for function_name in function_names
    ]
    frame = pd.DataFrame(records, columns=['timestamp', 'thread', 'key', 'name'])
    ordered = frame.sort_values(by=['timestamp', 'key'])
    return ordered.reset_index(drop=True)

In [None]:
bug_test_df = parse_dict_to_bug_test_dataframe(input_data)

In [None]:

bug_test_df[bug_test_df.name.str.contains('decref_str')]

In [None]:
bug_test_df.style.apply(bold, axis=1)

In [None]:
# For every markExternal event, collect the earlier definedNames rows for the
# same name, followed by the markExternal row itself.
rows = []
for index, row in bug_test_df.query('key=="markExternal"').iterrows():
    name = row['name']
    # bug_test_df carries a fresh 0..n-1 index, so the label doubles as a position
    preceding_rows = bug_test_df.iloc[:index].query('name==@name and key=="definedNames"')
    if not preceding_rows.empty:
        # DataFrame.append was removed in pandas 2.0. Concatenating the one-row
        # frame for this label gives the same rows and keeps the column dtypes.
        rows.append(pd.concat([preceding_rows, bug_test_df.loc[[index]]]))

In [None]:
defined_then_loaded = pd.concat(rows)

In [None]:
defined_then_loaded

In [None]:
defined_then_loaded_names = defined_then_loaded['name'].unique() 

In [None]:
print("\n".join(sorted(defined_then_loaded['name'].unique())))

In [None]:
print("\n".join(sorted(repeated_names)))

In [None]:
defined_then_loaded

In [None]:
# NOTE: `merged` is only assigned in the "Combo" section further down; run that
# cell before this one.
merged[merged.name.isin(defined_then_loaded['name'].unique())].style

In [None]:
repeated_names

In [None]:
bug_test_df

## Combo

In [None]:
history_df = df.drop(columns='namedCallTargetToLLVM.external')

In [None]:
history_df.shape

In [None]:
compiler_df.shape

In [None]:
merged = pd.concat([history_df, compiler_df]).sort_values(by=['timestamp', 'key', 'name'])

In [None]:
# History rows may have no name; na=False counts those as non-matches instead
# of producing an NA mask that boolean indexing rejects.
merged[merged.name.str.contains('decref_str', na=False)]

In [None]:
df[df['namedCallTargetToLLVM.external'] == False]

In [None]:
# check end_state to debug how on earth NCT2LLVM got called thrice

In [None]:
df.iloc[-100:]

In [None]:
end_state_df = parse_dict_to_end_state_dataframe(input_data)

In [None]:
end_state_df[end_state_df.name.str.contains('checkSetSizeAndThrow')]

In [None]:
end_state_df.query("key=='function_definitions'")

So what happened:
- defineLinkName was called with RDS, which was added to `_allDefinedNames_`, and then loadFromCompilerCache was hit. The symbol was found in the cache, but either callTargetsAndTypes was None or newNativeFunctionTypes was empty.
- Then RDS was added to function_definitions.
- NCT2LLVM was called a bunch of times.
- Then RDS was loaded again, as a result of loadFromCompilerCache running for a different function (**which function?**). RDS ends up in EDFTs, ADS, ACS.
- Then there are three more NCT2LLVM hits, and only the third breaks?

In [None]:
merged.loc[1048]

In [None]:
merged.index.get_loc(1048)

In [None]:
merged[600:792]

In [None]:
merged[791:1000]