In [1]:
import json
import numpy as np
import pandas as pd
import pickle

# The input is read as JSON Lines: every non-blank line is parsed as one
# independent JSON document and collected into `all_data`.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('./data/python100.json', encoding='utf-8') as data_file:
    # Iterate the handle lazily (no intermediate readlines() list) and skip
    # blank lines, which json.loads would reject with a decode error.
    all_data = [json.loads(line) for line in data_file if line.strip()]

In [2]:
# Sanity check: how many records were parsed from the input file.
len(all_data)

100

In [3]:
def find_function_children(data):
    """Map every function definition in one program to its descendant nodes.

    Parameters
    ----------
    data : list of dict
        Node records of a single program. Each record has a ``'type'`` and
        may have a ``'children'`` list holding indices into ``data``.

    Returns
    -------
    dict
        ``{function_index: (function_node, descendant_indices)}`` where
        ``function_index`` is the position of a ``'FunctionDef'`` record in
        ``data`` and ``descendant_indices`` lists its descendants level by
        level (breadth-first), direct children first.
    """
    function_nodes = [
        (position, node)
        for position, node in enumerate(data)
        if node['type'] == 'FunctionDef'
    ]

    descendants_by_function = {}
    for position, func_node in function_nodes:
        descendants = []
        frontier = func_node['children']
        while frontier:
            # Record the whole current level before descending further.
            descendants.extend(frontier)
            next_frontier = []
            for parent_idx in frontier:
                for child_idx in data[parent_idx].get('children', []):
                    # Falsy entries (such as index 0) are filtered out of
                    # the next level.
                    if child_idx:
                        next_frontier.append(child_idx)
            frontier = next_frontier
        descendants_by_function[position] = (func_node, descendants)

    return descendants_by_function

In [4]:
def check_child(type_name, data):
    """Return the children of ``data`` when it is a node of type ``type_name``.

    Parameters
    ----------
    type_name : str
        Node type to compare against ``data['type']``.
    data : dict
        A single node record.

    Returns
    -------
    list or None
        The node's non-empty ``'children'`` list when the type matches;
        ``None`` when the type differs or the node has no children.
    """
    if data['type'] != type_name:
        return None
    # A record without children may omit the 'children' key altogether, so
    # read it defensively rather than indexing (which raised KeyError).
    return data.get('children') or None

In [5]:
def find_call_func(function_list, data):
    """Collect the attribute names attached to calls inside one function.

    For every listed node that is a ``'Call'`` with children, each direct
    child of type ``'AttributeLoad'`` is inspected and the ``'value'`` of
    its ``'attr'`` children is collected.

    Parameters
    ----------
    function_list : list of int
        Descendant node indices of one function (the second element of a
        value produced by ``find_function_children``).
    data : list of dict
        Node records of the program the indices refer to.

    Returns
    -------
    str
        Comma-separated names in encounter order, e.g.
        ``'create_stubs,first,AndReturn'``; empty string when none found.
    """
    called_names = []

    for node_idx in function_list:
        call_children = check_child('Call', data[node_idx])
        if not call_children:
            continue
        for child_idx in call_children:
            attr_children = check_child('AttributeLoad', data[child_idx])
            if not attr_children:
                continue
            called_names.extend(
                data[grandchild_idx]['value']
                for grandchild_idx in attr_children
                if data[grandchild_idx]['type'] == 'attr'
            )

    return ','.join(called_names)
    

In [6]:
def find_docstring(function_list, data):
    """Return the docstring of a function given its descendant node indices.

    A docstring is recognised as an ``'Expr'`` node whose first child is a
    ``'Str'`` node; the ``'value'`` of that ``'Str'`` node is returned.

    Parameters
    ----------
    function_list : list of int
        Descendant node indices of one function, expected in breadth-first
        order (as produced by ``find_function_children``).
    data : list of dict
        Node records of the program the indices refer to.

    Returns
    -------
    str
        The first matching string, or ``''`` when no match exists.
    """
    for node_idx in function_list:
        node = data[node_idx]
        if node['type'] != 'Expr':
            continue
        expr_children = node.get('children')
        if not expr_children:
            # An expression record without children cannot hold a string.
            continue
        first_child = data[expr_children[0]]
        if first_child['type'] == 'Str':
            # Stop at the first match. With breadth-first input the
            # function's own leading string comes before any string
            # expression of a nested definition; continuing the scan (as the
            # previous version did) let a deeper, later match overwrite it.
            return first_child['value']
    return ''

In [7]:
def find_function_info(data):
    """Summarise every function of one program.

    Parameters
    ----------
    data : list of dict
        Node records of a single program.

    Returns
    -------
    list of tuple
        One ``(function_name, docstring, functions_call)`` tuple per
        function that has a docstring or at least one collected call name.
        ``docstring`` is ``''`` when the function has none. Functions with
        neither are left out.
    """
    summaries = []

    for func_node, descendant_ids in find_function_children(data).values():
        docstring = find_docstring(descendant_ids, data)
        call_def = find_call_func(descendant_ids, data)

        # Keep the function only if it contributes some information.
        if docstring or call_def:
            summaries.append((func_node['value'], docstring or '', call_def))

    return summaries

In [8]:
# Flatten the per-program results into rows of
# [program index, function name, single-line docstring, call names].
function_info = []
for program_idx, program_nodes in enumerate(all_data):
    for func_name, doc, calls in find_function_info(program_nodes):
        # Collapse the docstring onto one line and drop trailing whitespace.
        one_line_doc = doc.replace("\n", " ").rstrip()
        function_info.append([program_idx, func_name, one_line_doc, calls])

In [11]:
# Serialize the collected rows to disk.
with open('./data/pickle100_list.pkl', 'wb') as pickle_file:
    pickle.dump(function_info, pickle_file)

In [12]:
#unpickled_df = pd.DataFrame(function_info, columns=['data_id', 'function_name', 'docstring', 'func_call'])

In [35]:
#unpickled_df['keep_in_codebase'] = np.where(((unpickled_df.function_name == unpickled_df.func_call)| (unpickled_df['func_call'] == '') ), 0, 1)

In [13]:
#unpickled_df[(unpickled_df.function_name == '__init__') & (unpickled_df.keep_in_codebase == 1)]

In [16]:
#unpickled_df['filter'] = np.where((unpickled_df.function_name == unpickled_df.func_call) , 0, 1)

In [14]:
#unpickled_df[unpickled_df['function_name'] == '__init__'].loc[146].func_call