In [None]:
import utilities as u
import pandas as pd
from tqdm import tqdm

In [None]:
def get_features(datum):
    """Compute simple size and count features for a single theorem.

    The theorem is tokenised once with ``u.process_theorem``. Parenthesis
    tokens (``'('`` and ``')'``) are discarded before any counting is done.

    Args:
        datum: A raw theorem, in whatever form ``u.process_theorem`` accepts.

    Returns:
        An 8-tuple, in this order:

        * ``n_nodes`` -- number of non-parenthesis tokens.
        * ``n_nodes_merged`` -- ``len`` of the result of
          ``u.merge_subexpressions`` applied to the tree built by
          ``u.thm_to_tree``.
        * ``n_distinct_features`` -- number of distinct non-parenthesis tokens.
        * ``n_funs`` -- occurrences of the token ``'fun'``.
        * ``n_lambdas`` -- occurrences of the token ``'l'``.
        * ``n_applications`` -- occurrences of the token ``'a'``.
        * ``n_vars`` -- occurrences of the token ``'v'``.
        * ``n_constants`` -- occurrences of the token ``'c'``.
    """
    processed_datum = u.process_theorem(datum)
    tokens = [tok for tok in processed_datum if tok not in ('(', ')')]

    n_nodes = len(tokens)
    n_distinct_features = len(set(tokens))
    n_funs, n_lambdas, n_applications, n_vars, n_constants = (
        tokens.count(tag) for tag in ('fun', 'l', 'a', 'v', 'c')
    )

    # Reuse the tokens computed above instead of processing the theorem a
    # second time. Every count is taken before this call, so it does not
    # matter if thm_to_tree modifies its argument.
    # NOTE(review): assumes process_theorem returns a re-iterable sequence
    # (e.g. a list), not a one-shot iterator -- TODO confirm.
    tree, _ = u.thm_to_tree(processed_datum, True)
    n_nodes_merged = len(u.merge_subexpressions(tree))

    return (n_nodes, n_nodes_merged, n_distinct_features, n_funs,
            n_lambdas, n_applications, n_vars, n_constants)

In [None]:
def make_dataset(binary):
    """Build a feature table for the theorems returned by ``u.make_data``.

    Steps:
      1. Load ``(theorem, target)`` pairs via
         ``u.make_data(binary=binary, only_top=False)``.
      2. Collapse duplicate theorems, keeping the smallest target seen.
      3. If ``binary`` is truthy, balance the classes by keeping the first
         ``k`` examples with target 0 and the first ``k`` with target 1,
         where ``k`` is the size of the smaller class.
      4. Print the number of remaining examples, then compute
         ``get_features`` for each theorem (with a tqdm progress bar).

    Args:
        binary: Forwarded to ``u.make_data``; also toggles class balancing.

    Returns:
        A ``pandas.DataFrame`` with one row per theorem and the columns
        ``nodes``, ``nodes_merged``, ``distinct_features``, ``functions``,
        ``lambdas``, ``applications``, ``variables``, ``constants`` and
        ``target``.
    """
    # Column names, in the same order as the tuple returned by get_features.
    feature_columns = ('nodes', 'nodes_merged', 'distinct_features',
                       'functions', 'lambdas', 'applications',
                       'variables', 'constants')

    # De-duplicate theorems; on a clash the smaller target wins.
    target_by_thm = {}
    for thm, y in u.make_data(binary=binary, only_top=False):
        if thm not in target_by_thm:
            target_by_thm[thm] = y
        else:
            target_by_thm[thm] = min(target_by_thm[thm], y)
    data = list(target_by_thm.items())

    if binary:
        zeros = [pair for pair in data if pair[1] == 0]
        ones = [pair for pair in data if pair[1] == 1]
        keep = min(len(zeros), len(ones))
        # Truncate both classes to the same size (first `keep` of each).
        data = zeros[:keep] + ones[:keep]

    print(len(data))

    columns = {name: [] for name in feature_columns}
    columns['target'] = []
    for thm, target in tqdm(data):
        features = get_features(thm)
        for position, name in enumerate(feature_columns):
            columns[name].append(features[position])
        columns['target'].append(target)

    return pd.DataFrame(data=columns)

In [None]:
# Build the feature table for the binary task; with binary=True make_dataset
# also truncates both target classes to the same size.
dataset = make_dataset(binary=True)

In [None]:
# Preview the first 50 rows of the feature table.
dataset.head(50)

In [None]:
# multiclass_dataset = make_dataset(binary=False)
# multiclass_dataset.head(50)

In [None]:
# Write the binary feature table to disk in CSV format, omitting the
# DataFrame index column.
dataset.to_csv('binary_baseline.dataset', index=False)
# multiclass_dataset.to_csv('multiclass_baseline.dataset', index=False)