In [None]:
from random import randrange,randint
import pandas as pd

In [None]:
def gen_axiom(maxlen=3):
    """Build a random starting string over the MIU alphabet.

    Args:
        maxlen: Number of symbols to draw. Despite the name, the result
            always has exactly this many characters.

    Returns:
        A string of ``maxlen`` characters, each picked independently with
        ``randrange`` from ``'M'``, ``'I'`` and ``'U'``.
    """
    alphabet = 'MIU'
    symbols = []
    for _ in range(maxlen):
        # One uniform draw per position over the three symbols.
        symbols.append(alphabet[randrange(0, len(alphabet))])
    return ''.join(symbols)

In [None]:
def rule1(s):
    """Append 'U' to a string that ends in 'I'.

    Args:
        s: The current string.

    Returns:
        ``s + 'U'`` when ``s`` ends with ``'I'``; otherwise ``s`` unchanged.
        An empty string is returned unchanged.
    """
    # endswith() is safe on '' where indexing s[-1] would raise IndexError.
    if s.endswith('I'):
        return s + 'U'
    return s

def rule2(s):
    """Double everything after a leading 'M' (``Mx`` -> ``Mxx``).

    Args:
        s: The current string.

    Returns:
        ``'M' + x + x`` when ``s`` is ``'M' + x``; otherwise ``s`` unchanged.
        An empty string is returned unchanged.
    """
    # startswith() is safe on '' where indexing s[0] would raise IndexError.
    if s.startswith('M'):
        tail = s[1:]
        return 'M' + tail + tail
    return s

def rule3(s):
    """Replace every non-overlapping "III" in the string with "U".

    Args:
        s: The current string.

    Returns:
        The string with all "III" runs (scanned left to right) replaced by
        "U"; equal to ``s`` when it contains no "III".
    """
    # str.replace already yields the input text when nothing matches,
    # so no separate membership check is needed.
    return s.replace("III","U")
    
def rule4(s):
    """Delete every non-overlapping "UU" from the string.

    Args:
        s: The current string.

    Returns:
        The string with all "UU" pairs (scanned left to right) removed;
        equal to ``s`` when it contains no "UU". The result may be empty.
    """
    # A no-match replace hands back the same text, so the guard is implicit.
    return s.replace("UU","")

In [None]:
def gen_derivation(axiom,derivations=10,rules=None):
    """Apply randomly chosen rewrite rules to a string, recording each result.

    Args:
        axiom: Starting string. It is not itself included in the output.
        derivations: Maximum number of rule applications.
        rules: Sequence of one-argument callables mapping a string to a
            string. Defaults to ``(rule1, rule2, rule3, rule4)``. A rule is
            drawn uniformly from the whole sequence at every step, whatever
            its length.

    Returns:
        A tuple ``(theorem, n)`` where ``theorem`` is the list of strings
        obtained after each step and ``n == len(theorem)``. If a rule
        produces the empty string, the string from before that step is
        appended once more and the derivation stops early.

    Raises:
        ValueError: If ``rules`` is empty and at least one step is requested
            (raised by ``randrange``).
    """
    if rules is None:
        # Resolved at call time so no mutable list is shared between calls.
        rules = (rule1, rule2, rule3, rule4)

    theorem = []
    current = axiom
    for _ in range(derivations):
        # Draw over the actual number of rules rather than a fixed 4.
        rule = rules[randrange(len(rules))]
        derived = rule(current)
        if derived == '':
            # Nothing left to rewrite: repeat the last non-empty string and stop.
            theorem.append(current)
            break
        current = derived
        theorem.append(current)
    return theorem,len(theorem)

In [None]:
def get_first_cliff_and_drop(diff_lens,diffs):
    """Collect every point where a decline in length is followed by a non-decline.

    A position ``ix`` is recorded when ``diffs[ix] < 0`` and
    ``diffs[ix + 1] >= 0``. A negative final entry of ``diffs`` has no
    following step to compare against and is therefore not recorded
    (previously this case raised ``IndexError``).

    Args:
        diff_lens: Sequence of lengths, one per derivation step.
        diffs: Consecutive differences of ``diff_lens``; the visible caller
            builds it so that ``diffs[k] == diff_lens[k + 1] - diff_lens[k]``.

    Returns:
        A dict of equal-length lists, one entry per recorded position:
            ``drop_ixs``: ``ix + 1``, the index in ``diff_lens`` after the drop.
            ``cliffs``: ``diff_lens[ix]``, the length before the drop.
            ``drops``: ``diff_lens[ix + 1]``, the length after the drop.
            ``cliff_drop_diff``: ``cliffs - drops``.
            ``pct_cliff_drop_diff``: ``cliff_drop_diff / cliffs``.
            ``overall_change``: ``sum(diffs)``, repeated for every entry.
    """
    cliff_drops = {
        'drop_ixs':[],
        'cliffs':[],
        'drops':[],
        'cliff_drop_diff':[],
        'pct_cliff_drop_diff':[],
        'overall_change':[]
    }

    # Same value for every recorded entry, so compute it once.
    overall_change = sum(diffs)

    # Pair each diff with its successor; the last diff has none and is
    # never inspected as a candidate, which avoids indexing past the end.
    for ix,(step,next_step) in enumerate(zip(diffs,diffs[1:])):
        if step >= 0 or next_step < 0:
            continue

        cliff = diff_lens[ix]
        drop = diff_lens[ix+1]
        cliff_drop_diff = cliff - drop

        cliff_drops['overall_change'].append(overall_change)
        cliff_drops['drop_ixs'].append(ix+1)
        cliff_drops['cliffs'].append(cliff)
        cliff_drops['drops'].append(drop)
        cliff_drops['cliff_drop_diff'].append(cliff_drop_diff)
        cliff_drops['pct_cliff_drop_diff'].append(cliff_drop_diff/cliff)
    return cliff_drops

In [None]:
def create_random_axiom_dataset(nsamples=20,max_axiom_size=3,max_derivations=20):
    """Generate random derivations and tabulate their cliff/drop statistics.

    For each sample a random axiom of length ``randint(2, max_axiom_size)``
    is generated, derived for up to ``max_derivations`` steps, and the step
    lengths are passed to ``get_first_cliff_and_drop``. Samples that yield
    no cliff/drop entries contribute no rows.

    Args:
        nsamples: Number of random axioms to try.
        max_axiom_size: Largest axiom length to draw; must be at least 2.
        max_derivations: Maximum number of rule applications per axiom.

    Returns:
        A DataFrame with one row per recorded drop, holding the columns
        returned by ``get_first_cliff_and_drop`` plus ``axiom``,
        ``axiom_len``, ``sample_id`` (the sample number as a string) and
        ``derivation_length``. Each sample keeps its own 0-based index, so
        index values repeat across samples. An empty DataFrame is returned
        when no sample produced any rows.

    Raises:
        ValueError: If ``max_axiom_size`` is smaller than 2.
    """
    if max_axiom_size < 2:
        raise ValueError(
            "max_axiom_size must be >= 2, got {!r}".format(max_axiom_size)
        )

    sample_data = []
    for sample_ix in range(nsamples):
        axiom = gen_axiom(maxlen=randint(2,max_axiom_size))
        derivations,derivation_len = gen_derivation(axiom,max_derivations)
        diff_lens = [len(step) for step in derivations]
        # diffs[k] is the change in length from step k to step k + 1.
        diffs = [after - before for before,after in zip(diff_lens,diff_lens[1:])]

        try:
            cliff_drops = get_first_cliff_and_drop(diff_lens,diffs)
        except IndexError:
            # The helper may look one step past the end of ``diffs`` (e.g. on
            # a trailing decline). Skip that sample instead of aborting the
            # whole dataset; any other error is allowed to surface.
            continue

        metadata = pd.DataFrame(cliff_drops)
        if metadata.empty:
            # No drops in this derivation: nothing to report. Skipping here
            # also avoids assigning scalar columns to a frame with no rows.
            continue

        sample_data.append(
            metadata.assign(
                axiom=axiom,
                axiom_len=len(axiom),
                sample_id=str(sample_ix),
                derivation_length=derivation_len,
            )
        )

    if not sample_data:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    return pd.concat(sample_data)

In [None]:
# Build the dataset: 100 random axioms (length 2 to 3), each derived for up
# to 50 rule applications. No random seed is set, so results vary per run.
random_data = create_random_axiom_dataset(
    nsamples=100,
    max_axiom_size=3,
    max_derivations=50,
)