The notebook runs the benchmark experiments for the paper on Atomic Patterns for CONCEPTS'25 conference.

In [1]:
import platform
import psutil

print("="*40, "System Information", "="*40)
uname = platform.uname()
print(f"System: {uname.system}")
print(f"Node Name: {uname.node}")
print(f"Release: {uname.release}")
print(f"Version: {uname.version}")
print(f"Machine: {uname.machine}")
print(f"Processor: {uname.processor}")

# let's print CPU information
print("="*40, "CPU Info", "="*40)
# number of cores
print("Physical cores:", psutil.cpu_count(logical=False))
print("Total cores:", psutil.cpu_count(logical=True))

System: Darwin
Node Name: macegor.loria.fr
Release: 24.5.0
Version: Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:25 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T6020
Machine: arm64
Processor: arm
Physical cores: 12
Total cores: 12


In [2]:
!pip install --upgrade --quiet paspailleur==0.1.0

# Prepare the data

## Download the data

In [3]:
import pandas as pd

url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv'
df = pd.read_csv(url, index_col=0)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df = df.drop(columns=['Embarked'])  # remove categorical column as CategorySetPattern is not covered in the paper
df['Sex'] = df['Sex']=='male'  # convert categorical column to numerical one
df['Fare'] = df['Fare'].astype(int)  # the only discretisation we run

Match every column to one of the build-in Patterns

## Define patterns for every column

In [5]:
from paspailleur.pattern_structures import built_in_patterns as bip

In [6]:
class SurvivedPattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Survived'].unique()))

class PclassPattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Pclass'].unique()))

class NamePattern(bip.NgramSetPattern):
    ...

class SexPattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Sex'].unique()))

class AgePattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Age'].unique()))

class SibSpPattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['SibSp'].unique()))

class ParchPattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Parch'].unique()))

class TicketPattern(bip.NgramSetPattern):
    def parse_string_description(self, value: str) -> set[tuple[str, ...]]:
        return {('<start>',)+ tuple(value) + ('<stop>',)}

    def __repr__(self):
        txt = [''.join(seq) for seq in self.value]
        pattern_verb = "{'" + "', '".join(txt) + "'}"
        return pattern_verb

class FarePattern(bip.IntervalPattern):
    BoundsUniverse = tuple(sorted(df['Fare'].unique()))

class CabinPattern(bip.NgramSetPattern):
    def parse_string_description(self, value: str) -> set[tuple[str, ...]]:
        return {('<start>',) + tuple(value) + ('<stop>',)}

    def __repr__(self):
        txt = [''.join(seq) for seq in self.value]
        pattern_verb = "{'" + "', '".join(txt) + "'}"
        return pattern_verb


class PassengerPattern(bip.CartesianPattern):
    DimensionTypes = {
        'Survived': SurvivedPattern, 'Pclass': PclassPattern, 'Name': NamePattern,
        'Sex': SexPattern, 'Age': AgePattern, 'SibSp': SibSpPattern, 'Parch': ParchPattern,
        'Ticket': TicketPattern, 'Fare': FarePattern, 'Cabin': CabinPattern,
    }

In [7]:
for f in df.columns:
    # replace the None values with minimal patterns per dimension
    df[f] = df[f].fillna(PassengerPattern.DimensionTypes[f].get_min_pattern())

## Compute statistics on the data: Table 2 in the paper

In [8]:
%%time
from paspailleur.pattern_structures.pattern_structure import PatternStructure

ps = PatternStructure(PassengerPattern)
ps.fit(df.to_dict('index'), use_tqdm=True, min_atom_support=1)

Compute atomic extents:   0%|          | 0/891 [00:00<?, ?it/s]

Compute order of patterns:   0%|          | 0/20947 [00:00<?, ?it/s]

CPU times: user 25 s, sys: 553 ms, total: 25.5 s
Wall time: 25.7 s


In [9]:
atoms_df = pd.Series([next(iter(atom.value)) for atom in ps.atomic_patterns]).value_counts()
#atoms_df = atoms_df.reset_index()
atoms_df = atoms_df.to_frame()
for f in ['Ticket', 'Name', 'Cabin']:
    atoms_df.loc[f, 'type'] = 'Sequence'
atoms_df['type'] = atoms_df['type'].fillna('Interval')

atoms_df = atoms_df[['type', 'count']]
atoms_df = atoms_df.sort_values(['type', 'count'], ascending=False)
atoms_df

Unnamed: 0,type,count
Ticket,Sequence,11601
Name,Sequence,7279
Cabin,Sequence,1281
Fare,Interval,362
Age,Interval,350
SibSp,Interval,26
Parch,Interval,26
Pclass,Interval,10
Survived,Interval,6
Sex,Interval,6


In [10]:
supmin_atoms,_ = ps._filter_atomic_patterns_by_support('minimal')
supmax_atoms, _ = ps._filter_atomic_patterns_by_support('maximal')

atoms_df['count sup.min'] = pd.Series([next(iter(atom.value)) for atom in supmin_atoms]).value_counts()
atoms_df['count sup.max'] = pd.Series([next(iter(atom.value)) for atom in supmax_atoms]).value_counts()

In [11]:
atoms_df.loc['Total'] = atoms_df[['count', 'count sup.min', 'count sup.max']].sum(0).astype(int)
for f in ['count sup.min', 'count sup.max', 'count']:
    atoms_df[f] = atoms_df[f].astype(int)
atoms_df.loc['Total', 'type'] = 'Cartesian'

In [12]:
atoms_df

Unnamed: 0,type,count,count sup.min,count sup.max
Ticket,Sequence,11601,2359,1790
Name,Sequence,7279,2517,1427
Cabin,Sequence,1281,318,276
Fare,Interval,362,182,182
Age,Interval,350,176,176
SibSp,Interval,26,14,14
Parch,Interval,26,14,14
Pclass,Interval,10,6,6
Survived,Interval,6,4,4
Sex,Interval,6,4,4


In [13]:
print(atoms_df.to_latex())

\begin{tabular}{llrrr}
\toprule
 & type & count & count sup.min & count sup.max \\
\midrule
Ticket & Sequence & 11601 & 2359 & 1790 \\
Name & Sequence & 7279 & 2517 & 1427 \\
Cabin & Sequence & 1281 & 318 & 276 \\
Fare & Interval & 362 & 182 & 182 \\
Age & Interval & 350 & 176 & 176 \\
SibSp & Interval & 26 & 14 & 14 \\
Parch & Interval & 26 & 14 & 14 \\
Pclass & Interval & 10 & 6 & 6 \\
Survived & Interval & 6 & 4 & 4 \\
Sex & Interval & 6 & 4 & 4 \\
Total & Cartesian & 20947 & 5594 & 3893 \\
\bottomrule
\end{tabular}



# Q1. Mining all concepts

## Define the functions to run

Packages to track the running of the experiments

In [14]:
from tqdm.notebook import tqdm
from datetime import datetime
from timeout_function_decorator.timeout_decorator import timeout

In [15]:
import caspailleur as csp
from paspailleur import algorithms as algos

In [16]:
from bitarray import bitarray, frozenbitarray as fbarray
from bitarray.util import zeros

Function to run CbOI algorithm and to abbort the computation if they take more that `timelimit` seconds

In [17]:
def mine_atomised_intents_cboi(atomic_patterns, superatoms_order, timelimit=3600, min_support=0):
    try:
        t1 = datetime.now()
        concepts_iterator = algos.mine_equivalence_classes.iter_intents_via_cboi(
            atomic_patterns=atomic_patterns,
            superatoms_order=superatoms_order,
            min_support=min_support, yield_pattern_intents=False
        )
        concepts = timeout(timelimit)(list)(concepts_iterator)
        t2 = datetime.now()
    except TimeoutError:
        return None, None
    intents = [intent for intent, extent in concepts]
    return intents, (t2-t1).total_seconds()

Function to run LCM algorithm and to abbort the computation if they take more that `timelimit` seconds

In [18]:
import skmine
print(f"{skmine.__version__=}")
from skmine.itemsets import LCM

def mine_atomised_intents_lcm(itemsets, timelimit=3600, min_support=1):
    try:
        t1 = datetime.now()
        lcm = LCM(min_supp=min_support)
        closed_itemsets_df = timeout(timelimit)(lcm.fit_transform)(itemsets)
        t2 = datetime.now()
    except TimeoutError:
        return None, None
    return closed_itemsets_df, (t2-t1).total_seconds()

def lcm_output_to_ba(lcm_df, n_atomic_patterns) -> set[fbarray]:
    intents_lcm_ba = set(csp.io.isets2bas(list(map(set,lcm_df['itemset'].values)), n_atomic_patterns))
    bottom_intent = ~fbarray(zeros(n_atomic_patterns))
    if bottom_intent not in intents_lcm_ba:
        intents_lcm_ba.add(bottom_intent)
    return intents_lcm_ba

# Silence warning of numpy and pandas packages that appear while running LCM algorithm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

skmine.__version__='1.0.0'


Function to run meet-CbO algorithm and to abbort the computation if they take more that `timelimit` seconds

In [19]:
def mine_intents_meet_cbo(objects_patterns, timelimit=3600):
    try:
        t1 = datetime.now()
        concepts_iterator = algos.mine_equivalence_classes.iter_intents_via_ocbo(
            objects_patterns=objects_patterns,
        )
        concepts = timeout(timelimit)(list)(concepts_iterator)
        t2 = datetime.now()
    except TimeoutError:
        return None, None
    intents = [intent for intent, extent in concepts]
    return intents, (t2-t1).total_seconds()

Function to convert patterns from their atomic representation into the actual Paterns

In [20]:
def patternise_descriptions(atomised_descriptions: list[bitarray], atomic_patterns, subatoms_order):
    t1 = datetime.now()
    patterns = [algos.base_functions.patternise_description(ba, atomic_patterns, subatoms_order, trusted_input=True)
                for ba in atomised_descriptions]
    return patterns, (datetime.now()-t1).total_seconds()

## Run the experiments

In [21]:
TIMELIMIT = 60*10  # 10 minutes

Compute atomic patterns (and their statistics) for varying number of objects

In [22]:
n_rows_vars = [5, 10, 15, 20, 25, 50, 100]

atomic_stats = dict()
pstructures = dict()
for n_rows in tqdm(n_rows_vars):
    ps = PatternStructure(PassengerPattern)
    ps.fit(df[:n_rows].to_dict('index'), compute_atomic_patterns=False)

    t1 = datetime.now()
    ps.init_atomic_patterns()
    t2 = datetime.now()
    pstructures[n_rows] = ps

    atomic_stats[n_rows] = {
        'time atomise': (t2-t1).total_seconds(),
        '#atoms': len(ps._atomic_patterns),
        '#supmax atoms': len(ps._filter_atomic_patterns_by_support('maximal')[0]),
        '#supmin atoms': len(ps._filter_atomic_patterns_by_support('minimal')[0]),
        '#min atom support': min(extent.count() for extent in ps._atomic_patterns.values()),
    }

  0%|          | 0/7 [00:00<?, ?it/s]

Compute the running time of CbOI algorithm and how much time it takes to join atomic representations of intents into Patterns

In [23]:
stats_cboi = dict()
stats_join = dict()
for n_rows in tqdm(n_rows_vars):
    atomic_patterns, atomic_order = pstructures[n_rows]._filter_atomic_patterns_by_support('maximal')

    intents, dt = mine_atomised_intents_cboi(atomic_patterns, atomic_order, timelimit=TIMELIMIT)
    if intents is None:
        break
    stats_cboi[n_rows] = {'#intents cboi': len(intents), 'time cboi': dt}

    subatoms_order = csp.inverse_order(atomic_order)
    pattern_intents, dt = patternise_descriptions(intents, list(atomic_patterns), subatoms_order)
    stats_join[n_rows] = {'time': dt}

  0%|          | 0/7 [00:00<?, ?it/s]

Compute the running time of CbO (i.e. CbOI withoit Implications)

In [24]:
stats_cbo = dict()
for n_rows in tqdm(n_rows_vars):
    atomic_patterns, _ = pstructures[n_rows]._filter_atomic_patterns_by_support('maximal')
    emptyorder = [zeros(len(atomic_patterns)) for _ in atomic_patterns]

    intents, dt = mine_atomised_intents_cboi(atomic_patterns, emptyorder, timelimit=TIMELIMIT)
    if intents is None:
        break
    stats_cbo[n_rows] = {'#intents cbo': len(intents), 'time cbo': dt}

  0%|          | 0/7 [00:00<?, ?it/s]

Compute the running time of LCM

In [25]:
stats_lcm = dict()
for n_rows in tqdm(n_rows_vars):
    atomic_patterns, _ = pstructures[n_rows]._filter_atomic_patterns_by_support('maximal')
    itemsets = csp.io.to_itemsets(csp.io.transpose_context(list(atomic_patterns.values())))

    intents, dt = mine_atomised_intents_lcm(itemsets, timelimit=TIMELIMIT)
    if intents is None:
        break
    stats_lcm[n_rows] = {'#intents lcm': len(intents), 'time lcm': dt}

  0%|          | 0/7 [00:00<?, ?it/s]

Compute the running time of meet-CbO

In [26]:
stats_mcbo = dict()
for n_rows in tqdm(n_rows_vars):
    objects_patterns = [PassengerPattern(row) for row in df[:n_rows].to_dict('index').values()]

    intents, dt = mine_intents_meet_cbo(objects_patterns, timelimit=TIMELIMIT)
    if intents is None:
        break
    stats_mcbo[n_rows] = {'#intents mcbo': len(intents), 'time mcbo': dt}

  0%|          | 0/7 [00:00<?, ?it/s]

Combine all the information into a single table

In [27]:
stats_df = pd.concat([
    pd.DataFrame(atomic_stats), 
    pd.DataFrame(stats_cboi), pd.DataFrame(stats_cbo), pd.DataFrame(stats_lcm), pd.DataFrame(stats_mcbo),
    pd.DataFrame(stats_join)
]).T
stats_df = stats_df.rename(columns={'time': 'time join'})
stats_df.index.name = '#objects'
stats_df = stats_df.reset_index()
stats_df = stats_df[:7]
#stats_df['#atoms'] = stats_df['#atoms'].astype(int)
stats_df['#intents'] = stats_df['#intents cboi']#.astype(int)
stats_df = stats_df[['#objects', 
                     '#atoms', '#intents',
                    'time mcbo', 'time cboi', 'time cbo', 'time lcm',
                    'time atomise', 'time join']]
stats_df

Unnamed: 0,#objects,#atoms,#intents,time mcbo,time cboi,time cbo,time lcm,time atomise,time join
0,5,1000.0,28.0,0.183288,0.001532,0.002761,0.089254,1.393421,0.014668
1,10,1283.0,321.0,2.61342,0.017405,0.044953,0.134519,1.275252,0.147767
2,15,1498.0,1438.0,30.180839,0.075889,0.27296,0.678007,1.415559,0.627547
3,20,1656.0,3794.0,150.704337,0.224948,0.901631,2.132762,1.548037,1.705235
4,25,1784.0,8753.0,444.396787,0.550029,2.404384,5.714042,1.683326,3.895704
5,50,2852.0,57273.0,,6.297654,28.390238,70.76238,2.574883,25.038494
6,100,4573.0,1054138.0,,155.541369,,,4.264424,473.959958


In [28]:
ltx = stats_df.to_latex(float_format="%.3f", index=False, na_rep='')
print(ltx)

\begin{tabular}{rrrrrrrrr}
\toprule
#objects & #atoms & #intents & time mcbo & time cboi & time cbo & time lcm & time atomise & time join \\
\midrule
5 & 1000.000 & 28.000 & 0.183 & 0.002 & 0.003 & 0.089 & 1.393 & 0.015 \\
10 & 1283.000 & 321.000 & 2.613 & 0.017 & 0.045 & 0.135 & 1.275 & 0.148 \\
15 & 1498.000 & 1438.000 & 30.181 & 0.076 & 0.273 & 0.678 & 1.416 & 0.628 \\
20 & 1656.000 & 3794.000 & 150.704 & 0.225 & 0.902 & 2.133 & 1.548 & 1.705 \\
25 & 1784.000 & 8753.000 & 444.397 & 0.550 & 2.404 & 5.714 & 1.683 & 3.896 \\
50 & 2852.000 & 57273.000 &  & 6.298 & 28.390 & 70.762 & 2.575 & 25.038 \\
100 & 4573.000 & 1054138.000 &  & 155.541 &  &  & 4.264 & 473.960 \\
\bottomrule
\end{tabular}



# Q2. Mining frequent intents

In [29]:
min_supports = [800, 700, 600, 500, 400]
print(len(min_supports))

5


Compute atomic patterns for every minimal support threshold

In [30]:
pstructures_freq = {}
atomic_freq_stats = {}
for min_support in tqdm(min_supports):
    t1 = datetime.now()
    ps = PatternStructure(PassengerPattern)
    ps.fit(df.to_dict('index'), min_atom_support=min_support)
    t2 = datetime.now()
    
    pstructures_freq[min_support] = ps
    atomic_freq_stats[min_support] = {
        '#atoms': len(ps._atomic_patterns),
        '#minsup atoms': len(ps._filter_atomic_patterns_by_support('minimal')[0]),
        '#maxsup atoms': len(ps._filter_atomic_patterns_by_support('maximal')[0]),
        'time atomise': (t2-t1).total_seconds()
    }

  0%|          | 0/5 [00:00<?, ?it/s]

Mine frequent atomic intents with CbOI

In [31]:
stats_freq_cboi = {}
for min_support in tqdm(min_supports):
    maxsup_atoms, maxsup_order = pstructures_freq[min_support]._filter_atomic_patterns_by_support('maximal')

    intents, dt = mine_atomised_intents_cboi(maxsup_atoms, maxsup_order, timelimit=TIMELIMIT, min_support=min_support)
    if intents is None: break
    stats_freq_cboi[min_support] = {'#intents cboi': len(intents), 'time cboi': dt}

  0%|          | 0/5 [00:00<?, ?it/s]

Mine frequent atomic intents with CbO

In [32]:
stats_freq_cbo = {}
for min_support in tqdm(min_supports):
    maxsup_atoms, _ = pstructures_freq[min_support]._filter_atomic_patterns_by_support('maximal')
    emptyorder = [zeros(len(maxsup_atoms)) for _ in maxsup_atoms]

    intents, dt = mine_atomised_intents_cboi(maxsup_atoms, emptyorder, timelimit=TIMELIMIT, min_support=min_support)
    if intents is None: break
    stats_freq_cbo[min_support] = {'#intents cbo': len(intents), 'time cbo': dt}

  0%|          | 0/5 [00:00<?, ?it/s]

Mine frequent atomic intents with LCM

In [33]:
stats_freq_lcm = dict()
for min_support in tqdm(min_supports):
    atomic_patterns, _ = pstructures_freq[min_support]._filter_atomic_patterns_by_support('maximal')
    itemsets = csp.io.to_itemsets(csp.io.transpose_context(list(atomic_patterns.values())))

    intents, dt = mine_atomised_intents_lcm(itemsets, timelimit=TIMELIMIT, min_support=min_support)
    if intents is None: break
    stats_freq_lcm[min_support] = {'#intents lcm': len(intents), 'time lcm': dt}

  0%|          | 0/5 [00:00<?, ?it/s]

Combine all obtained results into one table

In [34]:
stats_freq_df = pd.concat([
    pd.DataFrame(atomic_freq_stats),
    pd.DataFrame(stats_freq_cboi), pd.DataFrame(stats_freq_cbo), pd.DataFrame(stats_freq_lcm),
]).T
stats_freq_df.index.name = '#objects'
stats_freq_df = stats_freq_df.reset_index()
stats_freq_df['#intents'] = stats_freq_df['#intents cboi']#.astype(int)
stats_freq_df = stats_freq_df[[
    '#objects',
    '#maxsup atoms', '#intents',
    'time cboi', 'time cbo', 'time lcm',
]]
stats_freq_df

Unnamed: 0,#objects,#maxsup atoms,#intents,time cboi,time cbo,time lcm
0,800,57.0,2409.0,0.200687,0.296464,0.568086
1,700,100.0,7558.0,0.220971,0.542898,1.861538
2,600,152.0,604632.0,31.230825,108.168841,217.104065
3,500,185.0,3444140.0,159.801218,,
4,400,212.0,8053643.0,370.77846,,


In [35]:
print(stats_freq_df.to_latex(float_format='%.2f', index=False, na_rep=''))

\begin{tabular}{rrrrrr}
\toprule
#objects & #maxsup atoms & #intents & time cboi & time cbo & time lcm \\
\midrule
800 & 57.00 & 2409.00 & 0.20 & 0.30 & 0.57 \\
700 & 100.00 & 7558.00 & 0.22 & 0.54 & 1.86 \\
600 & 152.00 & 604632.00 & 31.23 & 108.17 & 217.10 \\
500 & 185.00 & 3444140.00 & 159.80 &  &  \\
400 & 212.00 & 8053643.00 & 370.78 &  &  \\
\bottomrule
\end{tabular}



# Q3. Mining frequent keys

Run the function to mine frequent keys (aka frequent minimal generators)

In [36]:
def mine_keys_talkygi(atomic_patterns, superatoms_order, min_support, timelimit=3600):
    try:
        t1 = datetime.now()
        keys_iterator = algos.mine_equivalence_classes.iter_keys_via_talky_gi(
            atomic_patterns, superatoms_order, min_support=min_support, yield_pattern_keys=False
        )
        keys = timeout(timelimit)(list)(keys_iterator)
        t2 = datetime.now()
    except TimeoutError:
        return None, None
    return keys, (t2-t1).total_seconds()

Compute keys (i.e. minimal generators) using Talky-GI algorithm

In [37]:
stats_talkygi = {}
for min_support in tqdm(min_supports):
    minsup_atoms, minsup_order = pstructures_freq[min_support]._filter_atomic_patterns_by_support('minimal')
    minsup_subatoms_order = algos.base_functions.inverse_order(minsup_order)
    
    keys, dt = mine_keys_talkygi(minsup_atoms, minsup_order, min_support, timelimit=TIMELIMIT)
    if keys is None: break
    stats_talkygi[min_support] = {'#keys talkygi': len(keys), 'time talkygi': dt}

  0%|          | 0/5 [00:00<?, ?it/s]

Compute keys (i.e. minimal generators) using Talky-G algorithm

In [38]:
stats_talkyg = {}
for min_support in tqdm(min_supports):
    minsup_atoms, _ = pstructures_freq[min_support]._filter_atomic_patterns_by_support('minimal')
    emptyorder = [zeros(len(minsup_atoms)) for _ in minsup_atoms]
    
    keys, dt = mine_keys_talkygi(minsup_atoms, emptyorder, min_support, timelimit=TIMELIMIT)
    if keys is None: break
    stats_talkyg[min_support] = {'#keys talkyg': len(keys), 'time talkyg': dt}

ERROR! Session/line number was not unique in database. History logging moved to new session 1868


  0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
stats_freq_df = pd.concat([
    pd.DataFrame(atomic_freq_stats),
    pd.DataFrame(stats_talkygi), pd.DataFrame(stats_talkyg),
]).T
stats_freq_df.index.name = '#objects'
stats_freq_df = stats_freq_df.reset_index()

stats_freq_df = stats_freq_df[[
    '#objects', 
    '#minsup atoms', 
    '#keys talkygi', 'time talkygi',
    '#keys talkyg', 'time talkyg'
]]
stats_freq_df

Unnamed: 0,#objects,#minsup atoms,#keys talkygi,time talkygi,#keys talkyg,time talkyg
0,800,57.0,2409.0,0.167434,2425.0,0.273879
1,700,100.0,7612.0,0.299887,7735.0,0.559552
2,600,152.0,608342.0,29.088621,701639.0,59.457724
3,500,185.0,3459158.0,183.897075,4147580.0,346.352084
4,400,212.0,8115532.0,426.562943,,


In [40]:
print(stats_freq_df.to_latex(float_format='%.2f', index=False, na_rep=''))

\begin{tabular}{rrrrrr}
\toprule
#objects & #minsup atoms & #keys talkygi & time talkygi & #keys talkyg & time talkyg \\
\midrule
800 & 57.00 & 2409.00 & 0.17 & 2425.00 & 0.27 \\
700 & 100.00 & 7612.00 & 0.30 & 7735.00 & 0.56 \\
600 & 152.00 & 608342.00 & 29.09 & 701639.00 & 59.46 \\
500 & 185.00 & 3459158.00 & 183.90 & 4147580.00 & 346.35 \\
400 & 212.00 & 8115532.00 & 426.56 &  &  \\
\bottomrule
\end{tabular}



Find examples of "redundant" keys outputted by Talky-G

In [41]:
ps = pstructures_freq[800]
atomic_patterns, superatoms_order = ps._filter_atomic_patterns_by_support('minimal')
emptyorder = [zeros(len(atomic_patterns)) for _ in atomic_patterns]
subatoms_order = algos.base_functions.inverse_order(superatoms_order)

In [42]:
keys_per_extent = {}
for key, extent in mine_keys_talkygi(atomic_patterns, emptyorder, 800)[0]:
    keys_per_extent[extent] = keys_per_extent.get(extent, set()) | {key}

keys_per_extent_impl = {}
for key, extent in mine_keys_talkygi(atomic_patterns, superatoms_order, 800)[0]:
    keys_per_extent_impl[extent] = keys_per_extent_impl.get(extent, set()) | {key}

In [43]:
extents_diff_keys = [extent for extent in keys_per_extent_impl if keys_per_extent[extent]!=keys_per_extent_impl[extent]]
extent = min(extents_diff_keys, key=lambda extent: min(ba.count() for ba in keys_per_extent_impl[extent]))
#extent

Minimal generators of `extent` according to Talky-G

In [44]:
patternise_descriptions(keys_per_extent[extent], list(atomic_patterns), subatoms_order)[0]

[{'Fare': < 263.0, 'SibSp': < 2.0}, {'Fare': < 262.0, 'SibSp': < 2.0}]

Minimal generators of `extent` according to Talky-GI

In [45]:
patternise_descriptions(keys_per_extent_impl[extent], list(atomic_patterns), subatoms_order)[0]

[{'Fare': < 263.0, 'SibSp': < 2.0}]