In [1]:
# `display` and `HTML` are public API of `IPython.display`; importing them from
# `IPython.core.display` is deprecated and triggers a DeprecationWarning.
from IPython.display import display, HTML

# Widen the notebook container so that wide tables and outputs fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Intro

There are a number of Python packages for working with Formal Concept Analysis (FCA). In this notebook we compare their performance on the basic FCA task: constructing a concept lattice from a formal context.

We consider three packages: FCApy, fcapsy and Concepts

_Note: more packages can be added to the comparison in the future._

# Install the competing libraries

The current `FCApy` library (by Egor Dudyrev, HSE Moscow): https://github.com/EgorDudyrev/FCApy 

In [2]:
!pip -q install -U fcapy[all] --user

In [3]:
from fcapy import LIB_INSTALLED
from fcapy.context import FormalContext, converters
from fcapy.lattice import ConceptLattice
from fcapy.visualizer import Visualizer

`Concepts` package (by Sebastian Bank, University of Leipzig): https://github.com/xflr6/concepts

In [4]:
!pip -q install -U concepts --user

In [5]:
import concepts

`fcapsy` package (by Tomáš Mikula, Palacký University): https://github.com/mikulatomas/fcapsy

In [6]:
!pip -q install -U fcapsy --user

In [7]:
import fcapsy

# Load data

First we load some classic FCA contexts (datasets)

In [8]:
import os
import shutil
import tempfile
import urllib.request

# Classic FCA contexts, keyed by name; every value is a binary pandas DataFrame.
frames_classic = {}
contexts_to_test = ['animal_movement', 'digits', 'gewaesser',
                    'lattice', 'liveinwater', 'tealady']

# A private temporary directory is used instead of `rm -rf tmp; mkdir tmp`:
# it cannot wipe an unrelated `tmp` folder that already exists in the working
# directory, it is removed automatically, and it does not require the
# `wget`/`mkdir`/`rm` shell tools to be available.
with tempfile.TemporaryDirectory() as tmp_dir:
    for ctx_name in contexts_to_test:
        fname = os.path.join(tmp_dir, f'{ctx_name}.cxt')
        url = f'https://raw.githubusercontent.com/EgorDudyrev/FCApy/main/data/{ctx_name}.cxt'
        # Download the .cxt file; unlike `wget -q`, a failed download raises here
        # instead of silently leaving an empty file behind.
        with urllib.request.urlopen(url) as response, open(fname, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        ctx = converters.read_cxt(fname)
        df = ctx.to_pandas()
        # The context name is stored on the frame; later cells read `frame.name`.
        df.name = ctx_name
        frames_classic[ctx_name] = df

Add the Bob Ross dataset, which has more objects and attributes than the classic FCA datasets.

In [9]:
import pandas as pd

ctx_name = 'bob_ross'
bob_ross_url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/bob-ross/elements-by-episode.csv'

# pandas reads the CSV straight from the URL, so no temporary file has to be
# downloaded with `wget` and removed with `rm` afterwards.
df = pd.read_csv(bob_ross_url)
# Use "<episode> <title>" as the object name of every painting.
df['EPISODE_TITLE'] = df['EPISODE'] + ' ' + df['TITLE']
# `columns=` replaces the positional `axis=1` argument, which pandas >= 2.0
# no longer accepts (it raises a TypeError there).
df = df.drop(columns=['EPISODE', 'TITLE']).set_index('EPISODE_TITLE').astype(bool)
# The context name is stored on the frame; later cells read `frame.name`.
df.name = ctx_name
frames_classic[ctx_name] = df
print(df.shape)

(403, 67)


These classic real-world contexts are small, so we also add some larger random contexts to the comparison.

In [10]:
import numpy as np
from itertools import product

# Seed the global NumPy generator so the random contexts are reproducible.
np.random.seed(42)

n_objects_vars = [10, 30, 100]
n_attributes_vars = [10, 30, 50]
densities_vars = [0.1, 0.5, 0.9]

# Random binary contexts, keyed by "random_<n_objects>_<n_attributes>_<density>".
frames_random = {}
for n_objects, n_attributes, density in product(n_objects_vars, n_attributes_vars, densities_vars):
    # Every cell of the incidence table is drawn independently: 1 with probability `density`.
    incidence = np.random.binomial(1, density, size=(n_objects, n_attributes))
    frame = pd.DataFrame(
        incidence,
        index=[f"g_{i}" for i in range(n_objects)],
        columns=[f"m_{i}" for i in range(n_attributes)],
    ).astype(bool)

    # The context name is stored on the frame; later cells read `frame.name`.
    frame.name = f"random_{n_objects}_{n_attributes}_{density}"
    frames_random[frame.name] = frame

In [11]:
# Benchmark every context: the classic datasets together with the random ones.
# (Use only `frames_classic` here to restrict the run to the classic datasets.)
frames = {**frames_classic, **frames_random}

# Run benchmarks

## Default lattice visualizations

Let us take one classic FCA context, 'animal movement', and a bigger one, the 'bob ross' dataset. (Note: the cells below currently run on 'tealady' instead of 'bob ross'; see `ctx_names`.)

The description of Animals context:
* objects (rows) are Animals
* attributes (columns) are Actions
* the table shows whether an Animal can perform an Action

The description of Bob Ross dataset:
* objects (rows) are paintings by Bob Ross
* attributes (columns) are specific elements in these paintings
* the table shows whether an element is on a painting

In [12]:
ctx_names = ['animal_movement', 'tealady']#'bob_ross']

In [13]:
ctx_name = ctx_names[0]
print(ctx_name)
df = frames[ctx_name]
print(df.shape)
df.head()

animal_movement
(16, 4)


Unnamed: 0,fly,hunt,run,swim
dove,True,False,False,False
hen,False,False,False,False
duck,True,False,False,True
goose,True,False,False,True
owl,True,True,False,False


### Visualization by `concepts`

The visualization can be found in the file
* _lattice_visualization_concepts_animal_movement.png_
* _lattice_visualization_concepts_bob_ross.png_

In [14]:
from datetime import datetime

In [15]:
# Build and render the lattice of every selected context with the `concepts` package.
# All printed timings are cumulative: they are measured from `t1`, taken just
# before the context object is constructed.
for ctx_name in ctx_names:
    df = frames[ctx_name]
    print(ctx_name)
    t1 = datetime.now()
    ctx_concepts = concepts.Context(df.index, df.columns, df.values)
    # NOTE(review): `lattice` is presumably built lazily on first access, so this
    # attribute read is where the construction cost is paid - confirm in the `concepts` docs.
    ltc_concepts = ctx_concepts.lattice
    print(f'Lattice constructed in {(datetime.now()-t1).total_seconds()} seconds')
    # Render the lattice drawing into imgs/lattice_visualization/.
    ltc_concepts.graphviz(f'imgs/lattice_visualization/concepts_{ctx_name}', render=True);
    t2 = datetime.now()
    # `dt` covers context construction, lattice construction and rendering together.
    dt = (t2-t1).total_seconds()
    print(f"Executed in {dt} seconds")

animal_movement
Lattice constructed in 0.001663 seconds
Executed in 0.058675 seconds
tealady
Lattice constructed in 0.006811 seconds
Executed in 0.056226 seconds


### Visualization by `fcapy`

Visualizations can be found in the files
* _lattice_visualization_fcapy_networkx_animal_movement.png_ 
* _lattice_visualization_fcapy_plotly_animal_movement.png_

* _lattice_visualization_fcapy_networkx_bob_ross.png_ 
* _lattice_visualization_fcapy_plotly_bob_ross.png_

In [16]:
import matplotlib.pyplot as plt

# Build the lattice of every selected context with FCApy and save two drawings of it
# (a networkx/matplotlib one and a plotly one). All printed timings are cumulative:
# they are measured from `t1`, taken just before the context object is constructed.
for ctx_name in ctx_names:
    df = frames[ctx_name]
    print(ctx_name)
    
    t1 = datetime.now()
    ctx_fcapy = FormalContext.from_pandas(df)
    ltc_fcapy = ConceptLattice.from_context(ctx_fcapy)
    print(f'Lattice constructed in {(datetime.now()-t1).total_seconds()} seconds')
    vsl_fcapy = Visualizer(ltc_fcapy)
    print(f'Visualizer constructed in {(datetime.now()-t1).total_seconds()} seconds')

    # Matplotlib-based drawing; the figure is closed after saving so nothing is shown inline.
    plt.title('Networkx lattice')
    vsl_fcapy.draw_networkx()
    plt.savefig(f'imgs/lattice_visualization/fcapy_networkx_{ctx_name}.png')
    plt.close()
    print(f'Png saved in {(datetime.now()-t1).total_seconds()} seconds')

    # Plotly-based drawing, exported as a static image.
    fig = vsl_fcapy.get_plotly_figure(title='Plotly lattice')
    fig.write_image(f'imgs/lattice_visualization/fcapy_plotly_{ctx_name}.png')
    t2 = datetime.now()
    # `dt` covers the whole pipeline for this context, including both image exports.
    dt = (t2-t1).total_seconds()
    print(f'Executed in {dt} seconds')

animal_movement
Lattice constructed in 0.041466 seconds
Visualizer constructed in 0.042298 seconds
Png saved in 0.245663 seconds
Executed in 3.016535 seconds
tealady
Lattice constructed in 0.013088 seconds
Visualizer constructed in 0.015431 seconds
Png saved in 0.86057 seconds
Executed in 1.020402 seconds


## Time to construct a lattice

Functions to run the same lattice construction task with different libraries

In [17]:
from datetime import datetime

In [18]:
def construct_context_by_lib(frame, lib_name):
    """Create the formal-context object of the requested library from a binary DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Binary table: rows are objects, columns are attributes.
    lib_name : str
        One of 'concepts', 'fcapy' or 'fcapsy'.

    Returns
    -------
    The context object of the chosen library.

    Raises
    ------
    ValueError
        If `lib_name` is not one of the supported libraries.
    """
    # The builders are wrapped in lambdas so that only the requested library is touched.
    builders = {
        'concepts': lambda: concepts.Context(frame.index, frame.columns, frame.values),
        'fcapy': lambda: FormalContext.from_pandas(frame),
        'fcapsy': lambda: fcapsy.Context.from_pandas(frame),
    }
    if lib_name not in builders:
        raise ValueError(f'Given library "{lib_name}" is not supported')

    return builders[lib_name]()

In [19]:
def test_intent_extent_time_by_func(objects, attributes, extent_func, intent_func, samples_per_size=100):
    times = []
    for arr, fnc in [(objects, intent_func), (attributes, extent_func)]:
        subsample_sizes = np.logspace(0, np.log(len(arr))/np.log(10), 10).round(0).astype(int)    
        np.random.seed(42)
        samples = [sample for size in subsample_sizes for sample in np.random.choice(arr, size=(samples_per_size, size))]
        
        t1 = datetime.now()
        intents = [fnc(sample) for sample in samples]
        t2 = datetime.now()
        dt = (t2-t1).total_seconds()/len(samples) 
        times.append(dt)
    intent_time, extent_time = times
    
    return intent_time, extent_time


def test_intent_extent_time_by_lib(frame, context, lib_name, samples_per_size=100):
    if lib_name == 'concepts':
        intent_time, extent_time = test_intent_extent_time_by_func(
            frame.index, frame.columns, context.extension, context.intension, samples_per_size)
    elif lib_name == 'fcapy':
        intent_time, extent_time = test_intent_extent_time_by_func(
            frame.index, frame.columns, context.extension, context.intention, samples_per_size)
    elif lib_name == 'fcapsy':
        intent_time, extent_time = test_intent_extent_time_by_func(
            frame.index, frame.columns,
            lambda ar: context.down(context.Attributes(ar)),
            lambda ar: context.up(context.Objects(ar)),
            samples_per_size
        )
    else:
        raise ValueError(f'Given library "{lib_name}" is not supported')
        
    return intent_time, extent_time


def test_intent_extent_time_by_lib_multiprocess(frame, context, lib_name, intent_time, extent_time, samples_per_size=100):
    """Subprocess entry point: run the intent/extent benchmark and publish the results.

    `intent_time` and `extent_time` are objects with a writable `.value` attribute;
    the measured times are stored there instead of being returned.
    """
    measured_intent, measured_extent = test_intent_extent_time_by_lib(frame, context, lib_name, samples_per_size)
    intent_time.value = measured_intent
    extent_time.value = measured_extent

In [20]:
def test_lattice_time_by_func(context, lattice_func):
    t1 = datetime.now()
    ltc = lattice_func(context)
    t2 = datetime.now()
    dt = (t2-t1).total_seconds()
    return dt

def test_lattice_time_by_lib(context, lib_name):
    if lib_name == 'concepts':
        lattice_time = test_lattice_time_by_func(context, lambda ctx: ctx.lattice)
    elif lib_name == 'fcapy':
        lattice_time = test_lattice_time_by_func(context, lambda ctx: ConceptLattice.from_context(ctx, algo='CbO'))
    elif lib_name == 'fcapsy':
        lattice_time = test_lattice_time_by_func(context, lambda ctx: fcapsy.Lattice.from_context(ctx))
    else:
        raise ValueError(f'Given library "{lib_name}" is not supported')

    return lattice_time

def test_lattice_time_by_lib_multiprocess(context, lib_name, lattice_time):
    """Subprocess entry point: run the lattice benchmark and publish the result.

    `lattice_time` is an object with a writable `.value` attribute; the measured
    time is stored there instead of being returned.
    """
    elapsed = test_lattice_time_by_lib(context, lib_name)
    lattice_time.value = elapsed

In [21]:
import multiprocessing
def _run_with_timeout(target, name, args, timeout_seconds):
    """Run `target(*args)` in a child process and stop it once the timeout is exceeded."""
    p = multiprocessing.Process(target=target, name=name, args=args)
    p.start()
    p.join(timeout_seconds)
    if p.is_alive():
        p.terminate()
        # Reap the terminated child; without this join every timed-out benchmark
        # would leave a zombie process behind for the rest of the session.
        p.join()


def run_func_multiprocess(frame, lib_name, timeout_seconds, samples_per_size=1000):
    """Benchmark one library on one context, giving each measurement its own time budget.

    The intent/extent benchmark and the lattice-construction benchmark each run in a
    separate child process that is stopped after `timeout_seconds`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Binary context table.
    lib_name : str
        Library to benchmark; passed to `construct_context_by_lib`.
    timeout_seconds : float
        Maximal running time of each of the two child processes.
    samples_per_size : int
        Number of random samples per subset size in the intent/extent benchmark.

    Returns
    -------
    dict
        Measured times in seconds, plus the timeout used. A time is None when its
        child process did not write a result (e.g. it was stopped by the timeout).
    """
    context = construct_context_by_lib(frame, lib_name)

    # Shared values through which the children report their results. -1 means
    # "nothing reported". 'd' (double) keeps the full precision of the measured
    # times; 'f' would round them to single precision.
    lattice_time, intent_time, extent_time = [multiprocessing.Value('d', -1, lock=False) for _ in range(3)]

    _run_with_timeout(
        target=test_intent_extent_time_by_lib_multiprocess,
        name=f"test_intent_extent_{lib_name}",
        args=[frame, context, lib_name, intent_time, extent_time, samples_per_size],
        timeout_seconds=timeout_seconds)

    _run_with_timeout(
        target=test_lattice_time_by_lib_multiprocess,
        name=f"test_lattice_{lib_name}",
        args=[context, lib_name, lattice_time],
        timeout_seconds=timeout_seconds)

    def neg1_to_none(multiprocess_var):
        # Translate the "nothing reported" sentinel into None.
        return multiprocess_var.value if multiprocess_var.value != -1 else None

    stat = {
        'lattice_construction_time (secs)': neg1_to_none(lattice_time),
        'intent_time (secs)': neg1_to_none(intent_time),
        'extent_time (secs)': neg1_to_none(extent_time),
        'timeout_seconds': timeout_seconds,
    }
    return stat

In [22]:
def get_context_stat(frame):
    """Summarise a binary context table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Binary table with a `name` attribute set on it.

    Returns
    -------
    dict
        Context name, number of objects (rows), number of attributes (columns),
        number of connections (sum of all cells), density (connections divided by
        the number of cells) and whether the name starts with 'random'.
    """
    n_objects, n_attributes = frame.shape
    n_connections = frame.sum().sum()
    density = n_connections/(n_objects*n_attributes)

    return {
        'ctx_name': frame.name,
        'n_objects': n_objects,
        'n_attributes': n_attributes,
        'n_connections': n_connections,
        'density': density,
        'is_random': frame.name.startswith('random'),
    }

In [23]:
from tqdm.notebook import tqdm
import seaborn as sns

Run the benchmarks

In [24]:
# Context names ordered by their number of connections, smallest first.
frames_order = sorted(
    frames.keys(),
    key=lambda name: get_context_stat(frames[name])['n_connections'],
)

In [25]:
n_runs = 10  # number of run numbers generated for every (context, library) pair
timeout_secs = 5*60  # time budget in seconds, passed as `timeout_seconds` to `run_func_multiprocess`

In [26]:
from itertools import product

In [27]:
# Every benchmark task is one (run number, context name, library name) triple.
run_number_vals = list(range(n_runs))
ctx_names_vals = frames_order
lib_names_vals = ['concepts', 'fcapy', 'fcapsy']
# `product` varies the last factor fastest: within one run, each context is
# paired with all the libraries before the next context starts.
all_combs = list(product(run_number_vals, ctx_names_vals, lib_names_vals))
print(len(all_combs))

1020


In [28]:
# One row per benchmark task; the benchmark loop adds the measurements as new columns.
stats_df = pd.DataFrame(all_combs, columns=['run_number', 'ctx_name', 'lib_name'])
# Nothing has been measured yet.
stats_df['is_computed'] = False
# Initial checkpoint file; the benchmark loop re-reads and rewrites it on every iteration.
stats_df.to_csv('benchmark_stats_tmp.csv')

In [29]:
%%time

# Run every benchmark task that is not marked as computed yet. The task list is
# taken once, from the in-memory `stats_df`, before the loop starts.
df_to_compute = stats_df[~stats_df['is_computed']][['run_number','ctx_name','lib_name']]
for comb in tqdm(df_to_compute.iterrows(), total=len(df_to_compute)):
    # Reload the checkpoint so each iteration starts from what is stored on disk.
    stats_df = pd.read_csv('benchmark_stats_tmp.csv', index_col=0)
    # `iterrows` yields (index label, row); the row holds the three selected columns.
    row_idx, (run_number, ctx_name, lib_name) = comb
    
    frame = frames[ctx_name]
    frame_stat = get_context_stat(frame)
    
    stat = run_func_multiprocess(frame, lib_name, timeout_seconds=timeout_secs)
    # Merge the timing results with the context statistics into one record.
    stat = dict(stat, **frame_stat)
    
    # Write the record into the task's row, one column per key.
    for k,v in stat.items():
        stats_df.loc[row_idx, k] = v
    stats_df.loc[row_idx, 'is_computed'] = True
    
    # Save after every task so that finished measurements are already on disk.
    stats_df.to_csv('benchmark_stats_tmp.csv')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1020.0), HTML(value='')))


CPU times: user 26.2 s, sys: 10.5 s, total: 36.7 s
Wall time: 15h 4min 37s


In [30]:
import os

# Promote the checkpoint file to the final results file. `os.replace` has the same
# effect as `cp` followed by `rm` (the target is overwritten and the temporary file
# disappears), but it works on every OS and does not rely on shell tools.
os.replace('benchmark_stats_tmp.csv', 'benchmark_stats.csv')

# Analyze the results

In [31]:
stats_df = pd.read_csv('benchmark_stats.csv', index_col=0)
print(stats_df.shape)
stats_df.head()

(1020, 13)


Unnamed: 0,run_number,ctx_name,lib_name,is_computed,lattice_construction_time (secs),intent_time (secs),extent_time (secs),timeout_seconds,n_objects,n_attributes,n_connections,density,is_random
0,0,random_10_10_0.1,concepts,True,0.000661,6e-06,5e-06,300.0,10.0,10.0,9.0,0.09,True
1,0,random_10_10_0.1,fcapy,True,0.0017,5e-06,6e-06,300.0,10.0,10.0,9.0,0.09,True
2,0,random_10_10_0.1,fcapsy,True,0.000537,6e-06,4e-06,300.0,10.0,10.0,9.0,0.09,True
3,0,animal_movement,concepts,True,0.000837,6e-06,4e-06,300.0,16.0,4.0,24.0,0.375,False
4,0,animal_movement,fcapy,True,0.002259,5e-06,7e-06,300.0,16.0,4.0,24.0,0.375,False


In [32]:
# A missing time means the benchmark subprocess reported no result, so the timeout is
# used as its value. Only the timing columns are filled: a bare `fillna(timeout_secs)`
# would also write 300 into any other column that happens to contain a NaN
# (e.g. `n_objects` or `density`), silently corrupting the context statistics.
time_columns = ['lattice_construction_time (secs)', 'intent_time (secs)', 'extent_time (secs)']
stats_df = stats_df.fillna({col: timeout_secs for col in time_columns})

Benchmark results can be found in the file:
* _benchmark_stats.csv_

Context statistics (number of objects, attributes, etc.) are in the file:
* _context_statistics.csv_

In [33]:
# Numeric descriptors of a context, reused as the x-axes of the plots below.
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']

# One row per distinct context: the statistics repeat across runs and libraries,
# so duplicates are dropped before saving.
context_stat_df = (
    stats_df[['ctx_name', *context_stat_feats]]
    .drop_duplicates()
    .reset_index(drop=True)
)
context_stat_df.to_csv('context_statistics.csv')

We do not display any plots inline in this notebook so that GitHub is able to render it.

Benchmark plots can be found in the files:
* _imgs/lattice_construction_time/classic_contexts.png_
* _imgs/lattice_construction_time/random_contexts.png_
* _imgs/intent_extent_time/all_data.png_

In [34]:
plt.rcParams['figure.facecolor'] = (1,1,1,1)  # (1,1,1,0)

In [35]:
import os

# Lattice construction time against every context statistic, classic contexts only.
y_feat = 'lattice_construction_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The former
# `len(...)//width+1` reserved an extra, empty row whenever the number of
# panels was a multiple of `width`.
n_rows = -(-len(context_stat_feats) // width)
out_path = 'imgs/lattice_construction_time/classic_contexts.png'

plt.figure(figsize=(10,10))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='lib_name', data=stats_df[~stats_df['is_random']])
    plt.title(x_feat)
    plt.xlabel(x_feat)
    plt.ylabel('time (secs)')
    # Mark the time budget: runs that reached it were stopped by the timeout.
    plt.axhline(timeout_secs, linestyle='--', color='grey')
    plt.text(plt.xlim()[1]*0.6, timeout_secs*1.05, 'maximal time per run')
    plt.ylim(-1, timeout_secs*1.2)
    plt.legend()

plt.tight_layout()
# Leave room at the top for the figure-level title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for classic fca contexts)")
# savefig does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, pad_inches=0.1, bbox_inches='tight')
# Close the figure so that nothing is rendered inline.
plt.close()

In [36]:
import os

# Lattice construction time against every context statistic, random contexts only.
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
y_feat = 'lattice_construction_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The former
# `len(...)//width+1` reserved an extra, empty row whenever the number of
# panels was a multiple of `width`.
n_rows = -(-len(context_stat_feats) // width)
out_path = 'imgs/lattice_construction_time/random_contexts.png'

plt.figure(figsize=(10,10))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='lib_name', data=stats_df[stats_df['is_random']])
    plt.title(x_feat)
    plt.xlabel(x_feat)
    plt.ylabel('time (secs)')
    # Mark the time budget: runs that reached it were stopped by the timeout.
    plt.axhline(timeout_secs, linestyle='--', color='grey')
    plt.text(plt.xlim()[1]*0.6, timeout_secs*1.05, 'maximal time per run')
    plt.ylim(-1, timeout_secs*1.2)
    plt.legend()
plt.tight_layout()
# Leave room at the top for the figure-level title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for random contexts)")
# savefig does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, pad_inches=0.1, bbox_inches='tight')
# Close the figure so that nothing is rendered inline.
plt.close()

In [37]:
# Combined cost of one intent plus one extent computation (row-wise sum of both columns).
intent_extent_cols = ['intent_time (secs)', 'extent_time (secs)']
stats_df['intent+extent_time (secs)'] = stats_df[intent_extent_cols].sum(axis=1)

In [38]:
import os

# Intent+extent time against every context statistic, for all contexts.
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
y_feat = 'intent+extent_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The former
# `len(...)//width+1` reserved an extra, empty row whenever the number of
# panels was a multiple of `width`.
n_rows = -(-len(context_stat_feats) // width)
out_path = 'imgs/intent_extent_time/all_data.png'

plt.figure(figsize=(10,7))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    # Times are stored in seconds; they are scaled to microseconds for plotting.
    sns.lineplot(x=stats_df[x_feat], y=stats_df[y_feat]*1e6, hue=stats_df['lib_name'])
    plt.title(x_feat)
    plt.xlabel(x_feat)
    plt.ylabel(r'time (microseconds)')
    plt.legend()
plt.tight_layout()
# Leave room at the top for the figure-level title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for random and classic contexts)")
# savefig does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, pad_inches=0.1, bbox_inches='tight')
# Close the figure so that nothing is rendered inline.
plt.close()