In [1]:
# Widen the notebook page to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Intro

There are a number of Python packages for Formal Concept Analysis (FCA). In this notebook we compare their performance on the basic FCA task: constructing the concept lattice from a formal context.

We consider three packages: FCApy, fcapsy and Concepts

_Note: more packages may be added to the comparison in the future._

# Install the competing libraries

The current `FCApy` library (by Egor Dudyrev, HSE Moscow): https://github.com/EgorDudyrev/FCApy 

In [2]:
!pip -q install -U fcapy

In [3]:
from fcapy import LIB_INSTALLED
from fcapy.context import FormalContext, converters
from fcapy.lattice import ConceptLattice
from fcapy.visualizer import Visualizer

`Concepts` package (by Sebastian Bank, University of Leipzig): https://github.com/xflr6/concepts

In [4]:
!pip -q install -U concepts

In [5]:
import concepts

`fcapsy` package (by Tomáš Mikula, Palacký University): https://github.com/mikulatomas/fcapsy

In [6]:
!pip -q install -U fcapsy

In [7]:
import fcapsy

# Load data

First we load some classic FCA contexts (datasets)

In [8]:
import os
import shutil
import tempfile
from urllib.request import urlopen

# Location of the raw `.cxt` context files in the FCApy repository.
fcapy_data_url = 'https://raw.githubusercontent.com/EgorDudyrev/FCApy/main/data'

frames_classic = {}
contexts_to_test = ['animal_movement', 'digits', 'gewaesser',
                    'lattice', 'liveinwater', 'tealady']

# Download into a private temporary directory instead of `rm -rf tmp; mkdir tmp`:
# nothing that already lives in a local `tmp` folder gets deleted, the directory
# is removed even if a step fails, and no external `wget`/`rm` tools are needed.
with tempfile.TemporaryDirectory() as tmp_dir:
    for ctx_name in contexts_to_test:
        fname = os.path.join(tmp_dir, f'{ctx_name}.cxt')
        # urlopen raises on HTTP errors, so a failed download stops here
        # rather than leaving an empty file for the parser.
        with urlopen(f'{fcapy_data_url}/{ctx_name}.cxt') as response, open(fname, 'wb') as cxt_file:
            shutil.copyfileobj(response, cxt_file)
        ctx = converters.read_cxt(fname)
        df = ctx.to_pandas()
        # The name travels with the frame; get_context_stat() reads it later.
        df.name = ctx_name
        frames_classic[ctx_name] = df

Add the Bob Ross dataset, which has more objects and attributes than the classic FCA datasets.

In [9]:
import pandas as pd

ctx_name = 'bob_ross'
# pandas reads the CSV straight from the URL, so no temporary file has to be
# downloaded with wget and removed afterwards.
bob_ross_url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/bob-ross/elements-by-episode.csv'
df = pd.read_csv(bob_ross_url)
# Objects are identified by "<episode> <title>".
df['EPISODE_TITLE'] = df['EPISODE']+' '+df['TITLE']
# `columns=` replaces the positional `axis=1`, which pandas >= 2.0 no longer accepts.
df = df.drop(columns=['EPISODE','TITLE']).set_index('EPISODE_TITLE').astype(bool)
df.name = ctx_name
frames_classic[ctx_name] = df
print(df.shape)

(403, 67)


These classic real-world contexts are small, so we also add randomly generated contexts of varying size and density to our examination.

In [10]:
import numpy as np
from itertools import product

# Fixed seed: the same random contexts are generated on every run.
np.random.seed(42)
n_objects_vars = [10, 15]
n_attributes_vars = [10, 15]
densities_vars = [0.1, 0.5, 0.9]

# One random context per (number of objects, number of attributes, density).
frames_random = {}
for n_objects, n_attributes, density in product(n_objects_vars, n_attributes_vars, densities_vars):
    # Every cell of the cross-table is drawn as binomial(1, density).
    incidence = np.random.binomial(1, density, size=(n_objects, n_attributes))
    frame = pd.DataFrame(
        incidence,
        index=[f"g_{i}" for i in range(n_objects)],
        columns=[f"m_{i}" for i in range(n_attributes)],
    ).astype(bool)

    frame.name = f"random_{n_objects}_{n_attributes}_{density}"
    frames_random[frame.name] = frame

In [11]:
# All contexts to benchmark: the classic ones first, then the random ones.
frames = {**frames_classic, **frames_random}

# Run benchmarks

## Default lattice visualizations

Let us take one classic FCA context, 'animal movement', and a bigger one, the 'bob ross' dataset (the latter is currently left out of the list of contexts in the cell below).

The description of Animals context:
* objects (rows) are Animals
* attributes (columns) are Actions
* the table shows whether an Animal can perform an Action

The description of Bob Ross dataset:
* objects (rows) are paintings by Bob Ross
* attributes (columns) are specific elements in these paintings
* the table shows whether an element is on a painting

In [12]:
# Contexts to visualize; 'bob_ross' can be appended to the list to draw it as well.
ctx_names = ['animal_movement']

In [13]:
# Preview the first selected context: its name, its shape and its first rows.
ctx_name = ctx_names[0]
df = frames[ctx_name]
print(ctx_name, df.shape, sep='\n')
df.head()

animal_movement
(16, 4)


Unnamed: 0,fly,hunt,run,swim
dove,True,False,False,False
hen,False,False,False,False
duck,True,False,False,True
goose,True,False,False,True
owl,True,True,False,False


### Visualization by `concepts`

The visualization can be found in the file
* _lattice_visualization_concepts_animal_movement.png_
* _lattice_visualization_concepts_bob_ross.png_

In [14]:
from datetime import datetime

In [15]:
for ctx_name in ctx_names:
    df = frames[ctx_name]
    print(ctx_name)

    # Timed region: build the `concepts` context, take its lattice and render it.
    t1 = datetime.now()
    ctx_concepts = concepts.Context(df.index, df.columns, df.values)
    ltc_concepts = ctx_concepts.lattice
    ltc_concepts.graphviz(f'lattice_visualization_concepts_{ctx_name}', render=True)
    t2 = datetime.now()

    dt = (t2 - t1).total_seconds()
    print(f"Executed in {dt} seconds")

animal_movement
Executed in 0.023466 seconds


### Visualization by `fcapy`

Visualizations can be found in the files
* _lattice_visualization_fcapy_networkx_animal_movement.png_ 
* _lattice_visualization_fcapy_plotly_animal_movement.png_

* _lattice_visualization_fcapy_networkx_bob_ross.png_ 
* _lattice_visualization_fcapy_plotly_bob_ross.png_

In [16]:
import matplotlib.pyplot as plt

for ctx_name in ctx_names:
    df = frames[ctx_name]
    print(ctx_name)

    t1 = datetime.now()
    # Context -> lattice -> visualizer; all three steps belong to the timed region.
    ctx_fcapy = FormalContext.from_pandas(df)
    ltc_fcapy = ConceptLattice.from_context(ctx_fcapy)
    vsl_fcapy = Visualizer(ltc_fcapy)

    # First picture: networkx drawing, saved through the current matplotlib figure.
    networkx_png = f'lattice_visualization_fcapy_networkx_{ctx_name}.png'
    plt.title('Networkx lattice')
    vsl_fcapy.draw_networkx()
    plt.savefig(networkx_png)
    plt.close()

    # Second picture: plotly figure written to an image file.
    plotly_png = f'lattice_visualization_fcapy_plotly_{ctx_name}.png'
    fig = vsl_fcapy.get_plotly_figure(title='Plotly lattice')
    fig.write_image(plotly_png)
    t2 = datetime.now()

    dt = (t2 - t1).total_seconds()
    print(f'Executed in {dt} seconds')

animal_movement
Executed in 1.073445 seconds


## Time to construct a lattice

Functions to run the same lattice construction task with different libraries

In [17]:
def test_intent_extent_time(objects, attributes, extent_func, intent_func, samples_per_size=10):
    times = []
    for arr, fnc in [(objects, intent_func), (attributes, extent_func)]:
        subsample_sizes = np.logspace(0, np.log(len(arr))/np.log(10), 10).round(0).astype(int)    
        np.random.seed(42)
        samples = [sample for size in subsample_sizes for sample in np.random.choice(arr, size=(samples_per_size, size))]
        
        t1 = datetime.now()
        intents = [fnc(sample) for sample in samples]
        t2 = datetime.now()
        dt = (t2-t1).total_seconds()/len(samples) 
        times.append(dt)
    intent_time, extent_time = times
    
    return intent_time, extent_time

In [18]:
from datetime import datetime
def run_concepts(frame):
    """Benchmark the `concepts` package on one context.

    Parameters
    ----------
    frame : pandas.DataFrame
        Context table; its index, columns and values are handed to
        `concepts.Context`.

    Returns
    -------
    dict
        Seconds spent on the lattice construction plus the average intent and
        extent times reported by `test_intent_extent_time`.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; datetime.now() is wall-clock time and may jump.
    from time import perf_counter

    ctx = concepts.Context(frame.index, frame.columns, frame.values)

    intent_time, extent_time = test_intent_extent_time(frame.index, frame.columns, ctx.extension, ctx.intension)

    t1 = perf_counter()
    # NOTE(review): the timing assumes the lattice is built when the
    # `lattice` attribute is first accessed -- confirm against the concepts docs.
    _ = ctx.lattice
    dt = perf_counter() - t1

    stat = {
        'lattice_construction_time (secs)': dt,
        'intent_time (secs)': intent_time,
        'extent_time (secs)': extent_time,
    }
    return stat

def run_fcapy(frame):
    """Benchmark the `fcapy` package on one context.

    Parameters
    ----------
    frame : pandas.DataFrame
        Context table, converted with `FormalContext.from_pandas`.

    Returns
    -------
    dict
        Seconds spent on the lattice construction (algorithm 'CbO') plus the
        average intent and extent times reported by `test_intent_extent_time`.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; datetime.now() is wall-clock time and may jump.
    from time import perf_counter

    # Switch fcapy's module-level 'numpy' flag off; it stays off after this call.
    # NOTE(review): presumably selects fcapy's non-numpy code path -- confirm.
    LIB_INSTALLED['numpy'] = False
    ctx = FormalContext.from_pandas(frame)

    intent_time, extent_time = test_intent_extent_time(frame.index, frame.columns, ctx.extension, ctx.intention)

    t1 = perf_counter()
    ConceptLattice.from_context(ctx, algo='CbO')
    dt = perf_counter() - t1

    stat = {
        'lattice_construction_time (secs)': dt,
        'intent_time (secs)': intent_time,
        'extent_time (secs)': extent_time,
    }
    return stat

def run_fcapsy(frame):
    """Benchmark the `fcapsy` package on one context.

    Parameters
    ----------
    frame : pandas.DataFrame
        Context table, converted with `fcapsy.Context.from_pandas`.

    Returns
    -------
    dict
        Seconds spent on `fcapsy.Lattice(ctx)` plus the average intent and
        extent times reported by `test_intent_extent_time`.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; datetime.now() is wall-clock time and may jump.
    from time import perf_counter

    ctx = fcapsy.Context.from_pandas(frame)

    def extent_of(attribute_sample):
        # Wrapping the sample into fcapsy's attribute set is part of the timed call.
        return ctx.down(ctx.Attributes(attribute_sample))

    def intent_of(object_sample):
        # Wrapping the sample into fcapsy's object set is part of the timed call.
        return ctx.up(ctx.Objects(object_sample))

    intent_time, extent_time = test_intent_extent_time(frame.index, frame.columns, extent_of, intent_of)

    t1 = perf_counter()
    fcapsy.Lattice(ctx)
    dt = perf_counter() - t1

    stat = {
        'lattice_construction_time (secs)': dt,
        'intent_time (secs)': intent_time,
        'extent_time (secs)': extent_time,
    }
    return stat

In [19]:
def get_context_stat(frame):
    """Describe a context table by its name, size and fill rate.

    `frame` is a DataFrame carrying a `name` attribute. The result holds the
    name, the number of rows (objects) and columns (attributes), the sum of
    all cells (connections) and that sum divided by the number of cells
    (density).
    """
    n_objects, n_attributes = frame.shape
    n_connections = frame.sum().sum()
    return {
        'ctx_name': frame.name,
        'n_objects': n_objects,
        'n_attributes': n_attributes,
        'n_connections': n_connections,
        'density': n_connections/(n_objects*n_attributes),
    }

In [20]:
from tqdm.notebook import tqdm
import seaborn as sns

Run the benchmarks

In [21]:
%%time
n_runs = 100

# Contexts left out of the benchmark:
#   'random_15_15_0.9' - the benchmark works too slowly on this context
#   'bob_ross'         - the big dataset is skipped for now
skipped_contexts = ('random_15_15_0.9', 'bob_ross')

# Libraries to compare, each paired with its benchmark function.
benchmarked_libs = [
    ('concepts', run_concepts),
    ('fcapy', run_fcapy),
    ('fcapsy', run_fcapsy)
]

stats_total = []
for ctx_name, frame in tqdm(frames.items(), total=len(frames), disable=False):
    ctx_stat = get_context_stat(frame)
    if ctx_stat['ctx_name'] in skipped_contexts:
        continue

    stats_per_ctx = []
    for lib_name, run_func in benchmarked_libs:
        for run_number in tqdm(range(n_runs), leave=False, desc=lib_name, disable=True):
            # One record per run: timings, then library and run number, then context statistics.
            stat = {**run_func(frame), 'library': lib_name, 'run_number': run_number, **ctx_stat}
            stats_per_ctx.append(stat)
    stats_total.extend(stats_per_ctx)

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))


CPU times: user 2min 9s, sys: 74.1 ms, total: 2min 9s
Wall time: 2min 9s


Benchmark results can be found in the file:
* _latice_construction_statistics.csv_

Context statistics (number of objects, attributes, etc.) are in the file:
* context_statistics.csv

In [22]:
# One row per benchmark run; `is_random` marks the randomly generated contexts.
stats_df = pd.DataFrame(stats_total).assign(
    is_random=lambda runs: [name.startswith('random') for name in runs['ctx_name']]
)
stats_df.to_csv('latice_construction_statistics.csv')
stats_df.head()

Unnamed: 0,lattice_construction_time (secs),intent_time (secs),extent_time (secs),library,run_number,ctx_name,n_objects,n_attributes,n_connections,density,is_random
0,0.000409,1.1e-05,4e-06,concepts,0,animal_movement,16,4,24,0.375,False
1,0.000364,4e-06,4e-06,concepts,1,animal_movement,16,4,24,0.375,False
2,0.000463,4e-06,5e-06,concepts,2,animal_movement,16,4,24,0.375,False
3,0.000408,4e-06,5e-06,concepts,3,animal_movement,16,4,24,0.375,False
4,0.000418,5e-06,4e-06,concepts,4,animal_movement,16,4,24,0.375,False


In [23]:
# Context-level columns repeat on every run of a context, so keep one row per context.
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
context_stat_df = (
    stats_df[['ctx_name', *context_stat_feats]]
    .drop_duplicates()
    .reset_index(drop=True)
)
context_stat_df.to_csv('context_statistics.csv')

We do not draw any plots inline in this notebook, so that GitHub can render it.

Benchmark plots can be found in the files:
* _lattice_construction_time_for_classic_contexts.png_
* _lattice_construction_time_for_random_contexts.png_
* _lattice_construction_time_all_data.png_
* _intent_extent_time_all_data.png_

In [24]:
# Opaque white figure background as RGBA; (1, 1, 1, 0) would be the transparent variant.
plt.rcParams.update({'figure.facecolor': (1, 1, 1, 1)})

In [25]:
y_feat = 'lattice_construction_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The previous
# `len(...)//width+1` added an empty row whenever the panels filled the grid.
n_rows = -(-len(context_stat_feats) // width)
# Only the classic (non-random) contexts are shown in this figure.
classic_stats_df = stats_df[~stats_df['is_random']]

plt.figure(figsize=(10,7))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='library', data=classic_stats_df)
    plt.title(x_feat)
    plt.xlabel(x_feat)
    # The y axis is linear in this figure, so the label no longer claims a log scale.
    plt.ylabel('time (secs)')
    plt.legend()
plt.tight_layout()
# Leave room at the top for the two-line figure title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for classic fca contexts)")
plt.savefig('lattice_construction_time_for_classic_contexts.png')
plt.close()

In [26]:
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
y_feat = 'lattice_construction_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The previous
# `len(...)//width+1` added an empty row whenever the panels filled the grid.
n_rows = -(-len(context_stat_feats) // width)
# Only the randomly generated contexts are shown in this figure.
random_stats_df = stats_df[stats_df['is_random']]

plt.figure(figsize=(10,10))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='library', data=random_stats_df)
    plt.title(x_feat)
    plt.xlabel(x_feat)
    # The y axis is linear in this figure, so the label no longer claims a log scale.
    plt.ylabel('time (secs)')
    plt.legend()
plt.tight_layout()
# Leave room at the top for the two-line figure title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for random contexts)")
plt.savefig('lattice_construction_time_for_random_contexts.png')
plt.close()

In [27]:
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
y_feat = 'lattice_construction_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The previous
# `len(...)//width+1` added an empty row whenever the panels filled the grid.
n_rows = -(-len(context_stat_feats) // width)

plt.figure(figsize=(10,7))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='library', data=stats_df)
    plt.title(x_feat)
    # Classic and random contexts share this figure, drawn on a log-scaled y axis.
    plt.yscale('log')
    plt.xlabel(x_feat)
    plt.ylabel('time (secs) (log scale)')
    plt.legend()
plt.tight_layout()
# Leave room at the top for the two-line figure title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for random and classic contexts)")
plt.savefig('lattice_construction_time_all_data.png')
plt.close()

In [28]:
# Row-wise total of the average intent time and the average extent time.
stats_df['intent+extent_time (secs)'] = stats_df[['intent_time (secs)', 'extent_time (secs)']].sum(axis=1)

In [29]:
context_stat_feats = ['n_objects', 'n_attributes', 'n_connections', 'density']
y_feat = 'intent+extent_time (secs)'
width = 2
# Ceiling division: exactly as many rows as the panels need. The previous
# `len(...)//width+1` added an empty row whenever the panels filled the grid.
n_rows = -(-len(context_stat_feats) // width)

plt.figure(figsize=(10,7))
for idx, x_feat in enumerate(context_stat_feats):
    plt.subplot(n_rows, width, idx+1)
    sns.lineplot(x=x_feat, y=y_feat, hue='library', data=stats_df)
    plt.title(x_feat)
    plt.xlabel(x_feat)
    plt.ylabel('time (secs)')
    plt.legend()
plt.tight_layout()
# Leave room at the top for the two-line figure title.
plt.subplots_adjust(top=0.9)
plt.suptitle(f"{y_feat} based on context statistics\n(for random and classic contexts)")
plt.savefig('intent_extent_time_all_data.png')
plt.close()