In [25]:
import pandas as pd 
import numpy as np
import os
import json
from hbic import Hbic
from hbic.utils import metrics 
from tqdm import tqdm
import time

In [28]:
def get_experiment_list(root):
    experiments = os.listdir(root)
    return experiments

def preprocess_ref_json(refs):
    keys = set(refs.keys())
    bic_keys = keys.intersection({"biclusters", "SymbolicBiclusters", "NumericBiclusters"})
    X, Y = [], []
    for key in bic_keys:
        for bic in refs[key]:
            X.append(refs[key][bic]["X"])
            Y.append(refs[key][bic]["Y"])
    X = [np.where(x)[0] for x in X]
    Y = [np.where(y)[0] for y in Y]
    return X, Y

def load_experiment(experiment):
    path = os.path.join(root, experiment)
    files = os.listdir(path)
    experiment_arrays = []
    experiment_biclusters = []
    for file in files:
        if file.endswith('.tsv'):
            experiment_arrays.append(pd.read_csv(os.path.join(path, file), sep = '\t'))
        if file.endswith('.json'):
            with open(os.path.join(path, file)) as f:
                experiment_biclusters.append(json.load(f))
    experiment_arrays = [(arr, preprocess_ref_json(ref)) for arr, ref in zip(experiment_arrays, experiment_biclusters)]
    return experiment_arrays

def score_biclusters(biclusters, ref_bic, reduction):
    dic = {}
    dic["experiment"] = directory
    dic["reduce"] = reduction
    dic["rel"] = metrics.prelic_relevance(biclusters, ref_bic)
    dic["nb_bics"] = len(biclusters)
    dic["rec"] = metrics.prelic_recovery(biclusters, ref_bic)
    dic["l_and_w"] = metrics.l_and_w(biclusters,ref_bic)
    dic["ayadi"] = metrics.ayadi(biclusters, ref_bic)
    return dic

root = "datasets/GBIC/"


def run_experiment(experiment):
    data = load_experiment(experiment)
    for nb_exp,exp in tqdm(enumerate(data), desc="Exp:"+str(experiment)):
        arr, ref = exp
        results = []
        for reduction in ["distance","pareto", None]:
            start = time.time()
            hbic = Hbic(reduction=reduction, min_cols=2, min_rows=2)
            biclusters = hbic.fit_predict(arr, var_type=[""])
            dic = score_biclusters(biclusters, ref)
            dic["experiment"] = experiment
            dic["instance"] = nb_exp
            dic["time"] = time.time()-start
            results.append(dic)   
    return results

if __name__ == "__main__":
    experiments = get_experiment_list(root)
    results = []
    for experiment in experiments:
        res = run_experiment(experiment)
        results.append(res)

Exp:EXP1_Heterogeneity_Level: 0it [00:00, ?it/s]


InvalidIndexError: (slice(None, None, None), 0)

In [27]:
time

<module 'time' (built-in)>