In [26]:
from gbd_core.api import GBD
import pandas as pd

def eval(confs):
    """Score a set of configurations by its summed per-family best time.

    The families are the distinct ``family`` values returned by the GBD query
    below.  For each of them the smallest ``time`` among the rows of
    ``familyscores.csv`` whose ``configuration`` is contained in ``confs`` is
    looked up, and these minima are added together.

    NOTE(review): the name shadows Python's built-in ``eval``.  It is kept
    because other cells of this notebook call the function under this name.
    NOTE(review): every call re-opens the databases and re-reads the CSV;
    callers that invoke this in a tight loop pay that cost each time.

    Args:
        confs: collection of configuration labels, matched against the
            ``configuration`` column of ``familyscores.csv``.

    Returns:
        The sum of the per-family minimal ``time`` values.  A family with no
        matching row has a NaN minimum, which turns the whole sum into NaN.
    """
    with GBD([ '/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db' ]) as gbd:
        data = gbd.query('(track=main_2023 or track=main_2024) and minisat1m!=yes', resolve=["family"])

    # Only the family names are needed here; the per-family hash lists (and
    # the contents of top40.csv) were never used by the scoring itself.
    families = data['family'].unique()

    famscore = pd.read_csv('familyscores.csv')
    famscore_filtered = famscore[famscore['configuration'].isin(confs)]

    total = 0
    for family in families:
        fam = famscore_filtered[famscore_filtered['family'] == family]
        total += fam['time'].min()
    return total

    

In [21]:
import csv

# Per-instance runtimes: one row per (key, configuration) with a 'time' column.
df = pd.read_csv('top40.csv')
confs = df['configuration'].unique().tolist()

# Look up the family of every selected benchmark instance.
with GBD([ '/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db' ]) as gbd:
    data = gbd.query('(track=main_2023 or track=main_2024) and minisat1m!=yes', resolve=["family"])

# family name -> list of the hashes of its instances
family_hashes = {
    family: data[data['family'] == family]['hash'].tolist()
    for family in data['family'].unique()
}

# Emit one row per (family, configuration): the summed runtime of that
# configuration over all instances of the family.
with open('familyscores.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['time', 'family', 'configuration'])
    for family, hashes in family_hashes.items():
        # Rows of this family, selected once and reused for every configuration.
        family_rows = df[df['key'].isin(hashes)]
        for conf in confs:
            of_conf = family_rows['configuration'] == conf
            total_time = family_rows.loc[of_conf, 'time'].sum()
            writer.writerow([total_time, family, conf])




In [34]:
def decimate(configurations, threshold=10):
    """Keep the configurations whose removal worsens the score noticeably.

    The score of the full list is taken as the baseline.  Each configuration
    is then left out in turn and the remaining list is scored again; the
    configuration is kept when that score exceeds ``baseline + threshold``.

    ``eval`` here is whatever is bound to that name when the function runs;
    in this notebook that is the scoring function defined in an earlier cell,
    not Python's built-in.

    Args:
        configurations: list of configuration labels.
        threshold: minimum increase over the baseline score that a removal
            must cause for the configuration to be kept.  Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A list of ``(configuration, score_without_it)`` tuples, in the order
        of ``configurations``.
    """
    baseline = eval(configurations)
    cutoff = baseline + threshold
    kept = []
    for conf in configurations:
        score_without = eval([c for c in configurations if c != conf])
        if score_without > cutoff:
            kept.append((conf, score_without))
    return kept
    


In [None]:
# Start from every configuration that occurs in the runtime table.
df = pd.read_csv('top40.csv')
configurations = df['configuration'].unique().tolist()

# decimate() yields (configuration, score) pairs; only the labels are kept.
results = [conf for conf, _score in decimate(configurations)]

def beamsearch(beam, beamwidth, depth, log_path='beams.txt'):
    """Shrink configuration lists by beam search until they reach ``depth``.

    In every round each list in the beam is expanded into all lists obtained
    by dropping one of its entries, every such candidate is scored with
    ``eval`` (the notebook's scoring function, lower is better), and the
    ``beamwidth`` best distinct candidates form the next beam.  The loop ends
    once the first list in the beam has at most ``depth`` entries.

    The same reduced list is reachable from several parents (dropping A then
    B equals dropping B then A).  Candidates are therefore de-duplicated
    before the beam is cut, so identical lists no longer occupy several beam
    slots.

    Args:
        beam: list of configuration lists to start from.  Entries must be
            hashable (they are used as parts of a tuple key).
        beamwidth: maximum number of lists kept after each round.
        depth: target length; searching stops when the leading list is no
            longer than this.
        log_path: file to which the length and content of the best list are
            appended after every round.  Defaults to ``'beams.txt'``.

    Returns:
        The final beam, best-scoring list first.  An empty input beam is
        returned unchanged.
    """
    while beam and len(beam[0]) > depth:
        candidates = []
        for current in beam:
            print(current)
            for conf in current:
                reduced = [c for c in current if c != conf]
                candidates.append((reduced, eval(reduced)))
        # list.sort is stable, so equally scored candidates keep their
        # generation order.
        candidates.sort(key=lambda entry: entry[1])

        seen = set()
        unique = []
        for reduced, _score in candidates:
            key = tuple(reduced)
            if key not in seen:
                seen.add(key)
                unique.append(reduced)
        beam = unique[:beamwidth]

        if not beam:
            # Nothing left to expand or log (e.g. beamwidth of 0).
            break
        with open(log_path, 'a') as f:
            f.write(f"Length: {len(beam[0])}\n")
            f.write(f"Entry: {beam[0]}\n")
    return beam

# Run the search from the single decimated configuration list, keeping 10
# lists per round and stopping at 15 configurations.
beamwidth = 10
best_results = beamsearch(beam=[results], beamwidth=beamwidth, depth=15)

# Size of the final beam and the score of its leading entry.
print(len(best_results))
print(eval(best_results[0]))

# Number of configurations the search started from.
print(len(results))

['Default', "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 1, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 1, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 1, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'repha

In [None]:
import pandas as pd
import pickle
from utils import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_available_features():
    """Return the feature names of the 'base' context, minus three entries.

    The names are fetched with ``gbd.get_features('base')`` from the base
    database and ``base_features_runtime``, ``ccs`` and ``bytes`` are removed
    from the result.  ``remove`` raises ``ValueError`` if one of them is not
    present (assuming the result is a plain list — TODO confirm).

    Returns:
        The remaining feature names, in the order delivered by the database.
    """
    excluded = ("base_features_runtime", "ccs", "bytes")
    with GBD([ "/home/raphael-zipperer/Uni/BA/database/base.db" ]) as gbd:
        feat = gbd.get_features('base')
        for name in excluded:
            feat.remove(name)
        return feat

def get_prediction_dataset(features, target):
    """Query the selected instances with ``features`` and ``target`` resolved.

    Args:
        features: list of feature column names; these columns are converted
            with ``pd.to_numeric`` after the query.
        target: name of one additional column to resolve; it is left as
            returned by the query.

    Returns:
        The query result as a DataFrame with numeric feature columns.
    """
    databases = ['/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db']
    selection = '(track=main_2023 or track=main_2024) and minisat1m!=yes'
    with GBD(databases) as gbd:
        frame = gbd.query(selection, resolve=features + [target])
        numeric = frame[features].apply(pd.to_numeric)
        frame[features] = numeric
        return frame

def get_dataset_by_hashes(hashes_list):
    """Build a feature table for the given hash groups, labelled by group.

    Args:
        hashes_list: list of lists of instance hashes.  The position of a
            sub-list in ``hashes_list`` becomes the label of its instances.

    Returns:
        A DataFrame restricted to the instances whose ``hash`` occurs in any
        sub-list, with the feature columns from ``get_available_features()``
        converted by ``pd.to_numeric`` and an extra ``index`` column holding
        the position of the sub-list containing the hash.  If a hash occurs
        in several sub-lists the last one wins.
    """
    # Fetched before opening the databases below, so that no second
    # connection is opened while the first one is still in use.
    features = get_available_features()
    all_hashes = [h for group in hashes_list for h in group]

    with GBD(['/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db']) as gbd:
        df = gbd.query('(track=main_2023 or track=main_2024) and minisat1m!=yes', resolve = features)

    # Explicit copy: columns are assigned below, and doing that on a
    # boolean-filtered slice is the pattern pandas reports as
    # SettingWithCopyWarning.
    df = df[df['hash'].isin(all_hashes)].copy()
    df[features] = df[features].apply(pd.to_numeric)

    # -1 is a placeholder; every remaining row belongs to at least one group
    # and is overwritten in the loop.
    df['index'] = -1
    for i, group in enumerate(hashes_list):
        df.loc[df['hash'].isin(group), 'index'] = i
    return df


configs = 'Default', "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 1, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 1, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 1, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 2, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 
'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 0, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 1, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 1, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 1, 'rephase': 0, 'restart': 0, 
'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 0, 'warmup': 1}", "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 0, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 0, 'chrono': 0, 
'congruence': 1, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 1, 'sweep': 0, 'target': 1, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 1, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 1, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 
'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 1, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 1, 'transitive': 0, 'vivify': 0, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 1, 'sweep': 1, 'target': 2, 
'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 0"

df = pd.read_csv('top40.csv')

# For every instance (key) take the row of its fastest configuration, then
# keep only instances whose best time is at most 1800.0
# (presumably the solver timeout — TODO confirm).
min_time_config = df.loc[df.groupby('key')['time'].idxmin()]
min_time_config = min_time_config[min_time_config['time'] <= 1800.0]

feat = get_available_features()
data = get_prediction_dataset(feat, "family")

# Keys that passed the filter above.  A set gives constant-time membership
# tests instead of scanning the whole key column once per hash.
solved_keys = set(min_time_config['key'])

# family name -> hashes of its instances that passed the filter; families
# without any such instance are left out.
family_hashes = {}
for family in data['family'].unique():
    family_solved = [
        h for h in data[data['family'] == family]['hash'].tolist()
        if h in solved_keys
    ]
    if family_solved:
        family_hashes[family] = family_solved


    # Create a dictionary to store the best configuration for each family
best_configurations = {}

# Rows belonging to one of the configurations listed in `configs`; this does
# not depend on the family, so it is computed once.
# NOTE(review): in the exported source the last entry of `configs` appears to
# lack its closing brace; if that holds in the live notebook, that entry can
# never match a 'configuration' value — verify.
in_portfolio = df['configuration'].isin(configs)

for family, hashes in family_hashes.items():
    # Total runtime of each listed configuration over this family's instances.
    in_family = df['key'].isin(hashes)
    runtime_by_config = df[in_family & in_portfolio].groupby('configuration')['time'].sum()

    # The configuration with the smallest total becomes the family's best.
    best_configurations[family] = runtime_by_config.idxmin()

# Invert the mapping: configuration -> list of the families it is best for.
reversed_best_configurations = {}
for family, config in best_configurations.items():
    reversed_best_configurations.setdefault(config, []).append(family)

print(reversed_best_configurations)

# One selected configuration per line.
with open("top_30_conf.txt", "w") as file:
    for key in reversed_best_configurations:
        file.write(key)
        file.write("\n")

# One hash group per selected configuration: the instances of all families
# for which that configuration is the best one.  The position of a group in
# this list matches the iteration order of reversed_best_configurations.
hashes = [
    [h for fam in families for h in family_hashes[fam]]
    for families in reversed_best_configurations.values()
]

feat = get_available_features()
data = get_dataset_by_hashes(hashes)

# Hold out 20% of the instances for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data[feat], data['index'], test_size=0.2, random_state=42)

# The forest is seeded as well: without it the trained model, the reported
# accuracy and the pickled file differ on every run although the split is
# fixed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Persist the trained classifier for later use.
with open('random_forest_top30.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

{"{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 0, 'warmup': 1}": ['cellular-automata', 'random-modularity'], "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}": ['hardware-verification', 'tseitin-formulas', 'trigonometric-functions', 'minimal-disagreement-parity', 'independent-set'], "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'fa

In [None]:
import pandas as pd
import pickle
from utils import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_available_features():
    """Return the feature names of the 'base' context, minus three entries.

    The names are fetched with ``gbd.get_features('base')`` from the base
    database and ``base_features_runtime``, ``ccs`` and ``bytes`` are removed
    from the result.  ``remove`` raises ``ValueError`` if one of them is not
    present (assuming the result is a plain list — TODO confirm).

    NOTE(review): a function of the same name is also defined in an earlier
    cell of this notebook; consider keeping a single definition.

    Returns:
        The remaining feature names, in the order delivered by the database.
    """
    excluded = ("base_features_runtime", "ccs", "bytes")
    with GBD([ "/home/raphael-zipperer/Uni/BA/database/base.db" ]) as gbd:
        feat = gbd.get_features('base')
        for name in excluded:
            feat.remove(name)
        return feat

def get_prediction_dataset(features, target):
    """Query the selected instances with ``features`` and ``target`` resolved.

    NOTE(review): a function of the same name is also defined in an earlier
    cell of this notebook; consider keeping a single definition.

    Args:
        features: list of feature column names; these columns are converted
            with ``pd.to_numeric`` after the query.
        target: name of one additional column to resolve; it is left as
            returned by the query.

    Returns:
        The query result as a DataFrame with numeric feature columns.
    """
    databases = ['/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db']
    selection = '(track=main_2023 or track=main_2024) and minisat1m!=yes'
    with GBD(databases) as gbd:
        frame = gbd.query(selection, resolve=features + [target])
        numeric = frame[features].apply(pd.to_numeric)
        frame[features] = numeric
        return frame

def get_dataset_by_hashes(hashes_list):
    """Build a feature table whose rows are labelled by hash group.

    Args:
        hashes_list: sequence of groups, each an iterable of instance hashes.
            The position of a group in this sequence becomes the label of
            every instance whose hash is contained in it.

    Returns:
        A frame with the rows of the GBD query whose 'hash' occurs in any
        group. The feature columns are converted with ``pd.to_numeric`` and
        an 'index' column holds the position of the group containing the
        hash. If a hash occurs in several groups, the last group wins.
    """
    # Resolved first so that only one GBD handle is open at a time.
    features = get_available_features()
    all_hashes = [h for group in hashes_list for h in group]
    with GBD(['/home/raphael-zipperer/Uni/BA/database/base.db', '/home/raphael-zipperer/Uni/BA/database/meta.db']) as gbd:
        df = gbd.query('(track=main_2023 or track=main_2024) and minisat1m!=yes', resolve=features)
        # Explicit copy: the assignments below must write into an independent
        # frame, not into a filtered view (avoids SettingWithCopyWarning).
        df = df[df['hash'].isin(all_hashes)].copy()
        df[features] = df[features].apply(pd.to_numeric)
        # -1 is the placeholder label before the groups are applied.
        df['index'] = -1
        for i, group in enumerate(hashes_list):
            df.loc[df['hash'].isin(group), 'index'] = i
        return df


configs = ['Default', "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 1, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 1, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 1, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 2, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 
'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 0, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 1, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 1, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 1, 'rephase': 0, 'restart': 0, 
'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 0, 'warmup': 1}", "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 0, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 1, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 0, 'chrono': 0, 
'congruence': 1, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 0, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 1, 'sweep': 0, 'target': 1, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 0, 'bump': 0, 'chrono': 0, 'congruence': 1, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 1, 'sweep': 1, 'target': 2, 'transitive': 0, 'vivify': 0, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 1, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 0, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 1, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 
'probe': 1, 'randec': 0, 'reluctant': 1, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 1, 'substitute': 0, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 1, 'reluctant': 0, 'reorder': 2, 'rephase': 0, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 0, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 2, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 1, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 0, 'lucky': 1, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 0, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 0, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 1, 'phase': 0, 'phasesaving': 0, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 1, 'transitive': 1, 'vivify': 1, 'warmup': 0}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 1, 'extract': 1, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 0, 'reorder': 1, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 1, 
'transitive': 0, 'vivify': 0, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 0, 'preprocess': 1, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 1, 'substitute': 1, 'sweep': 1, 'target': 2, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 0, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 1, 'phase': 0, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 2, 'rephase': 1, 'restart': 1, 'stable': 0, 'substitute': 1, 'sweep': 1, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 1}", "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 0, 'fastel': 1, 'forward': 0, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 1, 'probe': 1, 'randec': 0, 'reluctant': 0, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 0, 'target': 0, 'transitive': 1, 'vivify': 1, 'warmup': 0}"]

df = pd.read_csv('top40.csv')

# For every instance (key) keep the row of the configuration with the
# smallest time ...
min_time_config = df.loc[df.groupby('key')['time'].idxmin()]
# ... and drop instances whose best time is above 1800.0
# (presumably the solver time limit in seconds — TODO confirm).
min_time_config = min_time_config[min_time_config['time'] <= 1800.0]

feat = get_available_features()
data = get_prediction_dataset(feat, "family")

# Built once so that each membership test below is a constant-time lookup
# instead of a linear scan over the key column.
kept_keys = set(min_time_config['key'])

# Map each family to the hashes of its instances that survived the filter
# above; families without any such instance are left out.
family_hashes = {}
for family in data['family'].unique():
    family_members = data[data['family'] == family]['hash'].tolist()
    family_kept = [h for h in family_members if h in kept_keys]
    if family_kept:
        family_hashes[family] = family_kept


    # Create a dictionary to store the best configuration for each family
best_configurations = {}
for family, hashes in family_hashes.items():
    # Rows belonging to this family's instances, limited to the candidate
    # configurations listed in `configs`.
    family_df = df[df['key'].isin(hashes) & df['configuration'].isin(configs)]

    # Total time per configuration over the family's instances.
    config_runtime_sum = family_df.groupby('configuration')['time'].sum()

    # The configuration with the smallest total is the family's best one.
    best_config = config_runtime_sum.idxmin()
    best_configurations[family] = best_config

# Invert the mapping: configuration -> list of families it is best for.
reversed_best_configurations = {}
for family, config in best_configurations.items():
    reversed_best_configurations.setdefault(config, []).append(family)

print(reversed_best_configurations)

# One selected configuration per line.
with open("top_32_conf.txt", "w") as file:
    for key in reversed_best_configurations:
        file.write(key + "\n")

# One hash group per selected configuration, in the iteration order of
# reversed_best_configurations; a group holds the hashes of all families
# for which that configuration is the best one.
hashes = []
for key, families in reversed_best_configurations.items():
    nhash = []
    for fam in families:
        nhash += family_hashes[fam]
    hashes.append(nhash)

feat = get_available_features()
# The 'index' column of the result is the position of the group in `hashes`.
data = get_dataset_by_hashes(hashes)

# Seeded 80/20 split. NOTE(review): the fit below uses the complete dataset,
# so these four variables are not consumed in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    data[feat], data['index'], test_size=0.2, random_state=42
)

# Seeded like the split above so that re-running the cell reproduces the
# same forest and therefore the same pickled model.
model = RandomForestClassifier(random_state=42)
model.fit(data[feat], data['index'])

# Persist the fitted classifier.
with open('random_forest_top32.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

{"{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 1, 'factor': 1, 'fastel': 1, 'forward': 1, 'lucky': 0, 'phase': 0, 'phasesaving': 0, 'preprocess': 0, 'probe': 0, 'randec': 0, 'reluctant': 1, 'reorder': 0, 'rephase': 1, 'restart': 1, 'stable': 2, 'substitute': 0, 'sweep': 0, 'target': 2, 'transitive': 1, 'vivify': 0, 'warmup': 1}": ['cellular-automata', 'random-modularity'], "{'backbone': 0, 'bump': 1, 'chrono': 1, 'congruence': 1, 'eliminate': 1, 'extract': 0, 'factor': 1, 'fastel': 0, 'forward': 1, 'lucky': 0, 'phase': 1, 'phasesaving': 1, 'preprocess': 0, 'probe': 1, 'randec': 1, 'reluctant': 1, 'reorder': 0, 'rephase': 0, 'restart': 1, 'stable': 0, 'substitute': 0, 'sweep': 1, 'target': 0, 'transitive': 0, 'vivify': 1, 'warmup': 1}": ['hardware-verification', 'tseitin-formulas', 'minimal-disagreement-parity'], "{'backbone': 2, 'bump': 1, 'chrono': 1, 'congruence': 0, 'eliminate': 0, 'extract': 0, 'factor': 1, 'fastel': 1, 'forward': 0, 'lucky': 

Runtime clustering  
Laufzeitverhalten auf Klasse (Heterogenität / mehrere gute Konfigurationen?) -> hierarchisch: erst Klasse, dann Konfiguration  
Minimum hitting set top cluster  
Stratified train test split (alle families sind included)  
Feature Records -> binärer Feature Record: 1 wenn beste Konfiguration, 0 wenn nicht -> Clustering  
Klassenbildung X Algorithm selection -> Generisch! (Interaktion wichtig!)  
Experimental Setup zwischen Approach und Evaluation  
Zeige: weniger Konfigurationen = besserer Classifier -> Hitting set  
REDO: 4 clusters (overwrite) (Es war 571)  
Was ist interessant/sinnvoll?  

Evaluation vorerst nicht auf Liskov, sondern auf Trainingsset  
Mit ccs und file size
SNAP/Isac solver clustering
Holdout set?
Accuracy -> Virtual Best Solver?