In [112]:
from os.path import join
import os
import numpy as np
import pandas as pd
%matplotlib inline

In [9]:
# Folder holding the result files parsed by this notebook. It is the default
# `folder` argument of the parsing helpers and the directory that is listed
# when the dataset is assembled.
INPUT_FOLDER = './output'

In [272]:
def get_tree_size(filename, folder=INPUT_FOLDER):
    """Return the tree size reported in a result file.

    Scans ``folder/filename`` for the first line containing
    ``'Size of the tree'`` and converts the last whitespace-separated token
    of that line to a float.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory containing the file.

    Returns:
        The tree size as a float.

    Raises:
        ValueError: if no line of the file contains the marker.
    """
    marker = 'Size of the tree'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterate over the file instead of looping on readline(): readline()
        # returns '' at end of file, so a `while True` loop never terminates
        # when the marker is missing.
        for line in file:
            if marker in line:
                return float(line.split()[-1])
    raise ValueError("'{}' not found in {}".format(marker, path))

In [269]:
def get_leaves(filename, folder=INPUT_FOLDER):
    """Return the number of leaves reported in a result file.

    Scans ``folder/filename`` for the first line containing
    ``'Number of Leaves'`` and converts the last whitespace-separated token
    of that line to a float.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory containing the file.

    Returns:
        The number of leaves as a float.

    Raises:
        ValueError: if no line of the file contains the marker.
    """
    marker = 'Number of Leaves'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterate over the file instead of looping on readline(): readline()
        # returns '' at end of file, so a `while True` loop never terminates
        # when the marker is missing.
        for line in file:
            if marker in line:
                return float(line.split()[-1])
    raise ValueError("'{}' not found in {}".format(marker, path))

In [33]:
def get_precision(filename, folder=INPUT_FOLDER):
    """Return the cross-validation accuracy figure from a result file.

    Skips everything up to and including the line containing
    ``'=== Stratified cross-validation ==='``, then finds the first later
    line containing ``'Correctly Classified Instances'`` and converts its
    second-to-last whitespace-separated token to a float.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory containing the file.

    Returns:
        The second-to-last token of the matching line as a float.

    Raises:
        ValueError: if the section header or the metric line is missing.
    """
    section_marker = '=== Stratified cross-validation ==='
    metric_marker = 'Correctly Classified Instances'
    path = join(folder, filename)

    in_section = False
    with open(path, 'r') as file:
        # Iterate over the file instead of looping on readline(): readline()
        # returns '' at end of file, so `while` loops waiting for a marker
        # never terminate when that marker is missing.
        for line in file:
            if not in_section:
                # The header line itself is consumed, not checked for the metric.
                in_section = section_marker in line
            elif metric_marker in line:
                return float(line.split()[-2])

    missing = metric_marker if in_section else section_marker
    raise ValueError("'{}' not found in {}".format(missing, path))

In [264]:
def command_line_to_config(line):
    """Translate command-line options into a configuration dict.

    Args:
        line: the options, either as a sequence of tokens
            (e.g. ``['-C', '0.25', '-M', '2']``) or as one
            whitespace-separated string (e.g. ``'-C 0.25 -M 2'``).
            A string is split into tokens first; otherwise the ``in`` checks
            would match substrings and the value lookups would read single
            characters.

    Returns:
        dict with one boolean per flag (``reduced_error_pruning``,
        ``unpruned``, ``binary_splits``, ``save_instance_data``,
        ``subtree_raising``, ``config_laplace``), True when the flag is
        present, plus ``confidence_factor`` (float), ``min_num_obj`` (int)
        and ``num_folds`` (int), each None when its option is absent.

    Raises:
        IndexError: if a valued option (-C, -M, -N) is the last token.
        ValueError: if the token after a valued option is not a number.
    """
    tokens = line.split() if isinstance(line, str) else list(line)

    def value_after(flag, convert):
        # The value of an option is the token immediately following its flag.
        if flag not in tokens:
            return None
        return convert(tokens[tokens.index(flag) + 1])

    config = {}
    config['reduced_error_pruning'] = '-R' in tokens
    config['unpruned'] = '-U' in tokens
    config['binary_splits'] = '-B' in tokens
    config['save_instance_data'] = '-L' in tokens
    # NOTE(review): if these are Weka J48 options, -S there means "do NOT
    # perform subtree raising", so this column may be inverted - confirm.
    config['subtree_raising'] = '-S' in tokens
    config['config_laplace'] = '-A' in tokens

    config['confidence_factor'] = value_after('-C', float)
    config['min_num_obj'] = value_after('-M', int)
    config['num_folds'] = value_after('-N', int)

    return config

def get_config(filename, folder=INPUT_FOLDER):
    """Return the configuration parsed from the options line of a result file.

    Scans ``folder/filename`` for the first line containing ``'Options:'``,
    splits it on whitespace, drops the first token and hands the remaining
    tokens to ``command_line_to_config``.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory containing the file.

    Returns:
        The dict produced by ``command_line_to_config``.

    Raises:
        ValueError: if no line of the file contains ``'Options:'``.
    """
    marker = 'Options:'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterate over the file instead of looping on readline(): readline()
        # returns '' at end of file, so a `while True` loop never terminates
        # when the marker is missing.
        for line in file:
            if marker in line:
                return command_line_to_config(line.split()[1:])
    raise ValueError("'{}' not found in {}".format(marker, path))

In [270]:
def merge_dicts(*dicts):
    """Combine any number of dicts into a single new dict.

    The dicts are applied left to right, so when a key occurs more than once
    the value from the right-most dict wins. The inputs are not modified.
    """
    merged = {}
    for mapping in dicts:
        merged.update(mapping.items())
    return merged

In [274]:
# Build one row per result file in INPUT_FOLDER: the parsed options plus the
# three metrics extracted from the same file.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# make the row order (and therefore dataset.csv) reproducible across runs.
dataset = pd.DataFrame([
    merge_dicts(
        get_config(name),
        {'precision': get_precision(name)},
        {'leaves': get_leaves(name)},
        {'tree_size': get_tree_size(name)}
    )
    for name in sorted(os.listdir(INPUT_FOLDER))
])
# The CSV is written before sorting by precision, so it keeps the file order.
dataset.to_csv('dataset.csv', index=False)
dataset = dataset.sort_values(by='precision')
dataset

Unnamed: 0,binary_splits,confidence_factor,config_laplace,leaves,min_num_obj,num_folds,precision,reduced_error_pruning,save_instance_data,subtree_raising,tree_size,unpruned
1608,True,,False,6.0000,6,4.0000,11.0000,True,False,False,11.0000,False
210,True,,False,6.0000,6,4.0000,11.0000,True,False,True,11.0000,False
207,True,,True,6.0000,9,8.0000,11.0000,True,False,False,11.0000,False
1549,True,,False,6.0000,9,8.0000,11.0000,True,False,True,11.0000,False
2424,True,,False,6.0000,9,8.0000,11.0000,True,True,False,11.0000,False
1124,True,,True,6.0000,9,8.0000,11.0000,True,True,True,11.0000,False
407,True,,False,6.0000,6,4.0000,11.0000,True,True,True,11.0000,False
2099,True,,True,6.0000,6,4.0000,11.0000,True,True,True,11.0000,False
517,True,,True,6.0000,6,4.0000,11.0000,True,False,False,11.0000,False
371,True,,False,6.0000,6,4.0000,11.0000,True,True,False,11.0000,False


# Linear regression

In [278]:
from sklearn.linear_model import LinearRegression

In [276]:
def normalize_df(df):
    """Mean-center every column and scale it by its range.

    Each column has its mean subtracted and is then divided by
    ``max - min`` of that column. A column whose values are all equal has a
    zero range, so its result is not a finite number.
    """
    centered = df - df.mean()
    column_range = df.max() - df.min()
    return centered / column_range

In [288]:
def run_regression(dataset, y_key, ascending=False, drop=[]):
    """Fit a linear regression of ``y_key`` on the other columns and show the coefficients.

    The columns listed in ``drop`` are removed, the rows are sorted by
    ``y_key``, every remaining column (the target included) is normalized
    with ``normalize_df`` and missing values are then replaced by 0.

    Args:
        dataset: DataFrame holding the features and the target column.
        y_key: name of the target column.
        ascending: sort direction passed to ``sort_values``.
        drop: column labels to remove before fitting.

    Side effects:
        Sets ``pd.options.display.float_format`` to four decimals for the
        rest of the session, prints a header and displays the coefficient
        table. Returns None.
    """
    ordered = dataset.drop(drop, axis=1).sort_values(by=y_key, ascending=ascending)
    prepared = normalize_df(ordered).fillna(0)

    target = prepared[y_key]
    features = prepared.drop(y_key, axis=1)

    model = LinearRegression()
    model.fit(features, target)

    # Global display option: later cells' output formatting depends on it too.
    pd.options.display.float_format = '{:.4f}'.format
    print('Linear regression coefficients')
    coefficient_rows = list(zip(features.columns, model.coef_))
    display(pd.DataFrame(coefficient_rows, columns=['features', 'coeffs']))

In [291]:
def best_and_worst(dataset, y_key, ascending=False, drop=[], n=100):
    """Show column means for the first ``n`` rows, the last ``n`` rows and all rows.

    The columns listed in ``drop`` are removed and the rows are sorted by
    ``y_key``; the first ``n`` rows of that ordering are reported as
    "Bests" and the last ``n`` as "Worsts".

    Args:
        dataset: DataFrame to summarize.
        y_key: column used to rank the rows.
        ascending: sort direction passed to ``sort_values``; choose it so
            that the preferred rows come first.
        drop: column labels to remove before summarizing.
        n: non-negative number of rows in each group (default 100). If the
            frame has fewer than ``2 * n`` rows the two groups overlap.

    Side effects:
        Prints three headers and displays three Series of column means.
        Returns None.
    """
    sorted_dataset = (dataset
    .drop(drop, axis=1)
    .sort_values(by=y_key, ascending=ascending)
    )

    # head/tail instead of iloc[:n] / iloc[-n:], because iloc[-0:] would
    # select the whole frame when n == 0.
    bests = sorted_dataset.head(n)
    worsts = sorted_dataset.tail(n)

    print('Bests:')
    display(bests.mean())
    print('Worsts:')
    display(worsts.mean())
    print('Mean:')
    display(sorted_dataset.mean())

# Precision

In [290]:
run_regression(dataset, y_key='precision', drop=['tree_size', 'leaves'])

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,-0.1691
1,confidence_factor,0.0942
2,config_laplace,-0.0
3,min_num_obj,-0.3762
4,num_folds,-0.0056
5,reduced_error_pruning,-0.1576
6,save_instance_data,-0.0
7,subtree_raising,0.0011
8,unpruned,0.0388


In [292]:
best_and_worst(dataset, y_key='precision', drop=['tree_size', 'leaves'])

Bests:


binary_splits             0.0000
confidence_factor         0.5538
config_laplace            0.5100
min_num_obj               1.6400
num_folds                    nan
precision               109.0400
reduced_error_pruning     0.0000
save_instance_data        0.5100
subtree_raising           0.5200
unpruned                  0.0900
dtype: float64

Worsts:


binary_splits            0.7100
confidence_factor           nan
config_laplace           0.4800
min_num_obj              7.6800
num_folds                4.1400
precision               13.5600
reduced_error_pruning    1.0000
save_instance_data       0.5000
subtree_raising          0.4900
unpruned                 0.0000
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
min_num_obj              5.0000
num_folds                5.5000
precision               42.6635
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
unpruned                 0.0286
dtype: float64

# Tree size

In [295]:
run_regression(dataset, y_key='tree_size', drop=['precision', 'leaves'], ascending=True)

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,-0.1691
1,confidence_factor,0.0942
2,config_laplace,0.0
3,min_num_obj,-0.3762
4,num_folds,-0.0056
5,reduced_error_pruning,-0.1576
6,save_instance_data,-0.0
7,subtree_raising,0.0011
8,unpruned,0.0388


In [296]:
best_and_worst(dataset, y_key='tree_size', drop=['precision', 'leaves'], ascending=True)

Bests:


binary_splits            0.7200
confidence_factor           nan
config_laplace           0.4800
min_num_obj              7.6600
num_folds                4.1800
reduced_error_pruning    1.0000
save_instance_data       0.5100
subtree_raising          0.5000
tree_size               13.5600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits             0.0000
confidence_factor         0.5505
config_laplace            0.5200
min_num_obj               1.6400
num_folds                    nan
reduced_error_pruning     0.0000
save_instance_data        0.4900
subtree_raising           0.5300
tree_size               109.0400
unpruned                  0.0900
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
min_num_obj              5.0000
num_folds                5.5000
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

# Leaves

In [297]:
run_regression(dataset, y_key='leaves', drop=['precision', 'tree_size'], ascending=True)

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,-0.2496
1,confidence_factor,0.0957
2,config_laplace,-0.0
3,min_num_obj,-0.3062
4,num_folds,0.0113
5,reduced_error_pruning,-0.1231
6,save_instance_data,-0.0
7,subtree_raising,0.0007
8,unpruned,0.0396


In [298]:
best_and_worst(dataset, y_key='leaves', drop=['precision', 'tree_size'], ascending=True)

Bests:


binary_splits           0.7200
confidence_factor          nan
config_laplace          0.4800
leaves                  7.2800
min_num_obj             7.6600
num_folds               4.1800
reduced_error_pruning   1.0000
save_instance_data      0.5100
subtree_raising         0.5000
unpruned                0.0000
dtype: float64

Worsts:


binary_splits            0.0000
confidence_factor        0.5369
config_laplace           0.5000
leaves                  84.0800
min_num_obj              1.5600
num_folds                2.0000
reduced_error_pruning    0.0800
save_instance_data       0.5100
subtree_raising          0.5200
unpruned                 0.0800
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
leaves                  28.3683
min_num_obj              5.0000
num_folds                5.5000
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
unpruned                 0.0286
dtype: float64