In [1]:
from os.path import join
import os
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Default directory for the parsing helpers below; every entry returned by
# os.listdir(INPUT_FOLDER) is parsed into one row of the dataset.
INPUT_FOLDER = './output'

In [3]:
def get_tree_size(filename, folder=INPUT_FOLDER):
    """Return the tree size reported in a result file.

    Scans the file for the first line containing 'Size of the tree' and
    converts that line's last whitespace-separated token to a float.

    Args:
        filename: Name of the file inside `folder`.
        folder: Directory that contains the file.

    Returns:
        The reported tree size as a float.

    Raises:
        ValueError: If no line in the file contains the marker. (The previous
            readline() loop never terminated in that case, because readline()
            keeps returning '' once the end of the file is reached.)
    """
    marker = 'Size of the tree'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterating the file stops at EOF instead of spinning on empty reads.
        for line in file:
            if marker in line:
                return float(line.split()[-1])
    raise ValueError('%r not found in %s' % (marker, path))

In [4]:
def get_leaves(filename, folder=INPUT_FOLDER):
    """Return the number of leaves reported in a result file.

    Scans the file for the first line containing 'Number of Leaves' and
    converts that line's last whitespace-separated token to a float.

    Args:
        filename: Name of the file inside `folder`.
        folder: Directory that contains the file.

    Returns:
        The reported leaf count as a float.

    Raises:
        ValueError: If no line in the file contains the marker. (The previous
            readline() loop never terminated in that case, because readline()
            keeps returning '' once the end of the file is reached.)
    """
    marker = 'Number of Leaves'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterating the file stops at EOF instead of spinning on empty reads.
        for line in file:
            if marker in line:
                return float(line.split()[-1])
    raise ValueError('%r not found in %s' % (marker, path))

In [5]:
def get_precision(filename, folder=INPUT_FOLDER):
    """Return the cross-validation 'Correctly Classified Instances' figure.

    Skips everything up to and including the line that contains
    '=== Stratified cross-validation ===', then takes the first following
    line containing 'Correctly Classified Instances' and converts its
    second-to-last whitespace-separated token to a float.

    Args:
        filename: Name of the file inside `folder`.
        folder: Directory that contains the file.

    Returns:
        The parsed value as a float (presumably the percentage that precedes
        a trailing '%' token -- confirm against the file format).

    Raises:
        ValueError: If the section header or the marker line is missing. (The
            previous readline() loops never terminated in that case, because
            readline() keeps returning '' once the end of the file is reached.)
    """
    section = '=== Stratified cross-validation ==='
    marker = 'Correctly Classified Instances'
    path = join(folder, filename)
    with open(path, 'r') as file:
        in_section = False
        for line in file:
            if not in_section:
                # The header line itself is consumed, never tested for the marker.
                in_section = section in line
            elif marker in line:
                return float(line.split()[-2])
    raise ValueError('%r after %r not found in %s' % (marker, section, path))

In [6]:
def command_line_to_config(line):
    """Translate a tokenised option list into a configuration dict.

    Args:
        line: Sequence of option tokens (get_config passes the 'Options:'
            line split on whitespace, without its first token).

    Returns:
        A dict with one boolean per switch (True when the switch is present)
        and one numeric entry per valued option; a valued option that is
        absent maps to None.
    """
    def value_after(flag, cast):
        # A valued option stores its argument in the token right after the flag.
        if flag not in line:
            return None
        return cast(line[line.index(flag) + 1])

    # NOTE(review): if these are Weka J48 options, '-S' there means "do NOT
    # perform subtree raising" -- confirm that 'subtree_raising' carries the
    # intended sense before interpreting its coefficient.
    config = {
        'reduced_error_pruning': '-R' in line,
        'unpruned': '-U' in line,
        'binary_splits': '-B' in line,
        'save_instance_data': '-L' in line,
        'subtree_raising': '-S' in line,
        'config_laplace': '-A' in line,
    }

    config['confidence_factor'] = value_after('-C', float)
    config['min_num_obj'] = value_after('-M', int)
    config['num_folds'] = value_after('-N', int)

    return config

def get_config(filename, folder=INPUT_FOLDER):
    """Parse the option list recorded in a result file.

    Finds the first line containing 'Options:', splits it on whitespace,
    drops the first token and hands the remaining tokens to
    command_line_to_config.

    Args:
        filename: Name of the file inside `folder`.
        folder: Directory that contains the file.

    Returns:
        The configuration dict built by command_line_to_config.

    Raises:
        ValueError: If no line in the file contains 'Options:'. (The previous
            readline() loop never terminated in that case, because readline()
            keeps returning '' once the end of the file is reached.)
    """
    marker = 'Options:'
    path = join(folder, filename)
    with open(path, 'r') as file:
        # Iterating the file stops at EOF instead of spinning on empty reads.
        for line in file:
            if marker in line:
                return command_line_to_config(line.split()[1:])
    raise ValueError('%r not found in %s' % (marker, path))

In [7]:
def merge_dicts(*dicts):
    """Combine any number of dicts into a new dict.

    Keys are taken left to right; when a key occurs more than once, the value
    from the right-most dict wins. The inputs are not modified.
    """
    merged = {}
    for mapping in dicts:
        merged.update(mapping.items())
    return merged

In [8]:
# One record per entry of INPUT_FOLDER: the parsed options merged with the
# three values read from the same file.
records = [
    merge_dicts(
        get_config(name),
        {'precision': get_precision(name)},
        {'leaves': get_leaves(name)},
        {'tree_size': get_tree_size(name)}
    )
    for name in os.listdir(INPUT_FOLDER)
]
dataset = pd.DataFrame(records)
# The CSV is written before sorting, i.e. in directory-listing order.
dataset.to_csv('dataset.csv', index=False)
dataset = dataset.sort_values(by='precision')
dataset

Unnamed: 0,binary_splits,confidence_factor,config_laplace,leaves,min_num_obj,num_folds,precision,reduced_error_pruning,save_instance_data,subtree_raising,tree_size,unpruned
2301,True,,True,9.0,9,3.0,55.6098,True,False,False,17.0,False
1164,True,,True,9.0,9,3.0,55.6098,True,True,True,17.0,False
2487,True,,False,9.0,9,3.0,55.6098,True,False,False,17.0,False
83,True,,False,9.0,9,3.0,55.6098,True,True,True,17.0,False
166,True,,False,9.0,9,3.0,55.6098,True,False,True,17.0,False
1200,True,,True,9.0,9,3.0,55.6098,True,True,False,17.0,False
106,True,,True,9.0,9,3.0,55.6098,True,False,True,17.0,False
1441,True,,False,9.0,9,3.0,55.6098,True,True,False,17.0,False
77,False,,True,31.0,7,3.0,56.0976,True,False,True,36.0,False
540,False,,True,31.0,7,3.0,56.0976,True,True,False,36.0,False


# Linear regression

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
def normalize_df(df):
    """Mean-centre every column and scale it by its range (max - min).

    Note that a column whose max equals its min has a zero range, so the
    division for that column is a division by zero.
    """
    centered = df - df.mean()
    column_range = df.max() - df.min()
    return centered / column_range

In [11]:
def run_regression(dataset, y_key, ascending=False, drop=[]):
    """Fit a linear regression of one column on all others and show the coefficients.

    Args:
        dataset: DataFrame holding the target column and the candidate features.
        y_key: Name of the target column.
        ascending: Sort direction applied to `y_key` before normalisation.
            When True the normalised target is also negated before fitting.
        drop: Columns removed before anything else happens. The default list
            is only read here, never modified.

    Side effects:
        Prints a header, displays the feature/coefficient table, and sets
        pd.options.display.float_format for the rest of the session.
    """
    prepared = dataset.drop(drop, axis=1).sort_values(by=y_key, ascending=ascending)
    # NaNs are replaced after normalisation, so they land on 0.
    cleaned_dataset = normalize_df(prepared).fillna(0)

    y = cleaned_dataset[y_key]
    x = cleaned_dataset.drop(y_key, axis=1)
    if ascending:
        # Flip the target so a positive coefficient goes with a smaller y.
        y = -y

    lm = LinearRegression()
    lm.fit(x, y)

    pd.options.display.float_format = '{:.4f}'.format
    print('Linear regression coefficients')
    rows = [(feature, coeff) for feature, coeff in zip(x.columns, lm.coef_)]
    display(pd.DataFrame(rows, columns=['features', 'coeffs']))

In [12]:
def best_and_worst(dataset, y_key, ascending=False, drop=[]):
    """Show column means for the top 100 rows, the bottom 100 rows and all rows.

    Args:
        dataset: DataFrame to summarise.
        y_key: Column the rows are ranked by.
        ascending: Sort direction; the first 100 rows after sorting are the
            'Bests', the last 100 the 'Worsts'.
        drop: Columns removed before sorting. The default list is only read
            here, never modified.
    """
    sorted_dataset = dataset.drop(drop, axis=1).sort_values(by=y_key, ascending=ascending)

    sections = [
        ('Bests:', sorted_dataset.head(100)),
        ('Worsts:', sorted_dataset.tail(100)),
        ('Mean:', sorted_dataset),
    ]
    for title, rows in sections:
        print(title)
        display(rows.mean())

# Precision

In [13]:
# Regress precision on the configuration columns only: tree_size and leaves
# are dropped so they are not used as predictors.
run_regression(dataset, y_key='precision', drop=['tree_size', 'leaves'])

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,-0.096
1,confidence_factor,0.0235
2,config_laplace,-0.001
3,min_num_obj,-0.3868
4,num_folds,0.1184
5,reduced_error_pruning,-0.2609
6,save_instance_data,0.0
7,subtree_raising,-0.0008
8,unpruned,0.0035


In [14]:
# Default descending sort: 'Bests' are the 100 rows with the highest
# precision, 'Worsts' the 100 rows with the lowest.
best_and_worst(dataset, y_key='precision')

Bests:


binary_splits             0.0100
confidence_factor         0.6283
config_laplace            0.4800
leaves                   79.1800
min_num_obj               1.4000
num_folds                    nan
precision                83.9609
reduced_error_pruning     0.0000
save_instance_data        0.5000
subtree_raising           0.4300
tree_size               105.1700
unpruned                  0.0800
dtype: float64

Worsts:


binary_splits            0.6100
confidence_factor           nan
config_laplace           0.4900
leaves                  12.3400
min_num_obj              8.0200
num_folds                3.8900
precision               57.1122
reduced_error_pruning    1.0000
save_instance_data       0.4900
subtree_raising          0.5000
tree_size               19.7300
unpruned                 0.0000
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
leaves                  28.3683
min_num_obj              5.0000
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

# Tree size

In [15]:
# ascending=True makes run_regression negate the target, so a positive
# coefficient here goes with a smaller tree_size.
run_regression(dataset, y_key='tree_size', drop=['precision', 'leaves'], ascending=True)

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,0.1691
1,confidence_factor,-0.0942
2,config_laplace,-0.0
3,min_num_obj,0.3762
4,num_folds,0.0056
5,reduced_error_pruning,0.1576
6,save_instance_data,0.0
7,subtree_raising,-0.0011
8,unpruned,-0.0388


In [19]:
# Ascending sort: 'Bests' are the 100 rows with the smallest tree_size,
# 'Worsts' the 100 rows with the largest.
best_and_worst(dataset, y_key='tree_size', ascending=True)

Bests:


binary_splits            0.6900
confidence_factor           nan
config_laplace           0.5000
leaves                   7.2800
min_num_obj              7.6300
num_folds                4.3800
precision               61.7219
reduced_error_pruning    1.0000
save_instance_data       0.5000
subtree_raising          0.4800
tree_size               13.5600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits             0.0000
confidence_factor         0.5511
config_laplace            0.5100
leaves                   83.7500
min_num_obj               1.7000
num_folds                    nan
precision                80.6927
reduced_error_pruning     0.0000
save_instance_data        0.5000
subtree_raising           0.5200
tree_size               109.0400
unpruned                  0.0800
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
leaves                  28.3683
min_num_obj              5.0000
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

# Leaves

In [17]:
# ascending=True makes run_regression negate the target, so a positive
# coefficient here goes with fewer leaves.
run_regression(dataset, y_key='leaves', drop=['precision', 'tree_size'], ascending=True)

Linear regression coefficients


Unnamed: 0,features,coeffs
0,binary_splits,0.2496
1,confidence_factor,-0.0957
2,config_laplace,-0.0
3,min_num_obj,0.3062
4,num_folds,-0.0113
5,reduced_error_pruning,0.1231
6,save_instance_data,0.0
7,subtree_raising,-0.0007
8,unpruned,-0.0396


In [20]:
# Ascending sort: 'Bests' are the 100 rows with the fewest leaves,
# 'Worsts' the 100 rows with the most.
best_and_worst(dataset, y_key='leaves', ascending=True)

Bests:


binary_splits            0.7000
confidence_factor           nan
config_laplace           0.5100
leaves                   7.2800
min_num_obj              7.7600
num_folds                4.4200
precision               61.7024
reduced_error_pruning    1.0000
save_instance_data       0.4900
subtree_raising          0.4900
tree_size               13.5600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits             0.0000
confidence_factor         0.5393
config_laplace            0.4900
leaves                   84.0800
min_num_obj               1.5600
num_folds                 2.0000
precision                80.4097
reduced_error_pruning     0.0800
save_instance_data        0.5100
subtree_raising           0.5200
tree_size               108.8800
unpruned                  0.0800
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
leaves                  28.3683
min_num_obj              5.0000
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64