In [1]:
from os.path import join
import os
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
INPUT_FOLDER = './output'

In [3]:
# Return the number of non-leaves nodes in a tree
def get_nodes(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        count = 0
        while file.readline().find('------------------') < 0:
            pass

        while True:
            line = file.readline()
            if line.find('Number of Leaves') >= 0:
                break
            if line.find('=') >= 0 and line.find(':') < 0:
                count += 1
            
    return count

In [4]:
# return the depth of a tree
def get_depth(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        count = 0
        while file.readline().find('------------------') < 0:
            pass

        while True:
            line = file.readline()
            if line.find('Number of Leaves') >= 0:
                break
            if line.count('|') > count:
                count = line.count('|')
            
    return count + 1

In [5]:
# return the number of nodes in a tree
def get_tree_size(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        while True:
            line = file.readline()
            if line.find('Size of the tree') >= 0:
                break
    return float(line.split()[-1])

In [6]:
# return the number of leaves in a tree
def get_leaves(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        while True:
            line = file.readline()
            if line.find('Number of Leaves') >= 0:
                break
    return float(line.split()[-1])

In [7]:
# return the predictive precision of a tree
def get_precision(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        while file.readline().find('=== Stratified cross-validation ===') < 0:
            pass
        while True:
            line = file.readline()
            if line.find('Correctly Classified Instances') >= 0:
                break
    return float(line.split()[-2])

In [8]:
# Parse the command-line parameters from a file a return a dictionary

def command_line_to_config(line):
    config = {}
    config['reduced_error_pruning'] = '-R' in line
    config['unpruned'] = '-U' in line
    config['binary_splits'] = '-B' in line
    config['save_instance_data'] = '-L' in line
    config['subtree_raising'] = '-S' in line
    config['config_laplace'] = '-A' in line

    config['confidence_factor'] = float(line[line.index('-C') + 1]) if '-C' in line else None
    config['min_num_obj'] = int(line[line.index('-M') + 1]) if '-M' in line else None
    config['num_folds'] = int(line[line.index('-N') + 1]) if '-N' in line else None
    
    return config

def get_config(filename, folder=INPUT_FOLDER):
    with open(join(folder, filename), 'r') as file:
        while True:
            line = file.readline()
            if line.find('Options:') >= 0:
                break
        
        return command_line_to_config(line.split()[1:])

In [9]:
def merge_dicts(*dicts):
    l = []
    for d in dicts:
        l += list(d.items())
    return dict(l)

In [10]:
# Build a dataset out of the files, with the parameters, and various statistics
dataset = pd.DataFrame([merge_dicts(get_config(name),
                                    {'precision': get_precision(name)},
                                    {'leaves': get_leaves(name)},
                                    {'tree_size': get_tree_size(name)},
                                    {'nodes': get_nodes(name)},
                                    {'depth': get_depth(name)}) for name in os.listdir(INPUT_FOLDER)])
dataset.to_csv('datasets/output/dataset.csv', index=False)
dataset = dataset.sort_values(by=['precision', 'depth', 'tree_size', 'leaves'])
dataset.to_csv('datasets/output/sorted_dataset.csv', index=False)
dataset

Unnamed: 0,binary_splits,confidence_factor,config_laplace,depth,leaves,min_num_obj,nodes,num_folds,precision,reduced_error_pruning,save_instance_data,subtree_raising,tree_size,unpruned
83,True,,False,4,9.0,9,5,3.0,55.6098,True,True,True,17.0,False
106,True,,True,4,9.0,9,5,3.0,55.6098,True,False,True,17.0,False
166,True,,False,4,9.0,9,5,3.0,55.6098,True,False,True,17.0,False
1164,True,,True,4,9.0,9,5,3.0,55.6098,True,True,True,17.0,False
1200,True,,True,4,9.0,9,5,3.0,55.6098,True,True,False,17.0,False
1441,True,,False,4,9.0,9,5,3.0,55.6098,True,True,False,17.0,False
2301,True,,True,4,9.0,9,5,3.0,55.6098,True,False,False,17.0,False
2487,True,,False,4,9.0,9,5,3.0,55.6098,True,False,False,17.0,False
77,False,,True,3,31.0,7,3,3.0,56.0976,True,False,True,36.0,False
540,False,,True,3,31.0,7,3,3.0,56.0976,True,True,False,36.0,False


# Linear regression

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
def normalize_df(df):
    return (df - df.mean()) / (df.max() - df.min())

In [13]:
def run_regression(dataset, y_key, ascending=False, drop=[], result_column='coeffs'):
    cleaned_dataset = normalize_df(dataset
    .drop(drop, axis=1)
    .sort_values(by=y_key, ascending=ascending)
    ).fillna(0)
    
    x = cleaned_dataset.drop(y_key, axis=1)
    y = cleaned_dataset[y_key]
    if ascending:
        y = -y
    
    lm = LinearRegression()
    lm.fit(x, y)
    pd.options.display.float_format = '{:.4f}'.format
    df = pd.DataFrame(list(zip(x.columns, lm.coef_)), columns=['features', result_column])
    return df

In [14]:
def best_and_worst(dataset, y_key, ascending=False, drop=[]):
    sorted_dataset = (dataset
    .drop(drop, axis=1)
    .sort_values(by=y_key, ascending=ascending)
    )
    
    bests = sorted_dataset.iloc[:100]
    worsts = sorted_dataset.iloc[-100:]
    
    print('Bests:')
    display(bests.mean())
    print('Worsts:')
    display(worsts.mean())
    print('Mean:')
    display(sorted_dataset.mean())

In [15]:
output_values = {'tree_size', 'leaves', 'depth', 'nodes', 'precision'}
regression_precision = run_regression(dataset, y_key='precision',
                                      drop=output_values - {'precision'},
                                      result_column='coefficients for precision')
regression_tree_size = run_regression(dataset, y_key='tree_size', ascending=True,
                                      drop=output_values - {'tree_size'},
                                      result_column='coefficients for leaves')
regression_leaves = run_regression(dataset, y_key='leaves', ascending=True,
                                   drop=output_values - {'leaves'},
                                   result_column='coefficients for tree size')
regression_depth = run_regression(dataset, y_key='depth', ascending=True,
                                   drop=output_values - {'depth'},
                                   result_column='coefficients for depth')
regression_nodes = run_regression(dataset, y_key='nodes', ascending=True,
                                   drop=output_values - {'nodes'},
                                   result_column='coefficients for nodes')

In [16]:
pd.concat([
    regression_precision,
    regression_depth.drop('features', axis=1),
    regression_tree_size.drop('features', axis=1),
    regression_nodes.drop('features', axis=1),
    regression_leaves.drop('features', axis=1),
], axis=1)

Unnamed: 0,features,coefficients for precision,coefficients for depth,coefficients for leaves,coefficients for nodes,coefficients for tree size
0,binary_splits,-0.096,-0.1934,0.1691,-0.1524,0.2496
1,confidence_factor,0.0235,-0.0127,-0.0942,-0.0573,-0.0957
2,config_laplace,-0.001,0.0,-0.0,0.0,-0.0
3,min_num_obj,-0.3868,0.3093,0.3762,0.4293,0.3062
4,num_folds,0.1184,-0.046,0.0056,0.0457,-0.0113
5,reduced_error_pruning,-0.2609,0.068,0.1576,0.1447,0.1231
6,save_instance_data,-0.0,0.0,0.0,0.0,-0.0
7,subtree_raising,-0.0008,0.0,-0.0011,-0.0016,-0.0007
8,unpruned,0.0035,-0.004,-0.0388,-0.0227,-0.0396


# Best and worsts

In [17]:
best_and_worst(dataset, y_key='precision')

Bests:


binary_splits             0.0000
confidence_factor         0.6272
config_laplace            0.5000
depth                     7.0000
leaves                   79.5600
min_num_obj               1.4000
nodes                    14.5200
num_folds                    nan
precision                83.9609
reduced_error_pruning     0.0000
save_instance_data        0.5100
subtree_raising           0.4400
tree_size               105.4000
unpruned                  0.0800
dtype: float64

Worsts:


binary_splits            0.6000
confidence_factor           nan
config_laplace           0.4800
depth                    4.3200
leaves                  12.4000
min_num_obj              8.0400
nodes                    4.2800
num_folds                3.9200
precision               57.1122
reduced_error_pruning    1.0000
save_instance_data       0.5000
subtree_raising          0.5000
tree_size               19.8000
unpruned                 0.0000
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
depth                    6.7889
leaves                  28.3683
min_num_obj              5.0000
nodes                    9.0190
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

# Tree size

In [18]:
best_and_worst(dataset, y_key='tree_size', ascending=True)

Bests:


binary_splits            0.7100
confidence_factor           nan
config_laplace           0.5100
depth                    4.2400
leaves                   7.2800
min_num_obj              8.0200
nodes                    3.3500
num_folds                4.0900
precision               61.1073
reduced_error_pruning    1.0000
save_instance_data       0.5000
subtree_raising          0.5100
tree_size               13.5600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits             0.0000
confidence_factor         0.5587
config_laplace            0.5000
depth                     7.0000
leaves                   83.7500
min_num_obj               1.7000
nodes                    14.4200
num_folds                    nan
precision                80.6927
reduced_error_pruning     0.0000
save_instance_data        0.5000
subtree_raising           0.5200
tree_size               109.0400
unpruned                  0.0800
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
depth                    6.7889
leaves                  28.3683
min_num_obj              5.0000
nodes                    9.0190
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

In [19]:
best_and_worst(dataset, y_key='leaves', ascending=True)

Bests:


binary_splits            0.6800
confidence_factor           nan
config_laplace           0.5000
depth                    4.2700
leaves                   7.2800
min_num_obj              7.9600
nodes                    3.2400
num_folds                3.9200
precision               61.4195
reduced_error_pruning    1.0000
save_instance_data       0.5100
subtree_raising          0.4900
tree_size               13.5600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits             0.0000
confidence_factor         0.5341
config_laplace            0.5000
depth                     6.9200
leaves                   84.0800
min_num_obj               1.5600
nodes                    14.0800
num_folds                 2.0000
precision                80.4097
reduced_error_pruning     0.0800
save_instance_data        0.5000
subtree_raising           0.5300
tree_size               108.8800
unpruned                  0.0700
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
depth                    6.7889
leaves                  28.3683
min_num_obj              5.0000
nodes                    9.0190
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

In [20]:
best_and_worst(dataset, y_key='depth', ascending=True)

Bests:


binary_splits            0.0000
confidence_factor           nan
config_laplace           0.5600
depth                    2.7600
leaves                  23.8300
min_num_obj              6.4200
nodes                    2.8400
num_folds                6.5500
precision               64.8488
reduced_error_pruning    1.0000
save_instance_data       0.5000
subtree_raising          0.5000
tree_size               28.2600
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits            1.0000
confidence_factor        0.6710
config_laplace           0.4500
depth                   13.2000
leaves                  31.0900
min_num_obj              1.3300
nodes                   23.8000
num_folds                5.2615
precision               74.1317
reduced_error_pruning    0.6500
save_instance_data       0.5100
subtree_raising          0.4800
tree_size               61.1800
unpruned                 0.0400
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
depth                    6.7889
leaves                  28.3683
min_num_obj              5.0000
nodes                    9.0190
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64

In [21]:
best_and_worst(dataset, y_key='nodes', ascending=True)

Bests:


binary_splits            0.2000
confidence_factor           nan
config_laplace           0.4600
depth                    3.3300
leaves                  19.4700
min_num_obj              7.2000
nodes                    2.3600
num_folds                6.5800
precision               64.1024
reduced_error_pruning    1.0000
save_instance_data       0.4900
subtree_raising          0.5400
tree_size               23.9400
unpruned                 0.0000
dtype: float64

Worsts:


binary_splits            1.0000
confidence_factor        0.5000
config_laplace           0.5000
depth                   12.7200
leaves                  36.0400
min_num_obj              1.0000
nodes                   27.5200
num_folds                5.3333
precision               78.4390
reduced_error_pruning    0.2400
save_instance_data       0.5000
subtree_raising          0.4800
tree_size               71.0800
unpruned                 0.0400
dtype: float64

Mean:


binary_splits            0.5000
confidence_factor        0.5000
config_laplace           0.5000
depth                    6.7889
leaves                  28.3683
min_num_obj              5.0000
nodes                    9.0190
num_folds                5.5000
precision               69.5269
reduced_error_pruning    0.4571
save_instance_data       0.5000
subtree_raising          0.4857
tree_size               42.6635
unpruned                 0.0286
dtype: float64