# Simpler ML models
On teste ici d'autres modèles de machine learning que les réseaux de neurones.

On veut mesurer la performance de modèles plus simples, afin d'avoir un point de comparaison pour mieux juger celle des réseaux de neurones (NNs).

In [1]:
%%time

import os

import pylab
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sb_utils.read_data import get_trees

# Directory holding the normalized training samples (relative to the notebook).
path = 'data/second_samples/normalized/train/'

# `get_trees` is a project helper; the code below only relies on its result
# exposing `.values()` whose items support `len()`.
trees = get_trees(path)

# Total number of entries across all loaded trees.
total_len = sum(len(t) for t in trees.values())
print(f'Number of parents: {total_len:,}')

Number of parents: 27,873
Wall time: 1min 4s


In [2]:
from sklearn.model_selection import train_test_split

def tree_to_dataset(tree, number_of_childs=5):
    """
    Build one dataset entry per parent node of ``tree``.

    Every value of ``tree`` is treated as a parent node exposing ``features``
    (a mapping), ``value`` and ``children_nodes``. A parent whose number of
    children differs from ``number_of_childs`` (fewer *or* more) is skipped.

    Features are always read in the sorted order of the parent's feature
    names, for the parent and for each of its children.

    Returns a tuple of three arrays, one entry per kept parent:
      * parents: the parent's features followed by the parent's value;
      * childs:  one feature vector per child;
      * values:  one value per child.
    When no parent is kept, the three arrays are empty.
    """
    parent_rows, child_blocks, child_targets = [], [], []

    for node in tree.values():
        ordered_names = sorted(node.features.keys())

        # Only keep parents with exactly the expected number of children.
        if len(node.children_nodes) != number_of_childs:
            continue

        kids = node.children_nodes
        child_blocks.append(np.array(
            [np.array([kid.features[name] for name in ordered_names]) for kid in kids]
        ))
        child_targets.append(np.array([kid.value for kid in kids]))

        # The parent's own value is appended as one extra trailing column.
        parent_features = [node.features[name] for name in ordered_names]
        parent_rows.append(np.array(parent_features + [node.value]))

    return np.array(parent_rows), np.array(child_blocks), np.array(child_targets)

def build_dataset(trees):
    """
    Merge the per-tree datasets of every tree in ``trees`` into three arrays.

    Each value of ``trees`` is converted with ``tree_to_dataset``; trees that
    yield no valid parent (empty ``childs`` array) are left out. The remaining
    per-tree arrays are concatenated along the first axis.

    Returns ``(parents, childs, values)``.
    Note: ``np.concatenate`` raises if no tree contributed any row.
    """
    per_tree = (tree_to_dataset(tree) for tree in trees.values())

    # Keep only the trees that produced at least one (parent, children) entry.
    kept = [triple for triple in per_tree if triple[1].shape[0] != 0]

    parents = np.concatenate([triple[0] for triple in kept], axis=0)
    childs = np.concatenate([triple[1] for triple in kept], axis=0)
    values = np.concatenate([triple[2] for triple in kept], axis=0)

    return parents, childs, values

parents, childs, values = build_dataset(trees)
print(f'Number of rows: {childs.shape[0]: ,}')
print(f'Number of features (childs): ({childs.shape[1]}, {childs.shape[2]})')
print(f'Number of features (parents): {parents.shape[1]}\n')

# Fixed seed: without it the train/validation split -- and therefore every
# metric computed from it -- changes on each "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Pair each parent row with its block of children so that both are shuffled
# together (and together with their targets) by train_test_split.
X = list(zip(parents, childs))
X_train, X_test, y_train, y_test = train_test_split(
    X, values, test_size=0.2, random_state=SPLIT_SEED)

# Unzip the (parent, children) pairs back into separate lists.
childs_train, childs_test = [x[1] for x in X_train], [x[1] for x in X_test]
parents_train, parents_test = [x[0] for x in X_train], [x[0] for x in X_test]
print(f'Number of training examples: {len(childs_train): ,}')
print(f'Number of validation examples: {len(childs_test): ,}')

Number of rows:  27,371
Number of features (childs): (5, 12)
Number of features (parents): 13

Number of training examples:  21,896
Number of validation examples:  5,475


In [3]:
def dataset_to_rows(parents, childs):
    """
    Flatten (parent, children) entries into one row per child.

    Each output row is the parent's vector followed by one child's feature
    vector. Rows are ordered parent by parent, and child by child within a
    parent. The output size is derived from the first parent and the first
    children block, so both sequences must be non-empty.

    Returns a 2-D float array with ``len(parents) * childs[0].shape[0]`` rows
    and ``parents[0].shape[0] + childs[0].shape[1]`` columns.
    """
    children_per_parent, child_width = childs[0].shape[0], childs[0].shape[1]
    parent_width = parents[0].shape[0]
    rows = np.zeros((len(parents) * children_per_parent, parent_width + child_width))

    flattened = (
        np.concatenate((parent, child), axis=0)
        for parent, children in zip(parents, childs)
        for child in children
    )
    for position, row in enumerate(flattened):
        rows[position] = row

    return rows

# One row per child: parent vector followed by the child's features.
train, test = (
    dataset_to_rows(parents_train, childs_train),
    dataset_to_rows(parents_test, childs_test),
)

# Flatten the per-parent target blocks to 1-D so that targets line up with the
# per-child rows built above (same parent-then-child ordering).
y_train, y_test = y_train.reshape(-1), y_test.reshape(-1)

# Évaluation des modèles

In [6]:
def eval_sk_model(model, rows, values):
    """
    Return the mean absolute error of ``model`` and its standard deviation.

    ``model`` only needs a ``predict(rows)`` method; its predictions are
    compared element-wise with ``values``.

    Returns ``(mean, std)`` of the absolute differences.
    """
    predictions = model.predict(rows)
    abs_errors = np.abs(predictions - values)
    mean_abs_error = np.mean(abs_errors)
    abs_error_std = np.std(abs_errors)
    return mean_abs_error, abs_error_std

In [36]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on the flattened (parent + child) rows.
model = LinearRegression().fit(train, y_train)

# Mean absolute error (and its standard deviation) on the training rows.
mean, std = eval_sk_model(model=model, rows=train, values=y_train)
print(f'Train precision: {mean:.2e} ({std:.2e})')

# Same metric on the held-out rows.
mean, std = eval_sk_model(model=model, rows=test, values=y_test)
print(f'Test precision: {mean:.2e} ({std:.2e})')

Train precision: 2.28e-02 (4.10e-02)
Test precision: 2.32e-02 (4.18e-02)


In [37]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression with k = 5.
model = KNeighborsRegressor(n_neighbors=5).fit(train, y_train)

# Mean absolute error (and its standard deviation) on the training rows.
mean, std = eval_sk_model(model=model, rows=train, values=y_train)
print(f'Train precision: {mean:.2e} ({std:.2e})')

# Same metric on the held-out rows.
mean, std = eval_sk_model(model=model, rows=test, values=y_test)
print(f'Test precision: {mean:.2e} ({std:.2e})')

Train precision: 6.35e-02 (7.85e-02)
Test precision: 8.16e-02 (1.01e-01)


In [11]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree trained to minimise the absolute error.
# The criterion name 'mae' was deprecated in scikit-learn 1.0 and removed in
# 1.2; 'absolute_error' is the current name for the same criterion.
# random_state is pinned so that re-running the cell rebuilds the same tree.
model = DecisionTreeRegressor(criterion='absolute_error', random_state=42)
model.fit(train, y_train)

# No depth limit is set, so the tree can fit the training rows exactly; the
# training error is therefore not informative on its own -- compare with the
# test error below.
mean, std = eval_sk_model(model, train, y_train)
print(f'Train precision: {mean:.2e} ({std:.2e})')

mean, std = eval_sk_model(model, test, y_test)
print(f'Test precision: {mean:.2e} ({std:.2e})')

Train precision: 0.00e+00 (0.00e+00)
Test precision: 2.07e-02 (5.24e-02)


In [7]:
from sklearn.svm import SVR

# Support vector regression with scikit-learn's default hyper-parameters.
model = SVR().fit(train, y_train)

# Mean absolute error (and its standard deviation) on the training rows.
mean, std = eval_sk_model(model=model, rows=train, values=y_train)
print(f'Train precision: {mean:.2e} ({std:.2e})')

# Same metric on the held-out rows.
mean, std = eval_sk_model(model=model, rows=test, values=y_test)
print(f'Test precision: {mean:.2e} ({std:.2e})')

Train precision: 4.30e-02 (3.68e-02)
Test precision: 4.36e-02 (3.98e-02)


In [8]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with scikit-learn's default hyper-parameters.
model = BayesianRidge().fit(train, y_train)

# Mean absolute error (and its standard deviation) on the training rows.
mean, std = eval_sk_model(model=model, rows=train, values=y_train)
print(f'Train precision: {mean:.2e} ({std:.2e})')

# Same metric on the held-out rows.
mean, std = eval_sk_model(model=model, rows=test, values=y_test)
print(f'Test precision: {mean:.2e} ({std:.2e})')

Train precision: 2.29e-02 (4.10e-02)
Test precision: 2.26e-02 (4.17e-02)
