# Prédiction multi-noeud
On fait ici un modèle de prédiction multi-noeud.
On prend tous les noeuds fils en même temps et on les donne ensemble au modèle.

La prédiction est une régression. La classification one-hot a été testée auparavant, mais n'a pas donné de résultats probants.

In [1]:
%%time

import os

import pylab
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sb_utils.read_data import get_trees

# Load every training tree, then report the summed length of all trees
# (printed as the number of parents).
path = 'data/second_samples/normalized/train/'
trees = get_trees(path)
total_len = sum(map(len, trees.values()))
print(f'Number of parents: {total_len:,}')

Number of parents: 27,873
Wall time: 1min


In [2]:
from sklearn.model_selection import train_test_split

def tree_to_dataset(tree, number_of_childs=5):
    """
    Turn one tree into three aligned arrays, one entry per kept parent.

    A parent is kept only when it has exactly `number_of_childs` children;
    every other parent is skipped.

    Args:
        tree: mapping whose values are parent nodes exposing `features`
            (a mapping), `value` and `children_nodes`.
        number_of_childs: number of children a parent must have to be kept.

    Returns:
        A `(parents, childs, values)` tuple of numpy arrays. For each kept
        parent, `parents` holds its features (ordered by sorted feature name)
        followed by its value, `childs` holds one feature row per child, and
        `values` holds one value per child.
    """
    parent_rows, child_blocks, target_rows = [], [], []

    for node in tree.values():
        # Children are read with the parent's feature names, in sorted order.
        names = sorted(node.features.keys())

        if len(node.children_nodes) == number_of_childs:
            # One (feature row, value) pair per child, in child order.
            pairs = [
                (np.array([kid.features[name] for name in names]), kid.value)
                for kid in node.children_nodes
            ]
            child_blocks.append(np.array([feats for feats, _ in pairs]))
            target_rows.append(np.array([target for _, target in pairs]))

            own_features = [node.features[name] for name in names]
            parent_rows.append(np.array([*own_features, node.value]))

    return np.array(parent_rows), np.array(child_blocks), np.array(target_rows)

def build_dataset(trees):
    """
    Convert every tree and stack the results into dataset-wide arrays.

    Each tree goes through `tree_to_dataset`; trees that produce no row are
    left out before concatenation.

    Args:
        trees: mapping whose values are the trees to convert.

    Returns:
        A `(parents, childs, values)` tuple, each concatenated along axis 0
        over all trees that produced at least one row.
    """
    converted = (tree_to_dataset(tree) for tree in trees.values())
    # Index 1 is the childs array; an empty one means the tree had no valid parent.
    kept = [triple for triple in converted if triple[1].shape[0] != 0]

    parents = np.concatenate([triple[0] for triple in kept], axis=0)
    childs = np.concatenate([triple[1] for triple in kept], axis=0)
    values = np.concatenate([triple[2] for triple in kept], axis=0)

    return parents, childs, values

# Flatten all trees into aligned arrays and report their sizes.
parents, childs, values = build_dataset(trees)
print(f'Number of rows: {childs.shape[0]: ,}')
print(f'Number of features (childs): ({childs.shape[1]}, {childs.shape[2]})')
print(f'Number of features (parents): {parents.shape[1]}\n')

# Fixed seed so that the train/validation split is identical on every run.
SPLIT_SEED = 42

# Pair each parent row with its block of children so both follow the same split.
X = list(zip(parents, childs))
X_train, X_test, y_train, y_test = train_test_split(
    X, values, test_size=0.2, random_state=SPLIT_SEED)
childs_train, childs_test = [x[1] for x in X_train], [x[1] for x in X_test]
parents_train, parents_test = [x[0] for x in X_train], [x[0] for x in X_test]
print(f'Number of training examples: {len(childs_train): ,}')
print(f'Number of validation examples: {len(childs_test): ,}')

Number of rows:  27,371
Number of features (childs): (5, 12)
Number of features (parents): 13

Number of training examples:  21,896
Number of validation examples:  5,475


In [3]:
import torch

class Dataset(torch.utils.data.Dataset):
    """
    Torch dataset serving one flat feature vector per parent.

    Each item is the concatenation of the (randomly reordered) children
    features followed by the parent features, together with the children
    values reordered the same way.
    """

    def __init__(self, parents, childs, values):
        """
        Store the three inputs as tensors and derive the layout sizes.

        Args:
            parents: per-row parent features, converted to a float tensor.
            childs: per-row children features, converted to a float tensor;
                indexed as (row, child, feature).
            values: per-row children values, converted with `torch.tensor`.
        """
        self.parents = torch.FloatTensor(parents)
        self.childs = torch.FloatTensor(childs)
        self.values = torch.tensor(values)

        child_shape = self.childs.shape
        self.number_of_nodes = child_shape[1]
        # Flattened children features plus the parent features.
        self.n_features = child_shape[1] * child_shape[2] + self.parents.shape[1]

    def __len__(self):
        """Return the number of rows."""
        return self.childs.shape[0]

    def __getitem__(self, index):
        """Return `(features, values)` for row `index` with children in a fresh random order."""
        # The same permutation is applied to features and values to keep them aligned.
        order = torch.randperm(self.number_of_nodes)
        shuffled_childs = self.childs[index][order]
        shuffled_values = self.values[index][order]
        features = torch.cat([shuffled_childs.reshape(-1), self.parents[index]], dim=0)
        return features, shuffled_values

# One dataset per split; children are reshuffled on every item access.
train_dataset = Dataset(parents=parents_train, childs=childs_train, values=y_train)
test_dataset = Dataset(parents=parents_test, childs=childs_test, values=y_test)

In [15]:
def eval_dataset(model, criterion, dataloader):
    """
    Evaluate `model` on every batch of `dataloader` without tracking gradients.

    The model is switched to eval mode (and left in it). When CUDA is
    available, the model, the criterion and each batch are moved to the GPU.

    Args:
        model: torch module mapping a batch of rows to predictions.
        criterion: loss called as `criterion(output, target)`.
        dataloader: iterable of `(rows, values)` batches; it must yield at
            least one batch, otherwise the mean loss raises ZeroDivisionError.

    Returns:
        A tuple `(mean_loss, mean_abs_error, std_abs_error)`: the unweighted
        mean of the per-batch losses, then the mean and standard deviation of
        the element-wise absolute error between predictions and targets.
    """
    model.eval()

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    losses = []
    abs_errors = []

    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.no_grad():
        for rows, values in dataloader:
            if cuda:
                rows = rows.cuda()
                values = values.cuda()

            output = model(rows)
            target = values.float()
            losses.append(criterion(output, target).item())

            # One vectorised pass instead of one `.item()` call per element;
            # the subtraction is done in float64, like Python float arithmetic.
            diff = output.reshape(-1).double() - target.reshape(-1).double()
            abs_errors.extend(diff.abs().tolist())

    return sum(losses) / len(losses), np.mean(abs_errors), np.std(abs_errors)

def train(model, data_train, data_val,
          criterion, optimizer, epochs,
          print_each=1):
    """
    Train `model` for `epochs` epochs and report metrics periodically.

    Every `print_each` epochs the model is evaluated with `eval_dataset` on
    both loaders; the results are printed and recorded. When CUDA is
    available, the model, the criterion and each batch are moved to the GPU.

    Args:
        model: torch module to optimise.
        data_train: iterable of `(rows, values)` training batches.
        data_val: iterable of `(rows, values)` validation batches.
        criterion: loss called as `criterion(prediction, target)`.
        optimizer: torch optimizer bound to the model parameters.
        epochs: number of passes over `data_train`.
        print_each: evaluation/printing period, in epochs.

    Returns:
        `(train_loss, train_prec, val_loss, val_prec)`: four lists with one
        entry per evaluated epoch (mean loss and mean absolute error).
    """
    loss_hist_train, err_hist_train = [], []
    loss_hist_val, err_hist_val = [], []

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model, criterion = model.cuda(), criterion.cuda()

    for epoch in range(1, epochs + 1):
        # Evaluation leaves the model in eval mode, so re-enable training mode.
        model.train()
        for batch_rows, batch_targets in data_train:
            if use_gpu:
                batch_rows, batch_targets = batch_rows.cuda(), batch_targets.cuda()

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_rows), batch_targets.float())
            batch_loss.backward()
            optimizer.step()

        if epoch % print_each != 0:
            continue

        mean_loss_t, mean_err_t, std_err_t = eval_dataset(model, criterion, data_train)
        loss_hist_train.append(mean_loss_t)
        err_hist_train.append(mean_err_t)

        mean_loss_v, mean_err_v, std_err_v = eval_dataset(model, criterion, data_val)
        loss_hist_val.append(mean_loss_v)
        err_hist_val.append(mean_err_v)

        # 18 spaces followed by a tab separate the two columns of the second line.
        gap = ' ' * 18
        print(f'Epoch {epoch}')
        print(f'Train loss: {mean_loss_t:.5f} \t\t\t\t\tVal loss: {mean_loss_v:.5f}')
        print(f'Train precision: {mean_err_t:.2e} ({std_err_t:.2e})'
              f'{gap}\tVal precision: {mean_err_v:.2e} ({std_err_v:.2e})')

    return loss_hist_train, err_hist_train, loss_hist_val, err_hist_val

In [5]:
import torch.nn as nn

class MLP(nn.Module):
    """
    Residual multi-layer perceptron with a sigmoid output.

    The input is projected to `n_hidden` units, passed through `n_layers`
    residual bottleneck blocks (n_hidden -> n_hidden // 2 -> n_hidden, each
    linear layer followed by batch normalisation and SELU), then mapped to
    `output_size` values squashed into (0, 1).
    """

    def __init__(self, input_size, output_size, n_layers, n_hidden):
        """
        Build the network.

        Args:
            input_size: number of input features.
            output_size: number of predicted values.
            n_layers: number of residual blocks.
            n_hidden: width of the hidden representation.
        """
        super().__init__()
        bottleneck = n_hidden // 2

        self.project = nn.Sequential(nn.Linear(input_size, n_hidden), nn.SELU())

        blocks = []
        for _ in range(n_layers):
            blocks.append(nn.Sequential(
                nn.Linear(n_hidden, bottleneck),
                nn.BatchNorm1d(bottleneck),
                nn.SELU(),
                nn.Linear(bottleneck, n_hidden),
                nn.BatchNorm1d(n_hidden),
                nn.SELU(),
            ))
        self.resnet = nn.ModuleList(blocks)

        self.output = nn.Linear(n_hidden, output_size)

    def forward(self, x):
        """Return the sigmoid-activated prediction for the batch `x`."""
        hidden = self.project(x)
        for block in self.resnet:
            # Residual connection: each block refines the running representation.
            hidden = hidden + block(hidden)
        return torch.sigmoid(self.output(hidden))

In [36]:
import torch.optim as optim
from torch.utils.data import DataLoader

from torchsummary import summary

# Seed torch so that weight initialisation and batch shuffling are repeatable
# when the notebook is re-run from a fresh kernel.
TORCH_SEED = 0
torch.manual_seed(TORCH_SEED)

batch_size = 200
# Input/output widths are read from one sample of the training dataset.
input_size = train_dataset[0][0].shape[0]
output_size = train_dataset[0][1].shape[0]

data_loader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Shuffling only matters for training; keep the validation loader in a fixed order.
data_loader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

n_layers = 15
n_hidden = 800
model = MLP(input_size, output_size, n_layers, n_hidden)

lr = 1e-5
# The model ends with a sigmoid, so its outputs are valid inputs for BCELoss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

_ = summary(model, (input_size,))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 800]                 --
|    └─Linear: 2-1                       [-1, 800]                 59,200
|    └─SELU: 2-2                         [-1, 800]                 --
├─ModuleList: 1                          []                        --
|    └─Sequential: 2-3                   [-1, 800]                 --
|    |    └─Linear: 3-1                  [-1, 400]                 320,400
|    |    └─BatchNorm1d: 3-2             [-1, 400]                 800
|    |    └─SELU: 3-3                    [-1, 400]                 --
|    |    └─Linear: 3-4                  [-1, 800]                 320,800
|    |    └─BatchNorm1d: 3-5             [-1, 800]                 1,600
|    |    └─SELU: 3-6                    [-1, 800]                 --
|    └─Sequential: 2-4                   [-1, 800]                 --
|    |    └─Linear: 3-7                  [-1, 400]                 

In [37]:
%%time
# Run the full training; metrics are evaluated and printed every `print_each` epochs.
epochs = 100
print_each = 10
perfs = train(
    model=model,
    data_train=data_loader_train,
    data_val=data_loader_test,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
    print_each=print_each,
)

Epoch 10
Train loss: 0.34824 					Val loss: 0.34833
Train precision: 4.52e-02 (5.36e-02)                  	Val precision: 4.52e-02 (5.24e-02)
Epoch 20
Train loss: 0.34444 					Val loss: 0.34515
Train precision: 3.85e-02 (4.97e-02)                  	Val precision: 3.81e-02 (4.85e-02)
Epoch 30
Train loss: 0.34174 					Val loss: 0.34215
Train precision: 3.38e-02 (4.67e-02)                  	Val precision: 3.35e-02 (4.55e-02)
Epoch 40
Train loss: 0.33987 					Val loss: 0.34044
Train precision: 3.12e-02 (4.40e-02)                  	Val precision: 3.11e-02 (4.27e-02)
Epoch 50
Train loss: 0.33879 					Val loss: 0.33918
Train precision: 2.88e-02 (4.31e-02)                  	Val precision: 2.89e-02 (4.26e-02)
Epoch 60
Train loss: 0.33828 					Val loss: 0.33940
Train precision: 2.80e-02 (4.16e-02)                  	Val precision: 2.80e-02 (4.16e-02)
Epoch 70
Train loss: 0.33845 					Val loss: 0.33811
Train precision: 2.87e-02 (4.20e-02)                  	Val precision: 2.88e-02 (4.16e-02)
Epoch 