In [36]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.backend as K

from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

from tensorflow.keras import models
from tensorflow.keras import layers


def model_create(shape, loss, metrics, X_shape):
    model = models.Sequential()
    model.add(layers.Dense(shape[0][0], activation=shape[0][1], input_shape=X_shape))

    for layer in shape[1:]:
        model.add(layers.Dense(layer[0], activation=layer[1]))

    model.compile(optimizer='sgd', loss=loss, metrics=metrics)

    return model


def custom_loss_1(d_matrix):
    def loss(y_true, y_pred):

        dist = tf.linalg.diag_part(K.equal(y_true, d_matrix[:,-1]))
        distances= tf.boolean_mask(d_matrix[:,:-1], dist)

        distances = K.cast(distances, dtype='float32')
        distances = K.constant(d_matrix[:,:-1], name='distance_matrix')

        errors_difference =K.abs(K.transpose(K.abs(y_true - y_pred)) - K.abs(y_true - y_pred))
        
        errors_by_distance = tf.math.divide(errors_difference, distances+K.constant(1), name='Division')

        top = K.mean(K.exp(-errors_by_distance), axis=1)

        mul = K.abs(y_true - y_pred) * top

        return K.mean(mul)

    return loss


def custom_loss_2(d_matrix):
    def loss(y_true, y_pred):
        return K.mean(K.square(y_pred - y_true) + K.square(), axis=-1)

    return loss


class Generalization(tf.keras.callbacks.Callback):

    def __init__(self, train, test, d_matrix):
        super(Generalization, self).__init__()
        self.test = test
        self.train = train
        self.dist = d_matrix

    def on_epoch_end(self, epoch, logs={}):
        logs['gen_score'] = float('-inf')

        X_train, y_train = self.train[0], self.train[1]
        X_test, y_test = self.test[0], self.test[1]

        y_train_pred = self.model.predict(X_train)
        y_test_pred = self.model.predict(X_test)

        k = len(y_train_pred) - 1

        NN = np.array([np.argsort(self.dist[:, i], axis=0)[:k] for i in range(self.dist.shape[1])])
        
        p_x = [np.mean([np.exp(
            -np.divide(np.abs(np.abs(y_test_pred[i] - y_test[i]) - np.abs(y_train_pred[j] - y_train[j])),
                       self.dist[j, i] + 1)) for j in NN[i]]) for i in range(len(y_test))]
        
        M = np.abs(y_test_pred - y_test) * p_x
        score = np.mean(M)
        
        logs['gen_score'] = np.round(score, 5)
        logs['p_score'] = np.round(p_x)


class GEN_NN_benchmark:
    def __init__(self, model_function, model_shape, loss_function, metrics, callback, filename):
        self.filename = filename
        self.model_shape = model_shape
        self.loss = loss_function
        self.metric = metrics
        self.model_function = model_function
        self.results = []
        self.callback = callback

    def build(self, X, y, partition_ratio, partition_seed):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=partition_ratio,
                                                                                random_state=partition_seed)

        self.X_train = StandardScaler().fit_transform(self.X_train)
        self.y_train = StandardScaler().fit_transform(self.y_train.reshape(-1, 1))

        self.batch_size = int(len(self.X_train))

        self.X_test = StandardScaler().fit_transform(self.X_test)
        self.y_test = StandardScaler().fit_transform(self.y_test.reshape(-1, 1))

        self.d_train = np.c_[distance_matrix(np.c_[self.X_train, self.y_train], np.c_[self.X_train, self.y_train]), self.y_train]
        
        self.d_test = distance_matrix(self.y_train, self.y_test)
        
        if not isinstance(self.loss, str):
            built_loss = self.loss(self.d_train)
        else:
            built_loss = self.loss


        self.model = self.model_function(self.model_shape, built_loss, self.metric,
                                         (self.X_train.shape[1],))

        self.call = self.callback(train=(self.X_train, self.y_train), test=(self.X_test, self.y_test),
                                      d_matrix=self.d_test)

    def benchmark(self, seeds, epochs, datasets, example=0):

        if example:
            print('a')

        else:
            for dataset in datasets:
                print(dataset)
                if dataset == 'RESID_BUILD_SALE_PRICE':
                    data = pd.read_csv('data\\' + dataset + '.txt', header=None, sep='     ', error_bad_lines=False)
                else:
                    data = pd.read_csv('data\\' + dataset + '.txt', header=None, sep='\t', error_bad_lines=False)

                X = data[data.columns[:-1]].values
                y = data[data.columns[-1]].values.reshape(-1, 1)
                
                X, y = make_regression(1000,20)

                for seed in seeds:
                    self.build(X, y, .33, seed)
                    history = self.model.fit(self.X_train, self.y_train,# validation_data=(self.X_test, self.y_test),
                                             epochs=epochs, batch_size=self.batch_size, verbose=0, callbacks=[self.call])
                    
                    train_pred = self.model.predict(self.X_train).flatten()
                    test_pred = self.model.predict(self.X_test).flatten()
                    test_p_x = history.history['p_score'][-1]
                    
                    self.results.append([seed, dataset, train_pred, test_pred, test_p_x])
                    
                    

                
            np.save(self.filename, self.results)
            return self.results, self.model, self.y_test, self.X_test
                
                


In [37]:
import time

seeds = [20,30,42,50]
epochs = 2
datasets = ['CONCRETE']


tik = time.time()
test = GEN_NN_benchmark(model_create, [[10, 'relu'] * 5, [1, 'linear']], custom_loss_1, ['mae'], Generalization, 'custom_1')
t, model, y_test, X_test = test.benchmark(seeds, epochs, datasets)

print(time.time() - tik)

import time

tik = time.time()
test = GEN_NN_benchmark(model_create, [[10, 'relu'] * 5, [1, 'linear']], 'mae', ['mae'], Generalization, 'mae')
t, model, y_test, X_test = test.benchmark(seeds, epochs, datasets)

print(time.time() - tik)

CONCRETE
21.051690101623535
CONCRETE
19.034443378448486


In [38]:
results_custom = np.load('custom_1.npy', allow_pickle='True')
results_control = np.load('mae.npy', allow_pickle='True')

In [50]:
results_custom[:,2:]

(4, 3)

In [39]:
print(np.mean(results_custom[:,-1]))

print(np.mean(results_control[:,-1]))

[1.   0.75 1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   1.
 1.   1.   0.75 1.   1.   1.   0.75 1.   1.   1.   1.   1.   1.   1.
 0.75 1.   1.   0.75 1.   1.   1.   0.75 1.   1.   1.   0.75 0.75 1.
 0.75 1.   1.   1.   1.   1.   0.75 0.75 1.   1.   1.   1.   1.   1.
 1.   0.75 1.   0.75 1.   0.75 1.   1.   1.   0.75 0.75 0.75 1.   1.
 1.   0.75 1.   1.   0.75 1.   1.   0.75 0.75 1.   0.75 0.75 1.   1.
 0.75 1.   0.75 0.75 1.   1.   1.   1.   1.   1.   1.   1.   1.   1.
 0.75 0.75 0.75 0.75 0.75 1.   1.   0.75 0.75 1.   1.   1.   1.   1.
 1.   1.   1.   1.   0.75 0.75 1.   1.   1.   1.   1.   0.75 1.   1.
 1.   0.75 1.   0.75 1.   0.75 1.   1.   1.   0.75 0.75 1.   1.   0.75
 1.   1.   1.   0.5  1.   1.   1.   1.   1.   1.   0.75 1.   0.5  1.
 1.   1.   1.   1.   1.   1.   1.   0.75 1.   0.5  1.   1.   1.   0.5
 1.   1.   1.   1.   0.75 1.   1.   1.   1.   1.   1.   0.75 1.   0.75
 1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   0.75 1.   1.   1.
 1.   1.   0.75 1.   0.75 1. 

In [21]:
custom = []
control = []

for i in range(4):
    custom.append(results_custom[i][2]['gen_score'][-1])
    control.append(results_control[i][2]['gen_score'][-1])

In [22]:
np.mean(custom)

0.74201

In [23]:
np.mean(control)

0.6735475