In [1]:
import re
import csv
import os
import glob

import numpy as np
import pandas as pd
from math import sqrt

from sklearn.model_selection import train_test_split
from scipy.stats import wasserstein_distance, ks_2samp
from sklearn.metrics import accuracy_score, balanced_accuracy_score

from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    median_absolute_error,
)

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

In [2]:
       
def normalize_data_exclude_target(df, target):
    """
    Standardises the float/int feature columns of a dataframe, leaving the target untouched
    :param df: dataframe holding both the features and the target column
    :param target: name of the target column, which is not normalised
    :return: a new dataframe with the normalised features followed by the original target
    """
    target_values = df[target]
    # drop returns a new frame, so the caller's dataframe is not modified
    features = df.drop(target, axis=1)
    numeric_columns = list(features.select_dtypes(include=['float', 'int']))
    for col in numeric_columns:
        centered = features[col] - features[col].mean()
        spread = features[col].std()
        if spread == 0 or pd.isna(spread):
            # A constant (or single-row) column has no spread to scale by; keep it
            # centred instead of dividing by zero and filling the column with NaN.
            features[col] = centered
        else:
            features[col] = centered / spread
    # Both pieces share the same index, so the target is re-attached by assignment.
    # An index-based merge would multiply rows whenever the index has duplicates.
    features[target] = target_values
    return features

    

def metrics_regression(y_test, y_pred):
    """
    Computes the regression scores for a set of predictions
    :param y_test: the true target values of the test set
    :param y_pred: the predicted target values of the test set
    :return: tuple of (r2 score, median absolute error, mean squared error,
             sum of the predictions, sum of the true values)
    """
    # note: the second entry is the *median* absolute error, the third the mean squared error
    scores = (
        r2_score(y_test, y_pred),
        median_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    )
    totals = (y_pred.sum(), y_test.sum())
    return scores + totals
    
def label_feature_split(df, column):
    """
    Separates a dataframe into its feature columns and a flat label array
    :param df: dataframe containing the features and the label column
    :param column: name of the label column
    :return: (dataframe without the label column, 1-d array of the label values)
    """
    features = df.drop(columns=[column])
    labels = df[[column]].values.ravel()
    return features, labels

def run_datasets(df_train, df_target, target, columns, apply_preprocessing=True, reporter_object=None):
    """
    Trains on one dataset, evaluates on another and stores the outcome in the reporter
    :param df_train: dataframe used for fitting, including the target column
    :param df_target: dataframe used for evaluation, including the target column
    :param target: name of the target column
    :param columns: not used by this function; kept so existing calls keep working
    :param apply_preprocessing: when True both dataframes are normalised (each with its own statistics)
    :param reporter_object: object that collects the results; required despite the None default
    :return: the reporter object filled in by run_generic_models_regression
    :raises ValueError: when no reporter_object is supplied
    """
    if reporter_object is None:
        # Fail early with a clear message instead of an AttributeError on None below.
        raise ValueError('run_datasets requires a reporter_object to store the results in')
    reporter_object.normalized = apply_preprocessing

    if apply_preprocessing:
        df_train = normalize_data_exclude_target(df_train, target)
        df_target = normalize_data_exclude_target(df_target, target)
    X_train, y_train = label_feature_split(df_train, target)
    X_test, y_test = label_feature_split(df_target, target)
    reporter_object.actual_sum = y_test.sum()

    return run_generic_models_regression(X_train, y_train, X_test, y_test, reporter_object)
    
def keep_columns_single(proxy_df):
    """
    Restricts a dataframe to the columns listed by stored_spesifics.original_columns()
    :param proxy_df: dataframe to select the columns from
    :return: the selection of proxy_df with only those columns
    """
    # NOTE(review): stored_spesifics is not defined in the visible part of this notebook;
    # confirm it exists before calling this helper.
    return proxy_df[stored_spesifics.original_columns()]

def split_into_bins(df, bins=12, column=None):
    """
    Summarises a set of values with describe(), using evenly spaced percentiles
    :param df: a dataframe (when column is given) or a sequence of values (when column is None)
    :param bins: number of equal-width percentile steps; percentiles 1/bins .. (bins-1)/bins are requested
    :param column: the dataframe column to summarise, or None to summarise df itself
    :return: count, mean, std and a list with the remaining describe() entries
             (minimum, the percentiles, maximum)
    """
    quantile_points = [step / bins for step in range(1, bins)]
    if column is None:
        # raw values (e.g. model predictions) are wrapped so describe() can be used on them
        values = pd.DataFrame({'predictions': df}).predictions
    else:
        values = df[column]

    summary = list(values.describe(percentiles=quantile_points))
    # describe() starts with count, mean and std; everything after that is returned as the bins
    count, mean, std = summary[:3]
    return count, mean, std, summary[3:]

def run_generic_models_regression(X_train, y_train, X_test, y_test, reporter_object):
    """
    Fits the selected regressors, scores their predictions and records them in the reporter
    :param X_train: features used for fitting
    :param y_train: target values used for fitting
    :param X_test: features to predict on
    :param y_test: true target values for X_test
    :param reporter_object: object whose add_model method receives one entry per model
    :return: the reporter_object with the model results added
    """
    # Settings taken from https://arxiv.org/abs/1708.05070, slightly adapted for regression and speed
    gradient_boosting = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    random_forest = RandomForestRegressor(n_estimators=500, max_features=0.25)
    support_vector = SVR(C=0.01, gamma=0.1, kernel="poly", degree=3, coef0=10.0)
    extra_trees = ExtraTreesRegressor(n_estimators=1000, max_features="log2")
    logistic = LogisticRegression(C=1.5, penalty="l1", fit_intercept=True)
    # Models outside the paper and outside sklearn
    xgboost_model = XGBRegressor()
    catboost_model = CatBoostRegressor(silent=True, task_type="GPU")
    lightgbm_model = lgb.LGBMRegressor()

    candidates = [
        (extra_trees, "Extra tree classifier"),
        (random_forest, "random forest classifier"),
        (gradient_boosting, "gradient boosted classifier"),
        (xgboost_model, "XGBoost"),
        (lightgbm_model, "Light GBM"),
    ]
    # The next assignment narrows the run to a single model; comment it out to run the full list above
    candidates = [(random_forest, "random forest regressor")]

    for estimator, label in candidates:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        # the sum of the true values is returned as well but is not stored per model
        r2, MAE, RSE, sum_preds, _ = metrics_regression(y_test, predictions)
        count, mean, std, persentile_outputs = split_into_bins(predictions)
        reporter_object.add_model(label, r2, MAE, RSE, sum_preds, count, mean, std, persentile_outputs)
    return reporter_object

def basic_drop_series(df1, df2):
    """
    Removes the missing values from two pandas objects
    :param df1: first series/dataframe
    :param df2: second series/dataframe
    :return: both inputs with their NaN entries dropped
    """
    cleaned_first = df1.dropna()
    cleaned_second = df2.dropna()
    return cleaned_first, cleaned_second

def same_length_lists(l1, l2):
    """
    Sorts two sequences and thins out the longer one so both end up with the same length
    :param l1: first sequence of values
    :param l2: second sequence of values
    :return: (values for l1, values for l2) as sorted lists of equal length, in the
             same order as the arguments
    """
    l1, l2 = sorted(l1), sorted(l2)
    if len(l1) == len(l2):
        return l1, l2
    if not l1 or not l2:
        # Nothing can be sampled to match an empty list; avoid dividing by zero below.
        return [], []

    first_is_longer = len(l1) > len(l2)
    longer, shorter = (l1, l2) if first_is_longer else (l2, l1)
    ratio = len(longer) / len(shorter)
    last_index = len(longer) - 1
    # Take one evenly spaced element of the longer list per element of the shorter one;
    # the index is clamped because rounding can land one past the end.
    sampled = [
        longer[min(int(round((j + 0.5) * ratio)), last_index)]
        for j in range(len(shorter))
    ]
    # Hand the results back in argument order (the shorter list used to always come first).
    if first_is_longer:
        return sampled, shorter
    return shorter, sampled

def remove_negative(series_1, series_2):
    """
    Shifts two sequences by a common offset so that every value is strictly positive
    :param series_1: first sequence of numbers
    :param series_2: second sequence of numbers
    :return: the inputs unchanged when all values are already positive, otherwise
             two lists shifted so that the overall minimum becomes 1
    """
    lowest = min(min(series_1), min(series_2))
    if lowest > 0:
        return series_1, series_2
    offset = abs(lowest) + 1
    shifted_1 = [value + offset for value in series_1]
    shifted_2 = [value + offset for value in series_2]
    return shifted_1, shifted_2

def KL_divergence_2(df1, df2):
    """
    Computes the Kullback-Leibler divergence D(df1 || df2)
    :param df1: sequence of non-negative weights for the first distribution
    :param df2: sequence of strictly positive weights for the second distribution,
                with the same length as df1
    :return: the divergence as a float (natural logarithm)
    """
    from scipy.stats import entropy
    # entropy(pk, qk) normalises both inputs to sum to one and returns sum(pk * log(pk / qk)).
    # sklearn's mutual_info_score, used here before, treats its inputs as cluster labels
    # and therefore does not measure the divergence between the two sets of values.
    return float(entropy(df1, df2))

def calculate_wasserstein_distance(series_1, series_2):
    """
    Wrapper around scipy's wasserstein_distance for two samples
    :param series_1: values of the first sample
    :param series_2: values of the second sample
    :return: the distance reported by scipy.stats.wasserstein_distance
    """
    distance = wasserstein_distance(series_1, series_2)
    return distance

def drop_nans(df):
    """
    Removes missing values from a pandas object
    :param df: series or dataframe
    :return: the result of df.dropna()
    """
    without_missing = df.dropna()
    return without_missing

def hellinger2(p, q):
    """
    Computes the Hellinger distance between two sequences of non-negative values
    :param p: first sequence
    :param q: second sequence; paired element-wise with p (extra elements of the longer one are ignored)
    :return: sqrt(sum((sqrt(p_i) - sqrt(q_i))^2)) / sqrt(2)
    """
    import math
    squared_gap = sum((math.sqrt(a) - math.sqrt(b)) ** 2 for a, b in zip(p, q))
    # The distance is the Euclidean norm of the square-root difference, so the
    # summed squares need a square root before scaling by 1/sqrt(2).
    return math.sqrt(squared_gap) / math.sqrt(2.)

def ks_test(series_1, series_2):
    """
    Wrapper around scipy's two-sample Kolmogorov-Smirnov test
    :param series_1: values of the first sample
    :param series_2: values of the second sample
    :return: the result of scipy.stats.ks_2samp (statistic first, p-value second)
    """
    outcome = ks_2samp(series_1, series_2)
    return outcome

def sorted_list(series_1, series_2):
    """
    Sorts two sequences in ascending order
    :param series_1: first sequence
    :param series_2: second sequence
    :return: two new sorted lists; the inputs are left as they are
    """
    return sorted(series_1), sorted(series_2)

def distance_metrics(df1, df2, column):
    """
    Compares the values of one column between two dataframes with four measures
    :param df1: first dataframe
    :param df2: second dataframe
    :param column: name of the column to compare
    :return: tuple of the results of KL_divergence_2, calculate_wasserstein_distance,
             hellinger2 and the second element (p-value) of ks_test
    """
    # preparation: drop NaNs, bring both to the same length, then shift to positive values
    cleaned = basic_drop_series(df1[column], df2[column])
    aligned = same_length_lists(*cleaned)
    series_1, series_2 = remove_negative(*aligned)

    return (
        KL_divergence_2(series_1, series_2),
        calculate_wasserstein_distance(series_1, series_2),
        hellinger2(series_1, series_2),
        ks_test(series_1, series_2)[1],
    )



class model_results(object):
    """Plain container for the scores and prediction summary of a single model."""

    def __init__(self, model_name, r2, MAE, MSE, predicted_sum, count, mean, std, bins):
        # identification and error scores
        self.model_name, self.r2 = model_name, r2
        self.MAE, self.MSE = MAE, MSE
        self.predicted_sum = predicted_sum
        # summary statistics of the predictions
        self.count, self.mean, self.std = count, mean, std
        self.bins = bins
        
class model_data(object):
    """Collects the results of one train/test dataset pair and writes them to csv."""

    def __init__(self, train_dataset, test_dataset):
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.normalized = None
        self.actual_sum = None
        self.models = []

        # One entry per feature: statistics of the training data ...
        self.std_train, self.mean_train = [], []
        self.count_train, self.bins_train = [], []
        # ... statistics of the test data ...
        self.std_test, self.mean_test = [], []
        self.count_test, self.bins_test = [], []
        # ... and the distance measures between the two
        self.kl_divergence, self.wasserstein_distance = [], []
        self.hellinger_distance, self.ks_test = [], []

    def add_model(self, model_name, r2, MAE, MSE, predicted_sum, count, mean, std, bins):
        """Stores the results of one model as a model_results entry."""
        entry = model_results(model_name, r2, MAE, MSE, predicted_sum, count, mean, std, bins)
        self.models.append(entry)

    def print_model_details(self):
        """Prints the dataset names, settings and one line of scores per stored model."""
        print('Train dataset: {} Test dataset: {}'.format(self.train_dataset, self.test_dataset))
        print('Data normalization is {}'.format(self.normalized))
        print('The target amount is {}'.format(self.actual_sum))
        print('The following is a list of the models used and results obtained')
        print('Format name'.ljust(25)+'\t r2 \t MSE \t\t MAE \t predicted sum')
        for model in self.models:
            padded_name = model.model_name.ljust(25)
            print('{}\t{}\t{}\t{}\t{}'.format(
                padded_name,
                round(model.r2, 4),
                round(model.MSE, 2),
                round(model.MAE, 2),
                round(model.predicted_sum, 2),
            ))

    def output_to_csv(self, filename, features):
        """
        Appends one row per stored model to a csv file
        :param filename: path of the csv file; the header is only written when the file does not exist yet
        :param features: feature names, in the same order as the per-feature lists were filled
        """
        rows = []
        for model in self.models:
            row = {
                'train_dataset': self.train_dataset,
                'test_dataset': self.test_dataset,
                'normalized': self.normalized,
                'actual_sum': self.actual_sum,
                'model_name': model.model_name,
                'r2': model.r2,
                'MSE': model.MSE,
                'MAE': model.MAE,
                'predicted_sum': model.predicted_sum,
                'entries_amount': model.count,
                'standard_deviation': model.std,
                'mean': model.mean,
            }
            for position, bin_value in enumerate(model.bins):
                row['model_bin_number_' + str(position)] = bin_value

            for index, feature in enumerate(features):
                row[feature + '_std_train'] = self.std_train[index]
                row[feature + '_mean_train'] = self.mean_train[index]
                row[feature + '_count_train'] = self.count_train[index]

                # the mean and count of the test data carry no '_test' suffix in the csv
                row[feature + '_std_test'] = self.std_test[index]
                row[feature + '_mean'] = self.mean_test[index]
                row[feature + '_count'] = self.count_test[index]

                row[feature + '_kl_divergence'] = self.kl_divergence[index]
                row[feature + '_wasserstein_distance'] = self.wasserstein_distance[index]
                row[feature + '_hellinger_distance'] = self.hellinger_distance[index]
                row[feature + '_ks_test'] = self.ks_test[index]

                for position, bin_value in enumerate(self.bins_train[index]):
                    row[feature + '_bin_number_train_' + str(position)] = bin_value
                for position, bin_value in enumerate(self.bins_test[index]):
                    row[feature + '_bin_number_test_' + str(position)] = bin_value
            rows.append(row)

        write_header = not os.path.exists(filename)
        pd.DataFrame(rows).to_csv(filename, mode='a', header=write_header)
def get_name(string):
    """
    Extracts the file name without directory and extension from a path
    :param string: path of a file, e.g. 'toy_datasets/run/part_1.csv'
    :return: the bare file name, e.g. 'part_1'
    """
    # os.path handles paths without a directory part, the platform's separators and
    # extensions of any length, where slicing after the last '/' would fail or cut wrongly.
    return os.path.splitext(os.path.basename(string))[0]

In [3]:

# Full sweep of noise levels; the second assignment restricts the run to a single level.
noise_levels = [0.005, 0.01, 0.02, 0.05, 0.1]
noise_levels = [0.1]

features = ['0', '1', '2', '3']
apply_processing_posibilities = [False]
output_directory = 'toy_datasets/ran_datasets/'

for noise_level in noise_levels:

    run_dataset = 'friedman_10_noise_'+str(noise_level)+'_window_size_2'
    directory = 'toy_datasets/'+run_dataset
    dataset_names = sorted(glob.glob(directory + '/*.csv'))
    # output_to_csv appends to this file, so re-running the cell adds the rows again
    output_file = output_directory + run_dataset + '.csv'

    # Every ordered pair of distinct datasets is used once as (train, target)
    for train_dataset_name in dataset_names:
        for target_dataset_name in dataset_names:
            if train_dataset_name == target_dataset_name:
                continue
            df_train = pd.read_csv(train_dataset_name)
            df_target = pd.read_csv(target_dataset_name)

            # One reporter per preprocessing option (instead of pre-allocating a fixed 1000)
            reporter_objects = []
            for apply_preprocessing in apply_processing_posibilities:
                reporter = model_data(get_name(train_dataset_name), get_name(target_dataset_name))

                for feature in features:
                    # summary statistics of the feature in the training data
                    count, mean, std, bins = split_into_bins(df_train, column=feature)
                    reporter.std_train.append(std)
                    reporter.mean_train.append(mean)
                    reporter.count_train.append(count)
                    reporter.bins_train.append(bins)

                    # summary statistics of the feature in the target data
                    count, mean, std, bins = split_into_bins(df_target, column=feature)
                    reporter.std_test.append(std)
                    reporter.mean_test.append(mean)
                    reporter.count_test.append(count)
                    reporter.bins_test.append(bins)

                    # distance measures between the two datasets for this feature
                    kl_divergence, wasserstein_distance_result, hellinger_distance, ks_test_result = distance_metrics(df_train, df_target, column=feature)
                    reporter.kl_divergence.append(kl_divergence)
                    reporter.wasserstein_distance.append(wasserstein_distance_result)
                    reporter.hellinger_distance.append(hellinger_distance)
                    reporter.ks_test.append(ks_test_result)

                reporter = run_datasets(df_train, df_target, target='target', columns=features,
                                        apply_preprocessing=apply_preprocessing, reporter_object=reporter)
                #reporter.print_model_details()
                reporter_objects.append(reporter)

            os.makedirs(output_directory, exist_ok=True)
            for reporter in reporter_objects:
                reporter.output_to_csv(output_file, features)
    print('Models ran')

Models ran
