In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    median_absolute_error,
)
from catboost import CatBoostRegressor
import shap
import matplotlib.pyplot as plt
from scipy.stats import kendalltau


In [2]:
def keep_relavent_columns(df, column_names=None):
    """Restrict ``df`` to the requested columns.

    :param df: DataFrame to select from
    :param column_names: column label(s) passed to ``df[...]``; None keeps every column
    :return: ``df`` itself when ``column_names`` is None, otherwise ``df[column_names]``
    """
    return df if column_names is None else df[column_names]

def encode_one_hot(df):
    """One-hot encode the ``category`` / ``object`` columns of ``df``.

    Every column of dtype ``category`` or ``object`` is removed from the result.
    A column with fewer than 100 distinct values is first expanded into dummy
    columns appended on the right; a column with 100 or more distinct values is
    dropped without any replacement.

    The input frame is never modified.

    :param df: DataFrame to encode
    :return: the encoded DataFrame (``df`` itself when there is nothing to encode)
    """
    max_levels = 100
    encoded = df
    for col in list(df.select_dtypes(include=['category', 'object'])):
        if len(encoded[col].unique()) < max_levels:
            # The prefix is deliberately a one-element list: pandas formats it verbatim,
            # so the dummy columns are named "['<col>']_<value>". split_dataset looks
            # columns up by exactly that pattern, so the naming must not change.
            dummies = pd.get_dummies(encoded[col], prefix=[col])
            encoded = pd.concat([encoded, dummies], axis=1)
        # drop() without inplace returns a new frame, so the caller's df stays intact
        # even when the very first categorical column is a high-cardinality one.
        encoded = encoded.drop(col, axis=1)
    return encoded


def apply_to_numberic_selective(df):
    """Coerce ``df`` to numeric values when it contains at least one float/int column.

    When such a column exists, ``pd.to_numeric(..., errors='coerce')`` is applied to
    every column of the frame, so values that cannot be parsed become NaN. When no
    float/int column exists the frame is returned unchanged.

    :param df: DataFrame to convert
    :return: the converted DataFrame, or ``df`` itself when it has no float/int columns
    """
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns
    if len(numeric_columns) == 0:
        return df
    # The conversion covers the whole frame and is idempotent, so a single pass is
    # enough; the result has to be returned because callers rebind their frame to it.
    return df.apply(pd.to_numeric, errors='coerce')
        
def process_column_names_xgboost(df):
    """Replace the characters '[', ']' and '<' in column names with '_'.

    XGBoost has additional requirements on the characters allowed in column names,
    so this removes the offending ones. The labels are rewritten on ``df`` itself
    and the same frame is returned. Non-string labels are left as they are.

    :param df: DataFrame whose column labels should be sanitised
    :return: ``df`` with the sanitised column labels
    """
    # Imported locally so the function does not rely on an `re` import in another cell.
    import re

    forbidden = re.compile(r"\[|\]|<")
    df.columns = [
        forbidden.sub("_", col) if isinstance(col, str) else col
        for col in df.columns.values
    ]
    return df
'''
def preprocessing_without_drop(df, target, column_names=None,bad_columns=None, apply_onehot=True, using_xgboost=True):
    target_df=df[target]
    df.drop(target, inplace=True, axis=1)
    
    df = keep_relavent_columns(df,column_names)
    
    df = df.dropna()
    
    if apply_onehot:
        df = encode_one_hot(df)
        df = df.astype('float64')
        df = df.apply(pd.to_numeric, errors='coerce')
    else :
        df = apply_to_numberic_selective(df)
    if using_xgboost:
        df = process_column_names_xgboost(df)
    df_merged = pd.merge(df,target_df, how='inner', left_index=True, right_index=True)
    return df_merged
'''
def preprocessing(df, target, column_names=None,bad_columns=None, apply_onehot=True, using_xgboost=True):
    """Build a model-ready frame: cleaned feature columns joined back to the target.

    Steps, in order: separate the target column, optionally restrict the features to
    ``column_names``, drop ``bad_columns``, drop rows containing NaN, encode or coerce
    the features, optionally sanitise column names, then inner-join the target back
    on the index so only rows that survived the NaN filter remain.

    The caller's ``df`` is not modified.

    :param df: raw DataFrame containing the features and the target column
    :param target: label of the target column
    :param column_names: feature columns to keep; None keeps all of them
    :param bad_columns: feature columns to drop; None drops nothing
    :param apply_onehot: when True, one-hot encode the features and cast them to
        float64; when False, delegate to ``apply_to_numberic_selective``
    :param using_xgboost: when True, rewrite column names via
        ``process_column_names_xgboost``
    :return: DataFrame of processed features plus the target column
    """
    target_df = df[target]
    # drop() without inplace leaves the caller's frame untouched, so the same raw
    # frame can be preprocessed again (e.g. when a notebook cell is re-run).
    features = df.drop(target, axis=1)

    features = keep_relavent_columns(features, column_names)
    features = drop_bad_columns(features, bad_columns)
    features = features.dropna()

    if apply_onehot:
        features = encode_one_hot(features)
        # After this cast every column is float64, so no further numeric coercion is needed.
        features = features.astype('float64')
    else:
        features = apply_to_numberic_selective(features)
    if using_xgboost:
        features = process_column_names_xgboost(features)
    # Inner join on the index re-attaches the target only for the rows kept by dropna().
    return pd.merge(features, target_df, how='inner', left_index=True, right_index=True)
    




In [3]:


def label_feature_split(df, column):
    """Separate the label column from the remaining feature columns.

    :param df: DataFrame holding the features together with the label column
    :param column: label of the column to split off
    :return: ``(features, labels)`` where ``features`` is ``df`` without ``column``
        and ``labels`` is a flat numpy array of that column's values
    """
    features = df.drop(columns=[column])
    labels = df.loc[:, [column]].to_numpy().ravel()
    return features, labels


def metrics_regression(y_test, y_pred):
    """
    Computes a set of regression metrics for the given predictions
    :param y_test: the true labels of the test set
    :param y_pred: the predicted labels of the test set
    :return: tuple of (r2 score, median absolute error, mean squared error,
             sum of the predictions, sum of the true labels)
    """
    # Note the second entry is the *median* absolute error and the third is the
    # mean squared error. Both inputs must provide a .sum() method.
    return (
        r2_score(y_test, y_pred),
        median_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        y_pred.sum(),
        y_test.sum(),
    )

In [4]:
# def run_generic_models_regression(X_train, y_train, X_test, y_test, reporter_object=None):
    
#     CBC = CatBoostRegressor(silent=True,task_type="GPU")
#     models = [(CBC, "catboost regressor")]
    
#     for model, name in models:
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         shap_values = shap.TreeExplainer(model).shap_values(X_train)
#         shap.summary_plot(shap_values, X_train, plot_type="bar")
        
#         print(y_test[:10])
#         print(y_pred[:10])
#         r2, MAE, RSE, sum_preds, sum_actual = metrics_regression(y_test, y_pred)
        
#         print('r2, MAE, RSE, sum_preds, sum_actual')
#         print(r2, MAE, RSE, sum_preds, sum_actual)
        
#         plt.scatter(y_pred, y_test)
#         plt.show()
#         print("coeficient")
#         print(numpy.corrcoef(y_pred, y_test)[0, 1])
#         return (numpy.corrcoef(y_pred, y_test)[0, 1])
def analyse_generic_models_regression(X_train, y_train, X_test, y_test, reporter_object=None, task_type="GPU"):
    """Fit a CatBoost regressor and report its predictions next to the true values.

    :param X_train: training features
    :param y_train: training labels
    :param X_test: DataFrame of evaluation features; it is not modified
    :param y_test: evaluation labels
    :param reporter_object: unused; kept so existing call sites keep working
    :param task_type: passed to ``CatBoostRegressor(task_type=...)``; the default
        "GPU" matches the previous hard-coded value
    :return: ``(analysis_df, correlation)`` where ``analysis_df`` is a copy of
        ``X_test`` with added ``actual_result`` and ``predicted_result`` columns and
        ``correlation`` is the off-diagonal entry of ``numpy.corrcoef(y_pred, y_test)``
    """
    model = CatBoostRegressor(silent=True, task_type=task_type)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # assign() builds a new frame. Writing the columns into X_test directly would
    # leak labels and predictions into the caller's feature frame, which is
    # typically a slice of a larger DataFrame.
    analysis_df = X_test.assign(actual_result=y_test, predicted_result=y_pred)
    return analysis_df, numpy.corrcoef(y_pred, y_test)[0, 1]
'''
def analise_performance(X_train, y_train, X_test, y_test, reporter_object=None):
   
    CBC = CatBoostRegressor(silent=True,task_type="GPU")
    model = CBC
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    df = pd.DataFrame()
    df['prediction']=y_pred
    df['id'] = X_test.index.values   
    return df
'''
def split_dataset(df, dataset):
    """Partition ``df`` into train and test rows using a one-hot indicator column.

    The indicator column is named ``"['test_dataset']_" + dataset``. Rows where it
    equals 1 form the test set; every other row goes to the training set.

    :param df: DataFrame containing the indicator column
    :param dataset: suffix identifying which indicator column to use
    :return: ``(train_df, test_df)``
    """
    indicator = "['test_dataset']_" + dataset
    is_test = df[indicator] == 1
    return df[~is_test], df[is_test]

def drop_bad_columns(df, columns=None):
    """Remove the given columns from ``df``.

    :param df: DataFrame to drop columns from
    :param columns: column label(s) to remove; None means nothing is removed
    :return: ``df`` itself when ``columns`` is None, otherwise a new frame without them
    """
    if columns is None:
        return df
    return df.drop(columns=columns)

In [6]:
# Experiment driver: for each dataset section, load its CSV, preprocess it, then for
# each of 11 seed splits train a model and record the Pearson correlation and the
# Kendall tau between predicted and actual values.
# NOTE(review): the first assignment is immediately overridden by the second, so only
# section 0.1 is actually run.
datasets_sections=[0.5,1,1.5,2]
datasets_sections=[0.1]
for selected_section in datasets_sections:
    print('starting with section '+str(selected_section))
    # NOTE(review): this list is never appended to; the name is rebound to a float
    # inside the seed loop below.
    correlation_coeficient=[]
    # NOTE(review): `window` is not read anywhere in this cell.
    window=selected_section
    # `dataset` first holds the dataset name (str); it is rebound to the loaded
    # DataFrame a few lines below, after the path and seed prefix have been derived.
    dataset='friedman_10_noise_'+str(selected_section)+'_window_size_2'
    directory='toy_datasets/ran_datasets/'+dataset+'.csv'
    dataset_names=dataset+'_seed_'
    # Seeds numpy's global RNG only.
    # NOTE(review): the CatBoost model is created without an explicit seed — confirm
    # whether this call is enough to make the run reproducible.
    numpy.random.seed(42)
    dataset = pd.read_csv(directory)
    target='MAE'
    # The listed columns are removed from the features before modelling.
    processed_features_df = preprocessing(dataset, target, bad_columns=['MSE','r2','mean','actual_sum'],using_xgboost=False)
    
    klenditau_results=[]
    coeficient_results=[]
    # One train/validation split per seed index 0..10; split_dataset selects the rows
    # via the one-hot column "['test_dataset']_<dataset name>_seed_<i>".
    for i in range(11):
        train_df, validation_df = split_dataset(processed_features_df,dataset_names+str(i))
        X_train, y_train = label_feature_split(train_df,target)
        X_validation, y_validation = label_feature_split(validation_df, target)
        analyse_df, correlation_coeficient=analyse_generic_models_regression(X_train, y_train, X_validation, y_validation)

        # Build rank columns by sorting and numbering rows 0..n-1: first by predicted
        # value, then by actual value. The numbers are assigned positionally to the
        # sorted frame, so each row keeps both of its ranks.
        # NOTE(review): tied values receive distinct ranks in whatever order the sort
        # leaves them — confirm this is acceptable.
        analyse_df = analyse_df.sort_values(by=['predicted_result'])
        analyse_df['predicted_rankings']=analyse_df.reset_index().index.values
        analyse_df = analyse_df.sort_values(by=['actual_result'])
        analyse_df['actual_rankings']=analyse_df.reset_index().index.values
        coeficient_results.append(correlation_coeficient)
        # Element [0] of the kendalltau result is the tau statistic.
        klenditau_results.append(kendalltau(analyse_df['predicted_rankings'],analyse_df['actual_rankings'])[0])

    # Report one value per seed: correlations first, then the Kendall tau values.
    print('coeficients')
    for coeficient in coeficient_results:
        print(coeficient)
    print('klenditau output')
    for klenditau_output in klenditau_results:
        print(klenditau_output)
    print('section completed')


starting with section 0.1
coeficients
0.9709843620082973
0.9653300982187403
0.9494013704445602
0.8859975707197832
0.9679981056255282
0.9965392495457951
0.9980266797396772
0.9705593122740719
0.9992375254528125
0.9971159017544045
0.9889432583348321
klenditau output
0.9999999999999999
0.8666666666666666
0.7777777777777777
0.8666666666666666
0.8222222222222221
0.911111111111111
0.6444444444444444
0.6888888888888888
0.8666666666666666
0.911111111111111
0.8666666666666666
section completed
