In [None]:
import pandas as pd
import numpy as np
import operator, math
from scipy.stats import pearsonr, spearmanr, entropy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_regression, f_classif, mutual_info_classif, f_regression
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st

In [None]:
# Tiny positive offset added inside the logarithm of
# normal_mutual_information so that a perfect correlation
# (1 - r**2 == 0) does not evaluate log(0).
EPSILON = 1e-25

def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Thin wrapper around ``scipy.stats.pearsonr`` that keeps only the
    coefficient (first element of the result) and discards the p-value.
    """
    coefficient, _p_value = pearsonr(x, y)
    return coefficient



def spearman_correlation(x, y):
    """Return the Spearman rank correlation coefficient between ``x`` and ``y``.

    Thin wrapper around ``scipy.stats.spearmanr`` that keeps only the
    coefficient (first element of the result) and discards the p-value.
    """
    coefficient, _p_value = spearmanr(x, y)
    return coefficient



def normal_mutual_information(x, y):
    """Return ``-0.5 * ln(1 - r**2 + EPSILON)`` for the Pearson ``r`` of x and y.

    NOTE(review): this matches the closed-form mutual information of two
    jointly normal variables; confirm that assumption holds for the data.
    EPSILON keeps the logarithm finite when ``r`` is exactly +/-1.
    """
    r = pearson_correlation(x, y)
    log_argument = 1 - r ** 2 + EPSILON
    return -0.5 * math.log(log_argument)



def chi2_score(x, y):
    """Return the chi-squared statistic of the single feature ``x`` against ``y``.

    ``x`` is reshaped into a one-column matrix because
    ``sklearn.feature_selection.chi2`` expects a 2-D feature array. It
    returns one statistic per column, so the only entry is extracted.
    The p-values are discarded.
    """
    feature_column = np.array(x).reshape(-1, 1)
    statistics = chi2(feature_column, y)[0]
    return list(statistics)[0]


def gini(actual, pred):
    """Gini coefficient of ``actual`` when its rows are ordered by ``pred``.

    Rows are sorted by descending ``pred``; ties keep their original order.
    The result is the mean cumulative share of ``actual`` along that
    ordering, minus ``(n + 1) / 2`` and divided by ``n``.

    Parameters:
        actual: sequence of observed numeric values.
        pred: sequence of predicted values, same length as ``actual``.

    Returns:
        The (un-normalised) Gini coefficient as a float.

    Raises:
        AssertionError: if ``actual`` and ``pred`` differ in length.
    """
    assert (len(actual) == len(pred))
    n = len(actual)
    # Columns: actual value, predicted value, original position (tie-breaker).
    # The builtin ``float`` is used as dtype because the ``np.float`` alias
    # was removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: descending prediction first,
    # then ascending original position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n


def gini_normalized(actual, pred):
    """Return ``gini(actual, pred)`` scaled by the Gini of a perfect ordering.

    The denominator is the Gini obtained when ``actual`` is ordered by
    itself.
    """
    achieved = gini(actual, pred)
    perfect_ordering = gini(actual, actual)
    return achieved / perfect_ordering

In [None]:
# Columns excluded from the analysis before scoring the remaining features.
COLUMNS_TO_DROP = [
    'host', 'jobId', 'stageId', 'jobGroup', 'taskId', 'launchTime',
    'finishTime', 'executorId', 'taskLocality', 'speculative',
    'gettingResultTime', 'successful', 'phase', 'experimentId',
    'transformTime', 'numUpdatedBlockStatuses', 'diskBytesSpilled',
    'memoryBytesSpilled', 'recordsWritten', 'bytesWritten', 'features',
    'mse', 'mae', 'rmse', 'r2', 'algorithm', 'dataset', 'family',
    'platform', 'platformId', 'runId', 'scenarioId', 'splitter',
    'workflowId', 'silhouette', 'f1', 'weightedPrecision',
    'weightedRecall', 'accuracy', 'Unnamed: 0',
]

data = pd.read_csv("metrics.csv")
# errors must be spelled exactly 'ignore': any other value makes pandas raise
# KeyError as soon as one of the listed columns is absent from metrics.csv.
data.drop(COLUMNS_TO_DROP, inplace=True, axis=1, errors='ignore')

In [None]:
# Split off the target: y holds the task durations, while data keeps only the
# candidate feature columns that the ranking cells iterate over.
# NOTE: this cell mutates data in place, so it can be run only once per load.
y = list(data.duration)
data.drop(columns=["duration"], inplace=True)

In [None]:
# Score every feature column against the duration with
# normal_mutual_information and save the ranking as "name,score" lines.
res = [(c, normal_mutual_information(list(data[c]), y)) for c in data.columns]
# Highest score first. A NaN score (presumably from a constant column whose
# correlation is undefined) compares False with everything and would leave
# sorted() with an ill-defined order, so NaNs are ranked explicitly last.
res = sorted(
    res,
    key=lambda item: (not math.isnan(item[1]), item[1]),
    reverse=True,
)
with open("normal_mutual_info.csv", "w") as f:
    f.write("\n".join("{},{}".format(name, score) for name, score in res))

In [None]:
# Rank every feature column by the absolute value of its Pearson correlation
# with the duration and save the signed coefficients as "name,score" lines.
res = [(c, pearson_correlation(list(data[c]), y)) for c in data.columns]
# Strongest |r| first. A NaN coefficient (presumably from a constant column)
# compares False with everything and would leave sorted() with an
# ill-defined order, so NaNs are ranked explicitly last.
res = sorted(
    res,
    key=lambda item: (not math.isnan(item[1]), abs(item[1])),
    reverse=True,
)
with open("pearson.csv", "w") as f:
    f.write("\n".join("{},{}".format(name, score) for name, score in res))

In [None]:
# Rank every feature column by the absolute value of its Spearman rank
# correlation with the duration and save the signed coefficients as
# "name,score" lines.
res = [(c, spearman_correlation(list(data[c]), y)) for c in data.columns]
# Strongest |rho| first. A NaN coefficient (presumably from a constant
# column) compares False with everything and would leave sorted() with an
# ill-defined order, so NaNs are ranked explicitly last.
res = sorted(
    res,
    key=lambda item: (not math.isnan(item[1]), abs(item[1])),
    reverse=True,
)
with open("spearman.csv", "w") as f:
    f.write("\n".join("{},{}".format(name, score) for name, score in res))

In [None]:
# Score every feature column with scipy.stats.entropy against the duration
# and save the ranking (smallest value first) as "name,score" lines.
# NOTE(review): when called with two arguments, scipy's entropy is understood
# to normalise both inputs and return the relative entropy (KL divergence)
# of the column from the duration, not a plain entropy. Confirm this is the
# intended metric.
res = [(c, entropy(list(data[c]), y)) for c in data.columns]
# Ascending order. A NaN score compares False with everything and would
# leave sorted() with an ill-defined order, so NaNs are ranked explicitly
# last.
res = sorted(res, key=lambda item: (math.isnan(item[1]), item[1]))
with open("entropy.csv", "w") as f:
    f.write("\n".join("{},{}".format(name, score) for name, score in res))