In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pandas.api.types import CategoricalDtype

In [4]:
# Load the per-run applicative metrics and the per-platform settings.
applicative = pd.read_csv('results/applicative.csv')
configuration = pd.read_csv('results/configuration.csv')

# Keep only the platforms whose id is NOT in the exclusion list below.
# NOTE(review): the `configuration` output displayed further down still lists
# platformId 0, so it may come from a run with a different list - confirm.
configuration = configuration.loc[
    ~configuration['platformId'].isin(
        [0, 13, 14, 17, 18, 5, 6, 1, 2, 10, 9, 3, 4, 19, 20]
    )
]

# Attach the configuration columns to every applicative row (join on platformId).
data = applicative.merge(configuration, on='platformId')

In [5]:
# Dtypes of the columns that identify an experiment (algorithm, dataset, run...).
dtype_mix = dict(
    algorithm=CategoricalDtype([
        'BisectingKMeans', 'GBT', 'GMM', 'KMeans', 'Linear',
        'Logistic', 'MLP', 'RFR', 'Tree',
    ]),
    dataset=CategoricalDtype(['drift', 'drivface', 'drugs', 'geomagnetic', 'higgs']),
    family=CategoricalDtype(['classification', 'clustering', 'regression']),
    platformId='int',
    runId='int',
    splitter=CategoricalDtype(['none', 'random-80-20']),
)

# Dtypes of the task-level platform metrics, grouped by target type, followed
# by the identifier columns of dtype_mix.
dtype_platform = {
    # Integer-valued columns.
    **dict.fromkeys(
        [
            'jobId', 'stageId', 'taskId',
            'launchTime', 'finishTime', 'duration', 'schedulerDelay',
            'executorId', 'gettingResultTime',
            'executorRunTime', 'executorCpuTime',
            'executorDeserializeTime', 'executorDeserializeCpuTime',
            'resultSerializationTime', 'jvmGCTime', 'resultSize',
            'numUpdatedBlockStatuses',
            'diskBytesSpilled', 'memoryBytesSpilled', 'peakExecutionMemory',
            'recordsRead', 'bytesRead', 'recordsWritten', 'bytesWritten',
            'shuffleFetchWaitTime', 'shuffleTotalBytesRead',
            'shuffleTotalBlocksFetched', 'shuffleLocalBlocksFetched',
            'shuffleRemoteBlocksFetched',
            'shuffleWriteTime', 'shuffleBytesWritten', 'shuffleRecordsWritten',
        ],
        'int',
    ),
    # Columns stored as pandas categoricals.
    **dict.fromkeys(['jobGroup', 'host', 'taskLocality', 'phase'], 'category'),
    # Boolean flags.
    **dict.fromkeys(['speculative', 'successful'], 'bool'),
    **dtype_mix,
}

In [6]:
# Parse the task-level platform metrics with the C engine; NA detection is
# switched off and the column types are fixed by dtype_platform.
metrics = pd.read_csv(
    'results/platform.csv',
    header=0,
    engine='c',
    na_filter=False,
    dtype=dtype_platform,
)

In [14]:
# Keep only the task metrics whose platform is still listed in `configuration`.
metrics = metrics.loc[metrics['platformId'].isin(configuration['platformId'])]

In [68]:
# Numeric code of each task-locality label: its position in the list below.
localities = {
    label: code
    for code, label in enumerate(
        ['PROCESS_LOCAL', 'NODE_LOCAL', 'RACK_LOCAL', 'NO_PREF', 'ANY']
    )
}

# Replace the locality labels by their numeric code.
metrics['taskLocality'] = metrics['taskLocality'].map(localities)

In [74]:
# Turn both flag columns into lowercase text (str() of the value, lowercased).
for flag_column in ('speculative', 'successful'):
    metrics[flag_column] = metrics[flag_column].astype(str).str.lower()

In [58]:
import os

In [82]:
# Columns written to the generated platform-metrics CSV files, in output order.
columns = [
    # Task identification and timing.
    "jobId", "jobGroup", "stageId", "index",
    "launchTime", "finishTime", "duration", "schedulerDelay",
    # Placement and status.
    "executorId", "host", "taskLocality", "speculative",
    "gettingResultTime", "successful",
    # Executor-side time and result metrics.
    "executorRunTime", "executorCpuTime",
    "executorDeserializeTime", "executorDeserializeCpuTime",
    "resultSerializationTime", "jvmGCTime", "resultSize",
    "numUpdatedBlockStatuses",
    # Memory and spill.
    "diskBytesSpilled", "memoryBytesSpilled", "peakExecutionMemory",
    # Input / output.
    "recordsRead", "bytesRead", "recordsWritten", "bytesWritten",
    # Shuffle read.
    "shuffleFetchWaitTime", "shuffleTotalBytesRead",
    "shuffleTotalBlocksFetched", "shuffleLocalBlocksFetched",
    "shuffleRemoteBlocksFetched",
    # Shuffle write.
    "shuffleWriteTime", "shuffleBytesWritten", "shuffleRecordsWritten",
]


def save(df):
    """Write the platform metrics of one run into two CSV output folders.

    The rows of ``df`` are split on the ``phase`` column: ``'fit'`` rows go to
    ``platform-fit-metrics.csv/`` and ``'transform'`` rows go to
    ``platform-transform-metrics.csv/``, both located under
    ``./generated/<make_path(first row of df)>/``. Each folder receives one
    part file restricted to the module-level ``columns`` list, plus an empty
    ``_SUCCESS`` marker file (presumably mimicking the layout of a Spark CSV
    output - confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Metrics of a single run. Its first row is handed to ``make_path`` to
        choose the output location, so it must contain at least one row.

    Raises
    ------
    IndexError
        If ``df`` has no rows (``df.iloc[0]`` fails).
    """
    run_dir = './generated/' + make_path(df.iloc[0])
    part_name = 'part-00000-19fd760c-1021-4486-b3c7-85fda3375dc7-c000.csv'

    for phase in ('fit', 'transform'):
        out_dir = run_dir + '/platform-' + phase + '-metrics.csv/'

        # exist_ok=True keeps the notebook re-runnable: without it a second
        # execution stops with FileExistsError on the first existing folder.
        os.makedirs(out_dir, exist_ok=True)

        df[df['phase'] == phase].to_csv(out_dir + part_name, index=False, columns=columns)

        # Create (or truncate) the empty marker file.
        with open(out_dir + '_SUCCESS', 'w'):
            pass

In [101]:
# Discard every task row recorded for platform 0.
metrics = metrics.loc[metrics['platformId'].ne(0)]

In [102]:
# One group per (dataset, algorithm, runId, platformId) combination.
grouped = metrics.groupby(
    ['dataset', 'algorithm', 'runId', 'platformId'],
    as_index=False,
)

# Write the platform metrics of each group to disk.
for group_key, run_metrics in tqdm(grouped):
    save(run_metrics)




  0%|          | 0/1290 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1290 [00:07<2:35:18,  7.23s/it][A[A[A


  0%|          | 4/1290 [00:07<1:48:44,  5.07s/it][A[A[A


  1%|          | 7/1290 [00:07<1:16:09,  3.56s/it][A[A[A


  1%|          | 10/1290 [00:07<53:26,  2.50s/it] [A[A[A


  1%|          | 13/1290 [00:07<37:34,  1.77s/it][A[A[A


  1%|          | 16/1290 [00:07<26:30,  1.25s/it][A[A[A


  1%|▏         | 19/1290 [00:07<18:44,  1.13it/s][A[A[A


  2%|▏         | 22/1290 [00:08<13:19,  1.59it/s][A[A[A


  2%|▏         | 25/1290 [00:08<09:33,  2.21it/s][A[A[A


  2%|▏         | 28/1290 [00:08<06:55,  3.04it/s][A[A[A


  2%|▏         | 31/1290 [00:08<05:04,  4.13it/s][A[A[A


  3%|▎         | 34/1290 [00:08<03:51,  5.42it/s][A[A[A


  3%|▎         | 37/1290 [00:08<02:58,  7.02it/s][A[A[A


  3%|▎         | 40/1290 [00:08<02:22,  8.79it/s][A[A[A


  3%|▎         | 43/1290 [00:08<01:58, 10.50it/s][A[A[A


  4%|▎         | 46/1290 [

 29%|██▉       | 372/1290 [00:26<01:24, 10.85it/s][A[A[A


 29%|██▉       | 374/1290 [00:26<01:26, 10.63it/s][A[A[A


 29%|██▉       | 376/1290 [00:26<01:25, 10.66it/s][A[A[A


 29%|██▉       | 378/1290 [00:27<01:25, 10.68it/s][A[A[A


 29%|██▉       | 380/1290 [00:27<01:22, 10.98it/s][A[A[A


 30%|██▉       | 382/1290 [00:27<01:24, 10.74it/s][A[A[A


 30%|██▉       | 384/1290 [00:27<01:22, 10.95it/s][A[A[A


 30%|██▉       | 386/1290 [00:27<01:25, 10.61it/s][A[A[A


 30%|███       | 388/1290 [00:27<01:23, 10.81it/s][A[A[A


 30%|███       | 390/1290 [00:28<01:24, 10.70it/s][A[A[A


 30%|███       | 392/1290 [00:28<01:17, 11.52it/s][A[A[A


 31%|███       | 394/1290 [00:28<01:17, 11.55it/s][A[A[A


 31%|███       | 396/1290 [00:28<01:14, 12.06it/s][A[A[A


 31%|███       | 398/1290 [00:28<01:09, 12.84it/s][A[A[A


 31%|███       | 400/1290 [00:28<01:08, 12.96it/s][A[A[A


 31%|███       | 402/1290 [00:29<01:06, 13.33it/s][A[A[A


 31%|███

 45%|████▌     | 584/1290 [00:52<01:08, 10.34it/s][A[A[A


 45%|████▌     | 586/1290 [00:52<01:10,  9.93it/s][A[A[A


 46%|████▌     | 588/1290 [00:52<01:06, 10.51it/s][A[A[A


 46%|████▌     | 590/1290 [00:52<01:03, 10.99it/s][A[A[A


 46%|████▌     | 592/1290 [00:53<01:02, 11.13it/s][A[A[A


 46%|████▌     | 594/1290 [00:53<00:58, 11.93it/s][A[A[A


 46%|████▌     | 596/1290 [00:53<00:57, 12.09it/s][A[A[A


 46%|████▋     | 598/1290 [00:53<00:55, 12.44it/s][A[A[A


 47%|████▋     | 600/1290 [00:53<00:55, 12.39it/s][A[A[A


 47%|████▋     | 605/1290 [00:53<00:43, 15.79it/s][A[A[A


 47%|████▋     | 608/1290 [00:53<00:38, 17.68it/s][A[A[A


 47%|████▋     | 611/1290 [00:54<00:38, 17.54it/s][A[A[A


 48%|████▊     | 614/1290 [00:54<00:35, 19.07it/s][A[A[A


 48%|████▊     | 617/1290 [00:54<00:33, 19.86it/s][A[A[A


 48%|████▊     | 620/1290 [00:54<00:32, 20.75it/s][A[A[A


 48%|████▊     | 623/1290 [00:54<00:30, 22.20it/s][A[A[A


 49%|███

 75%|███████▌  | 971/1290 [01:11<00:08, 37.84it/s][A[A[A


 76%|███████▌  | 976/1290 [01:11<00:07, 39.60it/s][A[A[A


 76%|███████▌  | 981/1290 [01:11<00:07, 41.40it/s][A[A[A


 76%|███████▋  | 986/1290 [01:11<00:06, 43.63it/s][A[A[A


 77%|███████▋  | 991/1290 [01:11<00:06, 45.34it/s][A[A[A


 77%|███████▋  | 996/1290 [01:11<00:06, 46.07it/s][A[A[A


 78%|███████▊  | 1002/1290 [01:11<00:05, 48.69it/s][A[A[A


 78%|███████▊  | 1008/1290 [01:11<00:05, 51.03it/s][A[A[A


 79%|███████▊  | 1014/1290 [01:11<00:05, 53.03it/s][A[A[A


 79%|███████▉  | 1020/1290 [01:12<00:05, 53.71it/s][A[A[A


 80%|███████▉  | 1026/1290 [01:13<00:22, 11.57it/s][A[A[A


 80%|███████▉  | 1030/1290 [01:14<00:40,  6.43it/s][A[A[A


 80%|████████  | 1033/1290 [01:15<00:46,  5.52it/s][A[A[A


 80%|████████  | 1036/1290 [01:16<00:52,  4.82it/s][A[A[A


 80%|████████  | 1038/1290 [01:16<00:55,  4.56it/s][A[A[A


 81%|████████  | 1040/1290 [01:17<00:57,  4.34it/s][A[A[A


 91%|█████████ | 1177/1290 [02:12<00:17,  6.46it/s][A[A[A


 91%|█████████▏| 1178/1290 [02:12<00:18,  6.11it/s][A[A[A


 91%|█████████▏| 1179/1290 [02:13<00:18,  5.94it/s][A[A[A


 91%|█████████▏| 1180/1290 [02:13<00:19,  5.73it/s][A[A[A


 92%|█████████▏| 1181/1290 [02:13<00:19,  5.51it/s][A[A[A


 92%|█████████▏| 1182/1290 [02:13<00:19,  5.41it/s][A[A[A


 92%|█████████▏| 1183/1290 [02:13<00:19,  5.53it/s][A[A[A


 92%|█████████▏| 1184/1290 [02:14<00:19,  5.53it/s][A[A[A


 92%|█████████▏| 1185/1290 [02:14<00:19,  5.43it/s][A[A[A


 92%|█████████▏| 1186/1290 [02:14<00:19,  5.36it/s][A[A[A


 92%|█████████▏| 1187/1290 [02:14<00:18,  5.47it/s][A[A[A


 92%|█████████▏| 1188/1290 [02:14<00:18,  5.44it/s][A[A[A


 92%|█████████▏| 1189/1290 [02:14<00:18,  5.32it/s][A[A[A


 92%|█████████▏| 1190/1290 [02:15<00:18,  5.38it/s][A[A[A


 92%|█████████▏| 1191/1290 [02:15<00:18,  5.26it/s][A[A[A


 92%|█████████▏| 1192/1290 [02:15<00:18,  5.41it/s][A

In [91]:
import json

In [104]:
# Display the platform configurations kept after the platformId filtering.
configuration

Unnamed: 0,platformId,spark.shuffle.compress,spark.master,spark.io.compression.codec,spark.shuffle.file.buffer,spark.storage.memoryFraction,spark.shuffle.io.preferDirectBufs,spark.rdd.compress,spark.dynamicAllocation.enabled,spark.executor.memory,spark.driver.cores,spark.executor.cores,spark.driver.memory,spark.reducer.maxSizeInFlight,spark.serializer,spark.shuffle.spill.compress,spark.executor.instances,spark.locality.wait
0,0,True,spark://master:7077,snappy,32k,0.6,True,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,True,,3s
4,11,True,spark://master:7077,snappy,32k,0.6,True,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.KryoSerializer,True,,3s
5,12,False,spark://master:7077,snappy,32k,0.6,True,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,True,,3s
8,15,True,spark://master:7077,snappy,32k,0.6,False,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,True,,3s
9,16,True,spark://master:7077,snappy,32k,0.6,True,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,False,,3s
18,7,True,spark://master:7077,lz4,32k,0.6,True,False,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,True,,3s
19,8,True,spark://master:7077,snappy,32k,0.6,True,True,False,5g,2,4,5g,48m,org.apache.spark.serializer.JavaSerializer,True,,3s


In [103]:
# Write the applicative metrics and the summary of every run into its run
# folder. make_applicative, make_summary and make_path are defined in cells
# further down this notebook, so those cells must be executed before this one.
# `total` is passed explicitly because iterrows() is a generator without a
# length, which otherwise leaves the progress bar without a percentage or ETA.
for _, row in tqdm(data.iterrows(), total=len(data)):

    app = make_applicative(row)
    summary = make_summary(row)

    path = make_path(row)
    app_dir = './generated/' + path + '/applicative-metrics.csv/'

    # Plain os.mkdir raises FileExistsError when this cell is re-run. Only the
    # leaf folder is created here: the run folder itself is still expected to
    # exist already (save() creates it when writing the platform metrics).
    if not os.path.isdir(app_dir):
        os.mkdir(app_dir)

    app.to_csv(app_dir + 'part-00000-19fd760c-1021-4486-b3c7-85fda3375dc7-c000.csv', index=False)

    # Create (or truncate) the empty marker file.
    with open(app_dir + '_SUCCESS', 'w'):
        pass

    with open('./generated/' + path + '/summary.json', 'w') as fp:
        json.dump(summary, fp)




0it [00:00, ?it/s][A[A[A


15it [00:00, 147.46it/s][A[A[A


36it [00:00, 161.12it/s][A[A[A


61it [00:00, 179.55it/s][A[A[A


90it [00:00, 201.38it/s][A[A[A


120it [00:00, 222.46it/s][A[A[A


142it [00:00, 219.47it/s][A[A[A


164it [00:00, 194.42it/s][A[A[A


191it [00:00, 211.04it/s][A[A[A


231it [00:00, 245.03it/s][A[A[A


266it [00:01, 268.48it/s][A[A[A


302it [00:01, 289.63it/s][A[A[A


340it [00:01, 310.91it/s][A[A[A


379it [00:01, 330.28it/s][A[A[A


419it [00:01, 347.66it/s][A[A[A


456it [00:01, 351.35it/s][A[A[A


493it [00:01, 351.98it/s][A[A[A


529it [00:01, 353.88it/s][A[A[A


565it [00:01, 349.60it/s][A[A[A


601it [00:01, 348.42it/s][A[A[A


639it [00:02, 355.88it/s][A[A[A


677it [00:02, 359.86it/s][A[A[A


714it [00:02, 362.09it/s][A[A[A


752it [00:02, 366.55it/s][A[A[A


789it [00:02, 360.36it/s][A[A[A


826it [00:02, 356.63it/s][A[A[A


862it [00:02, 354.93it/s][A[A[A


898it [00

In [85]:
from random import randint

7187581

In [96]:
def make_summary(row):
    """Build the summary dictionary of one run.

    Parameters
    ----------
    row : mapping
        Any object indexable by column name (e.g. a row of ``data``) that
        provides the run identifiers and the ``spark.*`` settings read below.

    Returns
    -------
    dict
        The run identifiers (``platformId`` shifted by 100), three random
        identifiers drawn with ``randint(100000, 10000000)`` (``scenarioId``,
        ``experimentId``, ``workflowId``) and a nested ``"platform"`` dict of
        settings. Boolean-like settings become lowercase strings, the executor
        cores and the storage memory fraction become strings, and the other
        settings are copied unchanged.
    """
    def unchanged(value):
        return value

    def lowered(value):
        return str(value).lower()

    # (setting name, conversion applied to the raw value), in output order.
    platform_fields = [
        ("spark.io.compression.codec", unchanged),
        ("spark.executor.cores", str),
        ("spark.reducer.maxSizeInFlight", unchanged),
        ("spark.serializer", unchanged),
        ("spark.shuffle.io.preferDirectBufs", lowered),
        ("spark.shuffle.spill.compress", lowered),
        ("spark.shuffle.compress", lowered),
        ("spark.locality.wait", unchanged),
        ("spark.rdd.compress", lowered),
        ("spark.master", unchanged),
        ("spark.shuffle.file.buffer", unchanged),
        ("spark.executor.memory", unchanged),
        ("spark.storage.memoryFraction", str),
    ]

    summary = {}
    summary["scenarioId"] = randint(100000, 10000000)
    summary["platformId"] = row["platformId"] + 100
    for field in ("splitter", "family", "algorithm", "dataset"):
        summary[field] = row[field]
    summary["experimentId"] = randint(100000, 10000000)
    summary["runId"] = row["runId"]
    summary["workflowId"] = randint(100000, 10000000)
    summary["platform"] = {
        setting: convert(row[setting]) for setting, convert in platform_fields
    }
    return summary

In [81]:
def make_path(row):
    """Return the relative output folder of a run.

    The result has the form
    ``<dataset>/<family>/<algorithm>/platform-<platformId + 100>/run-<runId>``,
    where every value is read from ``row`` by column name.
    """
    segments = [
        row['dataset'],
        row['family'],
        row['algorithm'],
        'platform-' + str(row['platformId'] + 100),
        'run-' + str(row['runId']),
    ]
    return '/'.join(segments)

In [53]:
def make_applicative(row):
    """Build the long-format (metric, value) table of one run.

    The table always starts with the five common entries ``transformTime``,
    ``features``, ``fitTime``, ``testCount`` and ``trainCount``, followed by
    the quality metrics that belong to the run's ``family``.

    Parameters
    ----------
    row : mapping
        Any object indexable by column name (e.g. a row of ``data``). It must
        provide ``family``, the five common entries and the quality metrics of
        that family.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``metric`` (names) and ``value`` (values read from ``row``).

    Raises
    ------
    ValueError
        If ``row['family']`` is not ``'regression'``, ``'classification'`` or
        ``'clustering'``. Previously this case surfaced as an
        ``UnboundLocalError`` on the unset ``metric`` variable.
    KeyError
        If ``row`` lacks one of the required entries.
    """
    # Quality metrics reported for each family, in output order.
    family_metrics = {
        'regression': ['rmse', 'r2', 'mse', 'mae'],
        'classification': ['f1', 'accuracy', 'weightedPrecision', 'weightedRecall'],
        'clustering': ['silhouette'],
    }

    family = row['family']
    if family not in family_metrics:
        raise ValueError(
            'unknown family {!r}; expected one of {}'.format(family, sorted(family_metrics))
        )

    names = [
        'transformTime',
        'features',
        'fitTime',
        'testCount',
        'trainCount',
    ] + family_metrics[family]

    return pd.DataFrame({
        'metric': names,
        'value': [row[name] for name in names],
    })