# Library

In [1]:
#import sys
import pandas as pd
import numpy as np
import random
import pyspark
import itertools
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import statsmodels.api as sm

from pyspark import SparkContext, SQLContext

from math import sqrt
from time import time as ttt

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier as DTC_spark
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import joblib
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import parallel_backend


# Upload

In [2]:
spark = SparkSession.builder.master("Yarn").appName("spark_app_1234").getOrCreate()
sc = spark.sparkContext
spark
d0 = (
    spark
    .read
    .format("csv") 
    .option("header","true") 
    .option("inferSchema","true") 
    .load("gs://mas-a5-storage-1/notebooks/jupyter/application_train.csv")
)

d1 = d0.filter(d0.DAYS_EMPLOYED != 365243).select('TARGET','DAYS_EMPLOYED')

print(d1.rdd.getNumPartitions())

                                                                                

2


In [3]:
sc

# Function

In [4]:
def prepare_spark_data(n_part, k_mult):
    '''
    takes 'DAYS_EMPLOYED and 'TARGET'  from d1 (alreay filtered)
    rearrane in n partitions (if n==0 keeps initial number of partitions)
    prints final shape/ partition
    returns d2 - spark df
    '''
    data = d1
    data_new = data # first step in the cycle
    for i in range(k_mult-1):
        data_tmp = data.select('TARGET', \
                     f.col('DAYS_EMPLOYED')*(f.lit(0.9995) + f.rand()/1000)).\
                   toDF('TARGET','DAYS_EMPLOYED')
        data_tmp = data_tmp.select('TARGET', f.floor('DAYS_EMPLOYED'))
        data_new = data_new.union(data_tmp)

    assembler = VectorAssembler(inputCols=["DAYS_EMPLOYED"], 
                        outputCol="DAYS_EMPLOYED_vect")
    d2 = assembler.transform(data_new)
    if n_part != 0:
        d2 = d2.repartition(n_part)       
    #print(f'n-partitions initial: {d2.rdd.getNumPartitions()}; df size: {d2.count()}\n')

    return d2

# Params an run

In [None]:
## !! mind the fn NOT TO rewrite results
fn = '4cpu_by_8n__4m_yarn_ssd'
size_mult = [1, 2, 5, 10, 15, 20, 30, 50, 100]
partitions = [2, 4, 8, 16, 24] 
# initial, 2**i incl n_nodes till n_cpu*n_nodes and n_cpu*n_nodes
n_iter = 3

print('d1-size', d1.count())
n_part_base = d1.rdd.getNumPartitions()
print('n-partitions initial', n_part_base, '\n')

rd1 = {}
for k_size_mult in size_mult:
    rd2 = {}
    for n_part in partitions:
        df = prepare_spark_data(n_part, k_size_mult)
        df.cache()
        print('======================================================================')
        print(f'=== size_mult={k_size_mult}; \
        n_part_req={n_part}; n_part_act={df.rdd.getNumPartitions()}; df_count={df.count()} ===\n')
        times = [0 for i in range(n_iter)]
        models = {}
        for i in range(n_iter):
            dt = DTC_spark(labelCol="TARGET",
                       featuresCol="DAYS_EMPLOYED_vect",
                       minInfoGain=0.0001,
                       impurity='entropy',
                       maxDepth=14, maxBins=2**14, # it differs from scikit learn - it means number of canidate split points
                       #minInstancesPerNode = 1,
                       #checkpointInterval = 10
                       )
           
            t0 = ttt()
            model = dt.fit(df)
            t1 = ttt()
            times[i] = t1-t0
            models[i] = model 
            print('model', i, 'build time', round(times[i],2), '\n', model)
        rd2[n_part] = (models, times)
    rd1[k_size_mult] = rd2

result = rd1  

                                                                                

d1-size 252137
n-partitions initial 2 



                                                                                

=== size_mult=1;         n_part_req=2; n_part_act=2; df_count=252137 ===



                                                                                

model 0 build time 10.36 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4bf3e1464934, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_787e14ea66a0, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2effd517921b, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=4; n_part_act=4; df_count=252137 ===



                                                                                

model 0 build time 8.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_418ed98697d0, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fdff04196ea3, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_404d5c7810f2, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=8; n_part_act=8; df_count=252137 ===



                                                                                

model 0 build time 10.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_14f67b7fffd1, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a6efd50e1439, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_38f9b2e6b28e, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=16; n_part_act=16; df_count=252137 ===



                                                                                

model 0 build time 18.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c7535df25267, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2e56a9eee5fe, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 14.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9820d3f82633, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=24; n_part_act=24; df_count=252137 ===



                                                                                

model 0 build time 18.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e1830d989c1a, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.92 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2022917dde3c, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_02828fab0a66, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=2; n_part_act=2; df_count=504274 ===



                                                                                

model 0 build time 10.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_63e09be0257f, depth=14, numNodes=101, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.58 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8fa94c3f5b50, depth=14, numNodes=101, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aaab31aea6a6, depth=14, numNodes=101, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=4; n_part_act=4; df_count=504274 ===



                                                                                

model 0 build time 7.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b267a9b234d4, depth=14, numNodes=131, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.29 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cc992bb52b54, depth=14, numNodes=131, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_13935e30566d, depth=14, numNodes=131, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=8; n_part_act=8; df_count=504274 ===



                                                                                

model 0 build time 7.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_07c098395733, depth=14, numNodes=135, numClasses=2, numFeatures=1




model 1 build time 6.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_22e0cb1ec189, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.03 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_777cf6c97de7, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=16; n_part_act=16; df_count=504274 ===



                                                                                

model 0 build time 8.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_64cd18c7b5d6, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b0dae96f6998, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c05c8a4a1c59, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=24; n_part_act=24; df_count=504274 ===



                                                                                

model 0 build time 10.7 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a0c20b6e3eaa, depth=14, numNodes=153, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.65 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9d6dbaa64a1a, depth=14, numNodes=153, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d2fe190fcb66, depth=14, numNodes=153, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=2; n_part_act=2; df_count=1260685 ===



                                                                                

model 0 build time 12.42 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4d753d1f4715, depth=13, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_84cff992bef8, depth=13, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.68 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ded2b23a2890, depth=13, numNodes=83, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=4; n_part_act=4; df_count=1260685 ===



                                                                                

model 0 build time 13.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c58ee3fa52e6, depth=14, numNodes=113, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.27 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_46d18dca79c7, depth=14, numNodes=113, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.5 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f5ac25390863, depth=14, numNodes=113, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=8; n_part_act=8; df_count=1260685 ===



                                                                                

model 0 build time 15.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_298e2f027390, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9a12528481d9, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_688ebe95891b, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=16; n_part_act=16; df_count=1260685 ===



                                                                                

model 0 build time 26.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d3060be27f84, depth=14, numNodes=109, numClasses=2, numFeatures=1


                                                                                

model 1 build time 24.51 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c6e01ad260e5, depth=14, numNodes=109, numClasses=2, numFeatures=1


                                                                                

model 2 build time 24.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e269644338f2, depth=14, numNodes=109, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=24; n_part_act=24; df_count=1260685 ===



                                                                                

model 0 build time 15.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2dbe0d5e1985, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_660bde9e2232, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.99 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bfa8fe92483d, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=2; n_part_act=2; df_count=2521370 ===



                                                                                

model 0 build time 17.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f33261ccbfab, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_68c85e4d45dd, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 2 build time 17.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c0174bdb2baa, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=4; n_part_act=4; df_count=2521370 ===



                                                                                

model 0 build time 17.66 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9feeeb03b082, depth=14, numNodes=105, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9def17115b71, depth=14, numNodes=105, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fbd2de2897df, depth=14, numNodes=105, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=8; n_part_act=8; df_count=2521370 ===



                                                                                

model 0 build time 14.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_83bbbb43d61b, depth=14, numNodes=81, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b5d9fe661ab6, depth=14, numNodes=81, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4a2651c2ca4e, depth=14, numNodes=81, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=16; n_part_act=16; df_count=2521370 ===



                                                                                

model 0 build time 10.31 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c23766f3ac74, depth=14, numNodes=71, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0505a7a382aa, depth=14, numNodes=71, numClasses=2, numFeatures=1




model 2 build time 6.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dbd2fcadd009, depth=14, numNodes=71, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=24; n_part_act=24; df_count=2521370 ===



                                                                                

model 0 build time 8.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cafe06028731, depth=14, numNodes=89, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_395e381fb141, depth=14, numNodes=89, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3ad093aa3032, depth=14, numNodes=89, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=2; n_part_act=2; df_count=3782055 ===



                                                                                

model 0 build time 23.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_572876ae82c7, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 1 build time 25.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_22bddcd04e13, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 2 build time 22.78 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3a77878c7976, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=4; n_part_act=4; df_count=3782055 ===



                                                                                

model 0 build time 15.43 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f3b91e71b7ec, depth=13, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_421d8c587516, depth=13, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8fbfb9d03a3a, depth=13, numNodes=63, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=8; n_part_act=8; df_count=3782055 ===



                                                                                

model 0 build time 10.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6899723e9a53, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eac9a332671e, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.28 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b4f11a8b16a4, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=16; n_part_act=16; df_count=3782055 ===





model 0 build time 7.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0443ae4bd68a, depth=14, numNodes=59, numClasses=2, numFeatures=1




model 1 build time 6.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_53a3c8e83007, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_59d1d542b65a, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=24; n_part_act=24; df_count=3782055 ===



                                                                                

model 0 build time 8.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1a22a356eea2, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a627509ed52c, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4b5062442cd4, depth=14, numNodes=79, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=2; n_part_act=2; df_count=5042740 ===



                                                                                

model 0 build time 30.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e7147ef8b5c0, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 1 build time 31.04 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ab87a757624d, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 2 build time 27.51 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3096f4e8792e, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=4; n_part_act=4; df_count=5042740 ===



                                                                                

model 0 build time 20.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_748592311af3, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_74de7c001f4c, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_402cf1701b5b, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=8; n_part_act=8; df_count=5042740 ===



                                                                                

model 0 build time 11.66 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_90a397361ce1, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_61fce01e9ab8, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.89 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aa49368bbe92, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=16; n_part_act=16; df_count=5042740 ===



                                                                                

model 0 build time 9.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3c3a6b96eecb, depth=10, numNodes=31, numClasses=2, numFeatures=1




model 1 build time 8.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_62a6b7bfcc61, depth=10, numNodes=31, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.64 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_40619115ccf2, depth=10, numNodes=31, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=24; n_part_act=24; df_count=5042740 ===



                                                                                

model 0 build time 10.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1a35abd6be34, depth=12, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_de6071d2a05b, depth=12, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.01 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d98f5b0bfefe, depth=12, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=2; n_part_act=2; df_count=7564110 ===



                                                                                

model 0 build time 38.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1aa83c5bf11e, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 37.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7dd9523c30d4, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 39.69 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_daa9bd9a123e, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=4; n_part_act=4; df_count=7564110 ===



                                                                                

model 0 build time 21.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aa76bd0dfe89, depth=14, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 1 build time 22.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9381445148c7, depth=14, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 2 build time 20.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cbc378824abd, depth=14, numNodes=43, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=8; n_part_act=8; df_count=7564110 ===



                                                                                

model 0 build time 14.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eba1daf1895c, depth=12, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_63d997ff0729, depth=12, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_00925a82be39, depth=12, numNodes=55, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=16; n_part_act=16; df_count=7564110 ===



                                                                                

model 0 build time 9.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_69d032681349, depth=10, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1b5697ce2c9d, depth=10, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_65a28dabb65a, depth=10, numNodes=43, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=24; n_part_act=24; df_count=7564110 ===



                                                                                

model 0 build time 11.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6b0e2b4586cc, depth=10, numNodes=31, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6a37f411bd6a, depth=10, numNodes=31, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_83f51cc8a687, depth=10, numNodes=31, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=2; n_part_act=2; df_count=12606850 ===



                                                                                

model 0 build time 50.64 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f8fd20cd03d7, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 53.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b1ddde1e51fe, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 53.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4f8cd2cca779, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=4; n_part_act=4; df_count=12606850 ===



                                                                                

model 0 build time 33.73 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_18cd83c098b6, depth=10, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 34.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ddedffa8e73b, depth=10, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 31.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4c061091e2d7, depth=10, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=8; n_part_act=8; df_count=12606850 ===



                                                                                

model 0 build time 22.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_890fdaf07ed7, depth=13, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 1 build time 20.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4df7db879129, depth=13, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 2 build time 20.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_55d700bc797a, depth=13, numNodes=45, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=16; n_part_act=16; df_count=12606850 ===



                                                                                

model 0 build time 13.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fb9416989745, depth=12, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.27 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f745613ae4f7, depth=12, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.5 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_771da83bb8f5, depth=12, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=24; n_part_act=24; df_count=12606850 ===



                                                                                

model 0 build time 15.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ff652b312651, depth=9, numNodes=27, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_21f4d824f982, depth=9, numNodes=27, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_547d6bb8c358, depth=9, numNodes=27, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=2; n_part_act=2; df_count=25213700 ===



                                                                                

model 0 build time 98.14 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_07b558e06e76, depth=13, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 90.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d3b2b3611a9d, depth=13, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 100.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_110f4d482966, depth=13, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=4; n_part_act=4; df_count=25213700 ===



                                                                                

model 0 build time 60.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_17221af5aab9, depth=11, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 1 build time 51.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8bb8052879e4, depth=11, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 2 build time 58.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_31d6a19fff5b, depth=11, numNodes=33, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=8; n_part_act=8; df_count=25213700 ===



                                                                                

model 0 build time 39.42 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_50a4fb9b7f9b, depth=11, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 1 build time 31.07 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c748362b5726, depth=11, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 2 build time 39.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f52ae8e37684, depth=11, numNodes=45, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=16; n_part_act=16; df_count=25213700 ===



                                                                                

model 0 build time 26.26 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c286068bdebb, depth=12, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 1 build time 17.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8104b8bd2f1e, depth=12, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 2 build time 21.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a3e676d26e46, depth=12, numNodes=43, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=24; n_part_act=24; df_count=25213700 ===



                                                                                

model 0 build time 25.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7df6102373a6, depth=11, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 1 build time 22.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f391e6126ec4, depth=11, numNodes=43, numClasses=2, numFeatures=1




model 2 build time 22.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4dc2a955b56c, depth=11, numNodes=43, numClasses=2, numFeatures=1




# Results

In [None]:
di = result
df_res_time = pd.DataFrame()
df_res_nodes = pd.DataFrame()
for p1 in di.keys():
    for p2 in di[p1].keys():
        df_res_time.loc[p1, p2] = np.round(np.median(di[p1][p2][1]), 1)
        m_tmp = di[p1][p2][0]
        n_nodes = []
        for i,_ in enumerate(m_tmp):
            tmp = f'{m_tmp[i]}'.split(' ')
            print(i, tmp)
            n_nodes.append([int(x.split('=')[1][:-1]) for x in tmp if x[:4]== 'numN'][0])
        df_res_nodes.loc[p1, p2] = np.round(np.mean(n_nodes), 1)

df_res_time.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
df_res_nodes.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_4bf3e1464934,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_787e14ea66a0,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_2effd517921b,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_418ed98697d0,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_fdff04196ea3,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_404d5c7810f2,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_14f67b7fffd1,', 'depth=14,', 'numNodes=123,', 'numC

Unnamed: 0,2,4,8,16,24
1,7.5,8.3,10.6,15.1,16.9
2,10.6,7.3,6.3,8.8,10.7
5,12.7,10.3,9.9,24.5,10.2
10,17.5,13.5,9.8,7.2,8.1
15,23.5,15.7,10.5,7.4,9.0
20,30.8,16.8,11.7,8.6,10.1
30,38.6,21.8,14.6,9.7,11.6
50,53.5,33.7,20.8,13.9,14.4
100,98.1,58.9,39.3,21.7,22.5


Unnamed: 0,2,4,8,16,24
1,123.0,123.0,123.0,123.0,123.0
2,101.0,131.0,135.0,151.0,153.0
5,83.0,113.0,97.0,109.0,79.0
10,83.0,105.0,81.0,71.0,89.0
15,51.0,63.0,59.0,59.0,79.0
20,63.0,57.0,57.0,31.0,59.0
30,41.0,43.0,55.0,43.0,31.0
50,41.0,39.0,45.0,35.0,27.0
100,39.0,33.0,45.0,43.0,43.0


In [7]:
#fn = '4cpu_by_8n__4m_yarn_ssd'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
print(fn)
tmp

4cpu_by_8n__4m_yarn_ssd


Unnamed: 0.1,Unnamed: 0,2,4,8,16,24
0,1,7.5,8.3,10.6,15.1,16.9
1,2,10.6,7.3,6.3,8.8,10.7
2,5,12.7,10.3,9.9,24.5,10.2
3,10,17.5,13.5,9.8,7.2,8.1
4,15,23.5,15.7,10.5,7.4,9.0
5,20,30.8,16.8,11.7,8.6,10.1
6,30,38.6,21.8,14.6,9.7,11.6
7,50,53.5,33.7,20.8,13.9,14.4
8,100,98.1,58.9,39.3,21.7,22.5


In [8]:
#fn = '4cpu_by_8n__4m_yarn_ssd'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')
print(fn)
tmp

4cpu_by_8n__4m_yarn_ssd


Unnamed: 0.1,Unnamed: 0,2,4,8,16,24
0,1,123.0,123.0,123.0,123.0,123.0
1,2,101.0,131.0,135.0,151.0,153.0
2,5,83.0,113.0,97.0,109.0,79.0
3,10,83.0,105.0,81.0,71.0,89.0
4,15,51.0,63.0,59.0,59.0,79.0
5,20,63.0,57.0,57.0,31.0,59.0
6,30,41.0,43.0,55.0,43.0,31.0
7,50,41.0,39.0,45.0,35.0,27.0
8,100,39.0,33.0,45.0,43.0,43.0
