# Library

In [1]:
#import sys
import pandas as pd
import numpy as np
import random
import pyspark
import itertools
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import statsmodels.api as sm

from pyspark import SparkContext, SQLContext

from math import sqrt
from time import time as ttt

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier as DTC_spark
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import joblib
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import parallel_backend


# Upload

In [2]:
spark = SparkSession.builder.master("Yarn").appName("spark_app_1234").getOrCreate()
sc = spark.sparkContext
spark
d0 = (
    spark
    .read
    .format("csv") 
    .option("header","true") 
    .option("inferSchema","true") 
    .load("gs://mas-a5-storage-1/notebooks/jupyter/application_train.csv")
)

d1 = d0.filter(d0.DAYS_EMPLOYED != 365243).select('TARGET','DAYS_EMPLOYED')

print(d1.rdd.getNumPartitions())

                                                                                

8


In [3]:
sc

# Function

In [4]:
def prepare_spark_data(n_part, k_mult):
    '''
    takes 'DAYS_EMPLOYED and 'TARGET'  from d1 (alreay filtered)
    rearrane in n partitions (if n==0 keeps initial number of partitions)
    prints final shape/ partition
    returns d2 - spark df
    '''
    data = d1
    data_new = data # first step in the cycle
    for i in range(k_mult-1):
        data_tmp = data.select('TARGET', \
                     f.col('DAYS_EMPLOYED')*(f.lit(0.9995) + f.rand()/1000)).\
                   toDF('TARGET','DAYS_EMPLOYED')
        data_tmp = data_tmp.select('TARGET', f.floor('DAYS_EMPLOYED'))
        data_new = data_new.union(data_tmp)

    assembler = VectorAssembler(inputCols=["DAYS_EMPLOYED"], 
                        outputCol="DAYS_EMPLOYED_vect")
    d2 = assembler.transform(data_new)
    if n_part != 0:
        d2 = d2.repartition(n_part, "DAYS_EMPLOYED_vect")       
    #print(f'n-partitions initial: {d2.rdd.getNumPartitions()}; df size: {d2.count()}\n')

    return d2

# Params an run

In [None]:
## !! mind the fn NOT TO rewrite results
fn = '8cpu_by_12n__8m_yarn_ssd_repart'
size_mult = [1, 2, 5, 10, 15, 20, 30, 50, 100]
partitions = [8, 12, 16, 24, 36, 72] 
# initial, 2**i incl n_nodes till n_cpu*n_nodes and n_cpu*n_nodes
n_iter = 3

print('d1-size', d1.count())
n_part_base = d1.rdd.getNumPartitions()
print('n-partitions initial', n_part_base, '\n')

rd1 = {}
for k_size_mult in size_mult:
    rd2 = {}
    for n_part in partitions:
        df = prepare_spark_data(n_part, k_size_mult)
        df.cache()
        print('======================================================================')
        print(f'=== size_mult={k_size_mult}; \
        n_part_req={n_part}; n_part_act={df.rdd.getNumPartitions()}; df_count={df.count()} ===\n')
        times = [0 for i in range(n_iter)]
        models = {}
        for i in range(n_iter):
            dt = DTC_spark(labelCol="TARGET",
                       featuresCol="DAYS_EMPLOYED_vect",
                       minInfoGain=0.0001,
                       impurity='entropy',
                       maxDepth=14, maxBins=2**14, # it differs from scikit learn - it means number of canidate split points
                       #minInstancesPerNode = 1,
                       #checkpointInterval = 10
                       )
           
            t0 = ttt()
            model = dt.fit(df)
            t1 = ttt()
            times[i] = t1-t0
            models[i] = model 
            print('model', i, 'build time', round(times[i],2), '\n', model)
        rd2[n_part] = (models, times)
    rd1[k_size_mult] = rd2

result = rd1  

                                                                                

d1-size 252137
n-partitions initial 8 



                                                                                

=== size_mult=1;         n_part_req=8; n_part_act=8; df_count=252137 ===

model 0 build time 7.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9fc4c863286d, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 4.35 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8d5a1ab56ea6, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 2 build time 3.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1124f765e59b, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=12; n_part_act=12; df_count=252137 ===

model 0 build time 5.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_06b36774c047, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 5.01 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5478081bbb39, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 5.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ec66c7941687, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=16; n_part_act=16; df_count=252137 ===

model 0 build time 4.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb2348ef1edf, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 4.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5184f902ea85, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 2 build time 4.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3e146121923f, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=24; n_part_act=24; df_count=252137 ===



                                                                                

model 0 build time 6.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ceb39fae9930, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 6.14 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7d2e0394d2de, depth=14, numNodes=123, numClasses=2, numFeatures=1




model 2 build time 5.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_20e938d51014, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=36; n_part_act=36; df_count=252137 ===



                                                                                

model 0 build time 7.97 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_88dd62fb840e, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.01 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_12ee740302f0, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9f3eaa3996dd, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=72; n_part_act=72; df_count=252137 ===



                                                                                

model 0 build time 13.73 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4f731dcc1675, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c3ea8621cfd7, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ad5876dd2a2f, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=8; n_part_act=8; df_count=504274 ===

model 0 build time 4.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ff226af2c7c1, depth=14, numNodes=145, numClasses=2, numFeatures=1
model 1 build time 4.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0eccf33de376, depth=14, numNodes=145, numClasses=2, numFeatures=1
model 2 build time 4.38 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e684f9691523, depth=14, numNodes=145, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=12; n_part_act=12; df_count=504274 ===





model 0 build time 5.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a1848304df41, depth=14, numNodes=129, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bdf0ecc3c751, depth=14, numNodes=129, numClasses=2, numFeatures=1




model 2 build time 5.78 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0bd7ec1a34b6, depth=14, numNodes=129, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=16; n_part_act=16; df_count=504274 ===





model 0 build time 5.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9d6d2c952ab0, depth=14, numNodes=131, numClasses=2, numFeatures=1
model 1 build time 5.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b6e51f478deb, depth=14, numNodes=131, numClasses=2, numFeatures=1


                                                                                

model 2 build time 5.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a0b6ea311fca, depth=14, numNodes=131, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=24; n_part_act=24; df_count=504274 ===



                                                                                

model 0 build time 7.74 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4ebe30c8a167, depth=14, numNodes=137, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a39d4608bc14, depth=14, numNodes=137, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8e0d0d0f7d0a, depth=14, numNodes=137, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=36; n_part_act=36; df_count=504274 ===



                                                                                

model 0 build time 9.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d35c0ca09125, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.35 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1cda596289b2, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f55b7a259d18, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=72; n_part_act=72; df_count=504274 ===



                                                                                

model 0 build time 20.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aa0b11d11c13, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

model 1 build time 22.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_95848e28f399, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

model 2 build time 21.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_df757bc90801, depth=14, numNodes=135, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=8; n_part_act=8; df_count=1260685 ===



                                                                                

model 0 build time 7.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f870b772bd48, depth=14, numNodes=139, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dd286a112fd4, depth=14, numNodes=139, numClasses=2, numFeatures=1


[Stage 2175:>                                                       (0 + 8) / 8]                                                                                

model 2 build time 5.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8774469ed62e, depth=14, numNodes=139, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=12; n_part_act=12; df_count=1260685 ===



                                                                                

model 0 build time 9.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bd33fa2eb15c, depth=14, numNodes=141, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.99 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_36eac757ea84, depth=14, numNodes=141, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.8 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_03d6131f0a88, depth=14, numNodes=141, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=16; n_part_act=16; df_count=1260685 ===



                                                                                

model 0 build time 7.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_be65acac95fe, depth=14, numNodes=115, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_78594e4387c5, depth=14, numNodes=115, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_636012dc84fe, depth=14, numNodes=115, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=24; n_part_act=24; df_count=1260685 ===



                                                                                

model 0 build time 11.39 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb73b8297925, depth=14, numNodes=127, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0324a5fe5d9c, depth=14, numNodes=127, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fe21dfa7437b, depth=14, numNodes=127, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=36; n_part_act=36; df_count=1260685 ===



                                                                                

model 0 build time 13.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_278084c11820, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1ab52b13511b, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.73 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e8ca2ec2e2a1, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=72; n_part_act=72; df_count=1260685 ===



                                                                                

model 0 build time 21.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e13d8c288d53, depth=13, numNodes=95, numClasses=2, numFeatures=1


                                                                                

model 1 build time 22.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e71028602a48, depth=13, numNodes=95, numClasses=2, numFeatures=1


                                                                                

model 2 build time 21.68 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8c8362a92606, depth=13, numNodes=95, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=8; n_part_act=8; df_count=2521370 ===



                                                                                

model 0 build time 9.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a8676a6d5eab, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.64 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0809c9e03f60, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.06 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a6595dc19d2a, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=12; n_part_act=12; df_count=2521370 ===



                                                                                

model 0 build time 10.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4b3407cc5eb1, depth=13, numNodes=85, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7bc4015de902, depth=13, numNodes=85, numClasses=2, numFeatures=1




model 2 build time 6.74 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f24c8aa9c55d, depth=13, numNodes=85, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=16; n_part_act=16; df_count=2521370 ===



                                                                                

model 0 build time 8.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a569205911a3, depth=14, numNodes=107, numClasses=2, numFeatures=1
model 1 build time 6.04 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9cb08f8cac4b, depth=14, numNodes=107, numClasses=2, numFeatures=1
model 2 build time 5.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3106bb127f18, depth=14, numNodes=107, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=24; n_part_act=24; df_count=2521370 ===

model 0 build time 6.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4b2b50d38975, depth=14, numNodes=73, numClasses=2, numFeatures=1
model 1 build time 5.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3a86b6fc98d2, depth=14, numNodes=73, numClasses=2, numFeatures=1
model 2 build time 5.35 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1c2eca300044, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=36; n_part_act=36; df_count=2521370 ===

model 0 build time 5.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1620df4c2309, depth=14, numNodes=53, numClasses=2, numFeatures=1


21/11/27 06:48:51 WARN org.apache.spark.storage.BlockManager: Asked to remove block broadcast_3483, which does not exist


model 1 build time 7.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ebe13b900e5b, depth=14, numNodes=53, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_84ad38e7b37c, depth=14, numNodes=53, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=72; n_part_act=72; df_count=2521370 ===



                                                                                

model 0 build time 11.36 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_642cdd184f28, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f269d4431751, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_354d8469a6c5, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=8; n_part_act=8; df_count=3782055 ===



                                                                                

model 0 build time 10.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9596cfa465fc, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.23 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8d7663f7a6ed, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3f7d79f93f14, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=12; n_part_act=12; df_count=3782055 ===

model 0 build time 7.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_53170bb33247, depth=14, numNodes=113, numClasses=2, numFeatures=1




model 1 build time 7.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_884f77c61fc4, depth=14, numNodes=113, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_43e5bfa8b359, depth=14, numNodes=113, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=16; n_part_act=16; df_count=3782055 ===



                                                                                

model 0 build time 7.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb66fd88e49b, depth=12, numNodes=57, numClasses=2, numFeatures=1




model 1 build time 6.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a6953dfe69bf, depth=12, numNodes=57, numClasses=2, numFeatures=1




model 2 build time 6.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d60351076543, depth=12, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=24; n_part_act=24; df_count=3782055 ===

model 0 build time 7.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e6719e36f76d, depth=14, numNodes=49, numClasses=2, numFeatures=1
model 1 build time 6.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e07f9a666e9a, depth=14, numNodes=49, numClasses=2, numFeatures=1
model 2 build time 5.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_106819ba4b57, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=36; n_part_act=36; df_count=3782055 ===





model 0 build time 6.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7974fd36b7f8, depth=14, numNodes=77, numClasses=2, numFeatures=1
model 1 build time 6.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_62985d36ddc7, depth=14, numNodes=77, numClasses=2, numFeatures=1




model 2 build time 6.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b04467e6b0e8, depth=14, numNodes=77, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=72; n_part_act=72; df_count=3782055 ===



                                                                                

model 0 build time 7.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e4afa192ef5f, depth=14, numNodes=61, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3684c47d92f5, depth=14, numNodes=61, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b9f105ac46d6, depth=14, numNodes=61, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=8; n_part_act=8; df_count=5042740 ===



                                                                                

model 0 build time 11.02 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_13a08bf27d66, depth=14, numNodes=71, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_db7bb5b3056c, depth=14, numNodes=71, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5d579d3f2bd0, depth=14, numNodes=71, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=12; n_part_act=12; df_count=5042740 ===



                                                                                

model 0 build time 9.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_98870ee6e644, depth=13, numNodes=65, numClasses=2, numFeatures=1




model 1 build time 7.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4dc3c85ed601, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_087b76f3c36d, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=16; n_part_act=16; df_count=5042740 ===



                                                                                

model 0 build time 8.64 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ce0f6d178d42, depth=10, numNodes=41, numClasses=2, numFeatures=1
model 1 build time 7.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a68daed6d049, depth=10, numNodes=41, numClasses=2, numFeatures=1
model 2 build time 6.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e519e1d579cd, depth=10, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=24; n_part_act=24; df_count=5042740 ===





model 0 build time 8.36 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7d5ae58a03f2, depth=14, numNodes=77, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_23f3c9d84548, depth=14, numNodes=77, numClasses=2, numFeatures=1
model 2 build time 5.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_80a838bafe2d, depth=14, numNodes=77, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=36; n_part_act=36; df_count=5042740 ===

model 0 build time 7.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_28c88e2d560e, depth=11, numNodes=37, numClasses=2, numFeatures=1
model 1 build time 6.36 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6aa12386e872, depth=11, numNodes=37, numClasses=2, numFeatures=1
model 2 build time 6.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9412d5b70df0, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=72; n_part_act=72; df_count=5042740 ===



                                                                                

model 0 build time 11.05 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e401392d9b6a, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5badf521a28c, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9c306cc1b21c, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=8; n_part_act=8; df_count=7564110 ===



                                                                                

model 0 build time 12.8 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_545940607191, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e3e79146697b, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_229ef0f76bfe, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=12; n_part_act=12; df_count=7564110 ===



                                                                                

model 0 build time 11.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_403b95667952, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_598301989f9d, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_33b04dc31368, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=16; n_part_act=16; df_count=7564110 ===



                                                                                

model 0 build time 11.7 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_043db8f745de, depth=11, numNodes=43, numClasses=2, numFeatures=1




model 1 build time 8.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2aa692a27576, depth=11, numNodes=43, numClasses=2, numFeatures=1




model 2 build time 8.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cad5db9098b2, depth=11, numNodes=43, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=24; n_part_act=24; df_count=7564110 ===

model 0 build time 8.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5864f7927468, depth=14, numNodes=57, numClasses=2, numFeatures=1
model 1 build time 7.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1abcf4418b73, depth=14, numNodes=57, numClasses=2, numFeatures=1
model 2 build time 7.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_565396ed5470, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=36; n_part_act=36; df_count=7564110 ===



                                                                                

model 0 build time 7.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8a49884c8654, depth=10, numNodes=37, numClasses=2, numFeatures=1
model 1 build time 7.04 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3a6a351f456c, depth=10, numNodes=37, numClasses=2, numFeatures=1
model 2 build time 6.38 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_56e6b28a5fee, depth=10, numNodes=37, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=72; n_part_act=72; df_count=7564110 ===



                                                                                

model 0 build time 8.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9019499a1f4d, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2886cc4c30ff, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.03 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dc3cca4fd47d, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=8; n_part_act=8; df_count=12606850 ===



                                                                                

model 0 build time 17.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_529a3657bfd0, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_03a44a2fa640, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3a3824dbe439, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=12; n_part_act=12; df_count=12606850 ===



                                                                                

model 0 build time 12.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f7c9562c708c, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 12.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3632a3268532, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.52 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a7902f8d24b7, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=16; n_part_act=16; df_count=12606850 ===



                                                                                

model 0 build time 10.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_58e6b8ac6094, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.44 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b22308d74a75, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2812887985ea, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=24; n_part_act=24; df_count=12606850 ===



                                                                                

model 0 build time 9.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0cc14299fd60, depth=11, numNodes=43, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3abb26e127dc, depth=11, numNodes=43, numClasses=2, numFeatures=1




model 2 build time 8.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3ed9e5fe4dc0, depth=11, numNodes=43, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=36; n_part_act=36; df_count=12606850 ===



                                                                                

model 0 build time 8.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b21b186f9bc9, depth=9, numNodes=29, numClasses=2, numFeatures=1
model 1 build time 7.8 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9b65de294898, depth=9, numNodes=29, numClasses=2, numFeatures=1
model 2 build time 7.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_65cdb1e20c90, depth=9, numNodes=29, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=72; n_part_act=72; df_count=12606850 ===



                                                                                

model 0 build time 8.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_adc0e2880697, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.23 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5a3f0287aa34, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d7e4aa9020d6, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=8; n_part_act=8; df_count=25213700 ===



                                                                                

model 0 build time 36.92 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3e49db78f805, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 1 build time 28.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0bad2a3ef950, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 2 build time 26.52 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_21b8cf4d2396, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=12; n_part_act=12; df_count=25213700 ===



                                                                                

model 0 build time 21.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_df2cc112371f, depth=13, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 22.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_12092179ac03, depth=13, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d27925b3c83a, depth=13, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=16; n_part_act=16; df_count=25213700 ===



                                                                                

model 0 build time 17.28 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5175603fc1c5, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.07 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_94946f264917, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.02 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_01cb162d0f29, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=24; n_part_act=24; df_count=25213700 ===



                                                                                

model 0 build time 12.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb0d1a2b7762, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d9ecac8babd0, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f31a382d0663, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=36; n_part_act=36; df_count=25213700 ===



                                                                                

model 0 build time 12.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f1766868e2e6, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.69 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_126a157ce37c, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.01 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7dd97685a5fe, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=72; n_part_act=72; df_count=25213700 ===



                                                                                

model 0 build time 16.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bc8e2539669c, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 1 build time 12.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b068b6947708, depth=11, numNodes=37, numClasses=2, numFeatures=1




model 2 build time 11.12 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_06f71545d618, depth=11, numNodes=37, numClasses=2, numFeatures=1




# Results

In [6]:
di = result
df_res_time = pd.DataFrame()
df_res_nodes = pd.DataFrame()
for p1 in di.keys():
    for p2 in di[p1].keys():
        df_res_time.loc[p1, p2] = np.round(np.median(di[p1][p2][1]), 1)
        m_tmp = di[p1][p2][0]
        n_nodes = []
        for i,_ in enumerate(m_tmp):
            tmp = f'{m_tmp[i]}'.split(' ')
            print(i, tmp)
            n_nodes.append([int(x.split('=')[1][:-1]) for x in tmp if x[:4]== 'numN'][0])
        df_res_nodes.loc[p1, p2] = np.round(np.mean(n_nodes), 1)

df_res_time.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
df_res_nodes.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_9fc4c863286d,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_8d5a1ab56ea6,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_1124f765e59b,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_06b36774c047,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_5478081bbb39,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_ec66c7941687,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_bb2348ef1edf,', 'depth=14,', 'numNodes=123,', 'numC

Unnamed: 0,8,12,16,24,36,72
1,4.3,5.1,4.8,6.1,8.0,13.3
2,4.5,5.9,5.7,7.9,9.3,21.8
5,6.1,8.0,7.8,10.2,13.8,21.7
10,8.6,7.4,6.0,5.4,6.2,7.9
15,9.2,7.3,6.3,6.2,6.2,8.3
20,9.9,7.6,7.1,6.8,6.4,7.9
30,11.6,9.8,8.6,7.4,7.0,8.5
50,16.4,12.5,10.4,9.6,7.8,9.2
100,28.6,21.2,15.1,12.7,12.0,12.6


Unnamed: 0,8,12,16,24,36,72
1,123.0,123.0,123.0,123.0,123.0,123.0
2,145.0,129.0,131.0,137.0,135.0,135.0
5,139.0,141.0,115.0,127.0,51.0,95.0
10,55.0,85.0,107.0,73.0,53.0,69.0
15,59.0,113.0,57.0,49.0,77.0,61.0
20,71.0,65.0,41.0,77.0,37.0,73.0
30,49.0,41.0,43.0,57.0,37.0,35.0
50,39.0,35.0,47.0,43.0,29.0,47.0
100,33.0,41.0,41.0,37.0,45.0,37.0


In [7]:
#fn = '4cpu_by_8n__4m_yarn_ssd'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
print(fn)
tmp

8cpu_by_12n__8m_yarn_ssd_repart


Unnamed: 0.1,Unnamed: 0,8,12,16,24,36,72
0,1,4.3,5.1,4.8,6.1,8.0,13.3
1,2,4.5,5.9,5.7,7.9,9.3,21.8
2,5,6.1,8.0,7.8,10.2,13.8,21.7
3,10,8.6,7.4,6.0,5.4,6.2,7.9
4,15,9.2,7.3,6.3,6.2,6.2,8.3
5,20,9.9,7.6,7.1,6.8,6.4,7.9
6,30,11.6,9.8,8.6,7.4,7.0,8.5
7,50,16.4,12.5,10.4,9.6,7.8,9.2
8,100,28.6,21.2,15.1,12.7,12.0,12.6


In [8]:
#fn = '4cpu_by_8n__4m_yarn_ssd'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')
print(fn)
tmp

8cpu_by_12n__8m_yarn_ssd_repart


Unnamed: 0.1,Unnamed: 0,8,12,16,24,36,72
0,1,123.0,123.0,123.0,123.0,123.0,123.0
1,2,145.0,129.0,131.0,137.0,135.0,135.0
2,5,139.0,141.0,115.0,127.0,51.0,95.0
3,10,55.0,85.0,107.0,73.0,53.0,69.0
4,15,59.0,113.0,57.0,49.0,77.0,61.0
5,20,71.0,65.0,41.0,77.0,37.0,73.0
6,30,49.0,41.0,43.0,57.0,37.0,35.0
7,50,39.0,35.0,47.0,43.0,29.0,47.0
8,100,33.0,41.0,41.0,37.0,45.0,37.0
