# Library

In [2]:
#import sys
import pandas as pd
import numpy as np
import random
import pyspark
import itertools
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import statsmodels.api as sm

from pyspark import SparkContext, SQLContext

from math import sqrt
from time import time as ttt

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier as DTC_spark
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import joblib
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import parallel_backend


# Upload

In [2]:
spark = SparkSession.builder.master("local").appName("spark_app_1234").getOrCreate()
sc = spark.sparkContext
spark
d0 = (
    spark
    .read
    .format("csv") 
    .option("header","true") 
    .option("inferSchema","true") 
    .load("gs://mas-a5-storage-1/notebooks/jupyter/application_train.csv")
)

d1 = d0.filter(d0.DAYS_EMPLOYED != 365243).select('TARGET','DAYS_EMPLOYED')

print(d1.rdd.getNumPartitions())

                                                                                

4


# Function

In [3]:
def prepare_spark_data(n_part, k_mult):
    '''
    takes 'DAYS_EMPLOYED and 'TARGET'  from d1 (alreay filtered)
    rearrane in n partitions (if n==0 keeps initial number of partitions)
    prints final shape/ partition
    returns d2 - spark df
    '''
    data = d1
    data_new = data # first step in the cycle
    for i in range(k_mult-1):
        data_tmp = data.select('TARGET', \
                     f.col('DAYS_EMPLOYED')*(f.lit(0.9995) + f.rand()/1000)).\
                   toDF('TARGET','DAYS_EMPLOYED')
        data_tmp = data_tmp.select('TARGET', f.floor('DAYS_EMPLOYED'))
        data_new = data_new.union(data_tmp)

    assembler = VectorAssembler(inputCols=["DAYS_EMPLOYED"], 
                        outputCol="DAYS_EMPLOYED_vect")
    d2 = assembler.transform(data_new)
    if n_part != 0:
        d2 = d2.repartition(n_part)       
    #print(f'n-partitions initial: {d2.rdd.getNumPartitions()}; df size: {d2.count()}\n')

    return d2

# Params an run

In [None]:
fn = '4cpu_by_6n__4m'
size_mult = [1, 2, 5, 10, 15, 20, 30, 50]
partitions = [0, 2, 4, 6, 8, 16, 24] 
# initial, 2**i incl n_nodes till n_cpu*n_nodes and n_cpu*n_nodes
n_iter = 3

print('d1-size', d1.count())
n_part_base = d1.rdd.getNumPartitions()
print('n-partitions initial', n_part_base, '\n')

rd1 = {}
for k_size_mult in size_mult:
    rd2 = {}
    for n_part in partitions:
        df = prepare_spark_data(n_part, k_size_mult)
        df.cache()
        print('======================================================================')
        print(f'=== size_mult={k_size_mult}; \
        n_part_req={n_part}; n_part_act={df.rdd.getNumPartitions()}; df_count={df.count()} ===\n')
        times = [0 for i in range(n_iter)]
        models = {}
        for i in range(n_iter):
            dt = DTC_spark(labelCol="TARGET",
                       featuresCol="DAYS_EMPLOYED_vect",
                       minInfoGain=0.0001,
                       impurity='entropy',
                       maxDepth=14, maxBins=2**14, # it differs from scikit learn - it means number of canidate split points
                       #minInstancesPerNode = 1,
                       #checkpointInterval = 10
                       )
           
            t0 = ttt()
            model = dt.fit(df)
            t1 = ttt()
            times[i] = t1-t0
            models[i] = model 
            print('model', i, 'build time', round(times[i],2), '\n', model)
        rd2[n_part] = (models, times)
    rd1[k_size_mult] = rd2

result = rd1  

                                                                                

d1-size 252137
n-partitions initial 4 





=== size_mult=1;         n_part_req=0; n_part_act=14; df_count=252137 ===

model 0 build time 6.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5d1823802f8f, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 4.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_158adef97fe0, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 2 build time 4.65 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4723484652af, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=2; n_part_act=2; df_count=252137 ===



                                                                                

model 0 build time 7.36 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e21e7f3b1495, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_13872054975f, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3d6639470fe1, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=4; n_part_act=4; df_count=252137 ===



                                                                                

model 0 build time 5.79 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_96b9486294a7, depth=14, numNodes=123, numClasses=2, numFeatures=1




model 1 build time 5.78 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aa01a4e7e44d, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 2 build time 5.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0ebaaccf6fca, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=6; n_part_act=6; df_count=252137 ===





model 0 build time 5.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3e7a598034bd, depth=14, numNodes=123, numClasses=2, numFeatures=1




model 1 build time 5.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6efae50d3c69, depth=14, numNodes=123, numClasses=2, numFeatures=1




model 2 build time 5.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e28c769f4177, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=8; n_part_act=8; df_count=252137 ===





model 0 build time 5.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_51dcefce62b4, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 1 build time 5.38 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ac749450cce2, depth=14, numNodes=123, numClasses=2, numFeatures=1
model 2 build time 5.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_788f78ad9918, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=16; n_part_act=16; df_count=252137 ===



                                                                                

model 0 build time 8.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aef97809cb29, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_61037083d803, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0a4dddd70d3b, depth=14, numNodes=123, numClasses=2, numFeatures=1
=== size_mult=1;         n_part_req=24; n_part_act=24; df_count=252137 ===



                                                                                

model 0 build time 9.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7bc2ffe02771, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_30022cf2dc49, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.7 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_41e6387eb38d, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=0; n_part_act=28; df_count=504274 ===



                                                                                

model 0 build time 7.29 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0c648a8b1ea5, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a1f4e4c2a7e3, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b3ed26cb3275, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=2; n_part_act=2; df_count=504274 ===



                                                                                

model 0 build time 9.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a5f85088de3b, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_feb949901f41, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c9ab9c3375a8, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=4; n_part_act=4; df_count=504274 ===



                                                                                

model 0 build time 6.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f88eb2d47bb3, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d8e5eaaae95e, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ec802a2fe38c, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=6; n_part_act=6; df_count=504274 ===



                                                                                

model 0 build time 7.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ca7cbc81ff4d, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d58a6edc47d5, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_11e74ccee84c, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=8; n_part_act=8; df_count=504274 ===



                                                                                

model 0 build time 8.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_28f4be3fffb0, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fb9583a018c5, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1a72cf282d61, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=16; n_part_act=16; df_count=504274 ===



                                                                                

model 0 build time 11.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6293073a4606, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.68 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_39ab698ba9ff, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_87bb18b19851, depth=14, numNodes=151, numClasses=2, numFeatures=1
=== size_mult=2;         n_part_req=24; n_part_act=24; df_count=504274 ===



                                                                                

model 0 build time 14.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8566c96529cd, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_12e25b92499d, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

model 2 build time 14.27 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6e73d70eec0c, depth=14, numNodes=151, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=0; n_part_act=70; df_count=1260685 ===



                                                                                

model 0 build time 16.05 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb5f26c1dc3a, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 17.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b4d2f09f40bc, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e2192f2d7200, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=2; n_part_act=2; df_count=1260685 ===



                                                                                

model 0 build time 15.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_56efb18276d0, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1952cb801d2e, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e03a51fdc08c, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=4; n_part_act=4; df_count=1260685 ===



                                                                                

model 0 build time 9.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8ed849004745, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.8 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3a02c3e8f05d, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f17863b4bc4f, depth=13, numNodes=67, numClasses=2, numFeatures=1
=== size_mult=5;         n_part_req=6; n_part_act=6; df_count=1260685 ===



                                                                                

model 0 build time 9.74 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_516e5d184a85, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.58 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4d333f3cb5af, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.38 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_16d4c4d0c576, depth=13, numNodes=67, numClasses=2, numFeatures=1
=== size_mult=5;         n_part_req=8; n_part_act=8; df_count=1260685 ===



                                                                                

model 0 build time 10.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9ee675cd3331, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.39 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f4fe748e8f9f, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.07 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_26644266ae6b, depth=13, numNodes=67, numClasses=2, numFeatures=1
=== size_mult=5;         n_part_req=16; n_part_act=16; df_count=1260685 ===



                                                                                

model 0 build time 13.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_048fd5602380, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.4 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_afd1145326bb, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.43 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7b849ad9c356, depth=13, numNodes=67, numClasses=2, numFeatures=1
=== size_mult=5;         n_part_req=24; n_part_act=24; df_count=1260685 ===



                                                                                

model 0 build time 16.99 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4818a1de1f7d, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.79 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1b53f355b238, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_315c7e817f0f, depth=13, numNodes=67, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=0; n_part_act=180; df_count=2521370 ===



                                                                                

model 0 build time 42.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d7ee8c05d2df, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 38.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2e3fefe97478, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 36.26 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1c21242dc834, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=2; n_part_act=2; df_count=2521370 ===



                                                                                

model 0 build time 17.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_64bbd53ecdeb, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 18.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f4ab65f631a1, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.02 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c4308c727ad4, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=4; n_part_act=4; df_count=2521370 ===



                                                                                

model 0 build time 13.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6aa9dbb85743, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7597b94e60a7, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fa87d718693a, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=6; n_part_act=6; df_count=2521370 ===



                                                                                

model 0 build time 16.26 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_522992ae613d, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_52ce886ad9be, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f1ea6fa9703a, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=8; n_part_act=8; df_count=2521370 ===



                                                                                

model 0 build time 20.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4ec13c8556ea, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 19.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e71dc84913cc, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 19.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2ece1335cb6f, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=16; n_part_act=16; df_count=2521370 ===



                                                                                

model 0 build time 25.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_101aadcb28b0, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 26.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_aa7aa3467735, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 25.94 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2d9669c9be47, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=24; n_part_act=24; df_count=2521370 ===



                                                                                

model 0 build time 32.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_06fb4d37acd2, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 1 build time 32.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9875359e2770, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

model 2 build time 32.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6feacebde2cf, depth=13, numNodes=65, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=0; n_part_act=330; df_count=3782055 ===



                                                                                

model 0 build time 72.74 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_478edb2a7247, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 68.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bfed00935357, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 67.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0369f639a339, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=2; n_part_act=2; df_count=3782055 ===



                                                                                

model 0 build time 24.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_966cbb4cd087, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 25.08 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0ff5665ed51a, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 24.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1fddd016d7fb, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=4; n_part_act=4; df_count=3782055 ===



                                                                                

model 0 build time 14.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f4c9421d5430, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.96 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4222217ebba3, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fea78ab9a1ef, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=6; n_part_act=6; df_count=3782055 ===



                                                                                

model 0 build time 19.96 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_08acb2acfe27, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 19.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d3941c7330c5, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 20.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_892e7bec4417, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=8; n_part_act=8; df_count=3782055 ===



                                                                                

model 0 build time 25.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b9cc28dfaa95, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 25.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_48b92580d74e, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 25.42 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_70b41f1dfebb, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=16; n_part_act=16; df_count=3782055 ===



                                                                                

model 0 build time 32.15 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bdc4ec6c4680, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 33.85 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_761e5bf5a66c, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 31.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_538d4f1551a8, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=24; n_part_act=24; df_count=3782055 ===



                                                                                

model 0 build time 39.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c77e53b78839, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 39.96 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a744678eadd1, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 40.07 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6529a8113495, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=0; n_part_act=480; df_count=5042740 ===



                                                                                

model 0 build time 97.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_65ee52a156c8, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 96.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_177db8c0c1ce, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 95.52 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_61ebb82a742c, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=2; n_part_act=2; df_count=5042740 ===



                                                                                

model 0 build time 27.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dc5cb3e5710b, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 27.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e49f1a8473b3, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 27.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8400d537f0a1, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=4; n_part_act=4; df_count=5042740 ===



                                                                                

model 0 build time 17.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_734830a011fe, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.14 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e0637b65a952, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_337830fe511b, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=6; n_part_act=6; df_count=5042740 ===



                                                                                

model 0 build time 17.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_75cbd1668743, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1a5a8fd6ec48, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.54 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d41d4be066a7, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=8; n_part_act=8; df_count=5042740 ===



                                                                                

model 0 build time 15.78 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_372cdb2ad98c, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_73b37f493417, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.03 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ef1ceb37ca0e, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=16; n_part_act=16; df_count=5042740 ===



                                                                                

model 0 build time 19.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_62754d90535c, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 19.78 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_faceeaea92de, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d4be9eaad29f, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=24; n_part_act=24; df_count=5042740 ===



                                                                                

model 0 build time 22.43 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_927f7418ccc0, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 1 build time 23.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5f36220f7f81, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

model 2 build time 23.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1fb8273c0764, depth=14, numNodes=57, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=0; n_part_act=720; df_count=7564110 ===



                                                                                

model 0 build time 128.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b4209e444e18, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 127.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d04c974fee94, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 127.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_74df4e95fc86, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=2; n_part_act=2; df_count=7564110 ===



                                                                                

model 0 build time 31.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7e7bbbfcd9d7, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 34.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c373809f365f, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 30.28 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2454711ee9a4, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=4; n_part_act=4; df_count=7564110 ===



                                                                                

model 0 build time 17.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_56e444a545e9, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 19.63 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d9f88fe2409d, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.07 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e8149e7e984d, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=6; n_part_act=6; df_count=7564110 ===



                                                                                

model 0 build time 23.09 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8a780ec4b533, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 25.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c0242cb16d94, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 23.13 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ed2db0f6ab70, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=8; n_part_act=8; df_count=7564110 ===



                                                                                

model 0 build time 31.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7a64dc9b35df, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 30.28 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ed1a4a7e3f48, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 31.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_47e5afefaa6e, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=16; n_part_act=16; df_count=7564110 ===



                                                                                

model 0 build time 38.5 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0de5ac80e438, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 39.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0e454e6441fe, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 39.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e2af1631596c, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=24; n_part_act=24; df_count=7564110 ===



                                                                                

model 0 build time 46.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_12dcd42380cd, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 46.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9019849c20e3, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 46.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fb0cf2dae2d9, depth=11, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=0; n_part_act=1200; df_count=12606850 ===



                                                                                

model 0 build time 214.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1568b159a9e2, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 213.92 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1662241a4034, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 214.39 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c8fda7c359e5, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=2; n_part_act=2; df_count=12606850 ===



                                                                                

model 0 build time 47.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6061994d94e5, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 46.2 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ecbef55acb0f, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 51.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c6206a6aa425, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=4; n_part_act=4; df_count=12606850 ===



                                                                                

model 0 build time 24.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3c1e33d0ebd0, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 28.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3fce322f4c48, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 24.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bca1b6e86382, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=6; n_part_act=6; df_count=12606850 ===



                                                                                

model 0 build time 26.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6c73f0223172, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 27.72 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c9b8464a71f2, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 27.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_86ad90767401, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=8; n_part_act=8; df_count=12606850 ===



                                                                                

model 0 build time 26.58 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0b756ab94c74, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 24.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d0996f2f8197, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 24.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b987fb8f4483, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=16; n_part_act=16; df_count=12606850 ===



                                                                                

model 0 build time 28.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f32a4c2c7438, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 28.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c546fa9d8242, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 28.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_346e14e82005, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=24; n_part_act=24; df_count=12606850 ===



                                                                                

model 0 build time 30.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9fe0dd9fa96f, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 31.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3aede4b01492, depth=13, numNodes=47, numClasses=2, numFeatures=1




model 2 build time 32.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d4ab2ff1087c, depth=13, numNodes=47, numClasses=2, numFeatures=1


                                                                                

# Results

In [7]:
di = result
df_res_time = pd.DataFrame()
df_res_nodes = pd.DataFrame()
for p1 in di.keys():
    for p2 in di[p1].keys():
        df_res_time.loc[p1, p2] = np.round(np.mean(di[p1][p2][1]), 1)
        m_tmp = di[p1][p2][0]
        n_nodes = []
        for i,_ in enumerate(m_tmp):
            tmp = f'{m_tmp[i]}'.split(' ')
            print(i, tmp)
            n_nodes.append([int(x.split('=')[1][:-1]) for x in tmp if x[:4]== 'numN'][0])
        df_res_nodes.loc[p1, p2] = np.round(np.mean(n_nodes), 1)

df_res_time.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
df_res_nodes.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_5d1823802f8f,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_158adef97fe0,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_4723484652af,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_e21e7f3b1495,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_13872054975f,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_3d6639470fe1,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_96b9486294a7,', 'depth=14,', 'numNodes=123,', 'numC

Unnamed: 0,0,2,4,6,8,16,24
1,5.2,7.1,5.7,5.5,5.5,8.5,9.8
2,7.0,9.5,6.2,8.0,8.0,11.5,14.4
5,17.4,14.3,9.4,9.6,10.2,13.9,16.9
10,39.0,18.1,11.8,16.0,19.6,26.2,32.5
15,69.8,24.6,14.3,20.0,25.7,32.6,39.8
20,96.5,27.7,15.9,16.2,15.6,19.3,23.1
30,128.2,32.1,18.3,23.9,31.2,39.2,46.6
50,214.4,48.4,26.0,27.4,25.2,28.6,31.9


Unnamed: 0,0,2,4,6,8,16,24
1,123.0,123.0,123.0,123.0,123.0,123.0,123.0
2,151.0,151.0,151.0,151.0,151.0,151.0,151.0
5,67.0,67.0,67.0,67.0,67.0,67.0,67.0
10,65.0,65.0,65.0,65.0,65.0,65.0,65.0
15,35.0,35.0,35.0,35.0,35.0,35.0,35.0
20,57.0,57.0,57.0,57.0,57.0,57.0,57.0
30,41.0,41.0,41.0,41.0,41.0,41.0,41.0
50,47.0,47.0,47.0,47.0,47.0,47.0,47.0


In [3]:
fn = '4cpu_by_6n__4m'

tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
tmp

Unnamed: 0.1,Unnamed: 0,0,2,4,6,8,16,24
0,1,5.2,7.1,5.7,5.5,5.5,8.5,9.8
1,2,7.0,9.5,6.2,8.0,8.0,11.5,14.4
2,5,17.4,14.3,9.4,9.6,10.2,13.9,16.9
3,10,39.0,18.1,11.8,16.0,19.6,26.2,32.5
4,15,69.8,24.6,14.3,20.0,25.7,32.6,39.8
5,20,96.5,27.7,15.9,16.2,15.6,19.3,23.1
6,30,128.2,32.1,18.3,23.9,31.2,39.2,46.6
7,50,214.4,48.4,26.0,27.4,25.2,28.6,31.9


In [4]:
fn = '4cpu_by_6n__4m'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')
tmp

Unnamed: 0.1,Unnamed: 0,0,2,4,6,8,16,24
0,1,123.0,123.0,123.0,123.0,123.0,123.0,123.0
1,2,151.0,151.0,151.0,151.0,151.0,151.0,151.0
2,5,67.0,67.0,67.0,67.0,67.0,67.0,67.0
3,10,65.0,65.0,65.0,65.0,65.0,65.0,65.0
4,15,35.0,35.0,35.0,35.0,35.0,35.0,35.0
5,20,57.0,57.0,57.0,57.0,57.0,57.0,57.0
6,30,41.0,41.0,41.0,41.0,41.0,41.0,41.0
7,50,47.0,47.0,47.0,47.0,47.0,47.0,47.0
