# Library

In [1]:
#import sys
import pandas as pd
import numpy as np
import random
import pyspark
import itertools
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import statsmodels.api as sm

from pyspark import SparkContext, SQLContext

from math import sqrt
from time import time as ttt

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier as DTC_spark
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import joblib
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import parallel_backend


# Upload

In [2]:
spark = SparkSession.builder.master("Yarn").appName("spark_app_1234").getOrCreate()
sc = spark.sparkContext
spark
d0 = (
    spark
    .read
    .format("csv") 
    .option("header","true") 
    .option("inferSchema","true") 
    .load("gs://mas-a5-storage-1/notebooks/jupyter/application_train.csv")
)

d1 = d0.filter(d0.DAYS_EMPLOYED != 365243).select('TARGET','DAYS_EMPLOYED')

print(d1.rdd.getNumPartitions())

                                                                                

4


In [3]:
sc

# Function

In [4]:
def prepare_spark_data(n_part, k_mult):
    '''
    takes 'DAYS_EMPLOYED and 'TARGET'  from d1 (alreay filtered)
    rearrane in n partitions (if n==0 keeps initial number of partitions)
    prints final shape/ partition
    returns d2 - spark df
    '''
    data = d1
    data_new = data # first step in the cycle
    for i in range(k_mult-1):
        data_tmp = data.select('TARGET', \
                     f.col('DAYS_EMPLOYED')*(f.lit(0.9995) + f.rand()/1000)).\
                   toDF('TARGET','DAYS_EMPLOYED')
        data_tmp = data_tmp.select('TARGET', f.floor('DAYS_EMPLOYED'))
        data_new = data_new.union(data_tmp)

    assembler = VectorAssembler(inputCols=["DAYS_EMPLOYED"], 
                        outputCol="DAYS_EMPLOYED_vect")
    d2 = assembler.transform(data_new)
    if n_part != 0:
        d2 = d2.repartition(n_part)       
    #print(f'n-partitions initial: {d2.rdd.getNumPartitions()}; df size: {d2.count()}\n')

    return d2

# Params an run

In [None]:
## !! mind the fn NOT TO rewrite results
fn = '4cpu_by_4n__4m_yarn_ssd'
size_mult = [1, 2, 5, 10, 15, 20, 30, 50, 100]
partitions = [2, 4, 8, 16] 
# initial, 2**i incl n_nodes till n_cpu*n_nodes and n_cpu*n_nodes
n_iter = 3

print('d1-size', d1.count())
n_part_base = d1.rdd.getNumPartitions()
print('n-partitions initial', n_part_base, '\n')

rd1 = {}
for k_size_mult in size_mult:
    rd2 = {}
    for n_part in partitions:
        df = prepare_spark_data(n_part, k_size_mult)
        df.cache()
        print('======================================================================')
        print(f'=== size_mult={k_size_mult}; \
        n_part_req={n_part}; n_part_act={df.rdd.getNumPartitions()}; df_count={df.count()} ===\n')
        times = [0 for i in range(n_iter)]
        models = {}
        for i in range(n_iter):
            dt = DTC_spark(labelCol="TARGET",
                       featuresCol="DAYS_EMPLOYED_vect",
                       minInfoGain=0.0001,
                       impurity='entropy',
                       maxDepth=14, maxBins=2**14, # it differs from scikit learn - it means number of canidate split points
                       #minInstancesPerNode = 1,
                       #checkpointInterval = 10
                       )
           
            t0 = ttt()
            model = dt.fit(df)
            t1 = ttt()
            times[i] = t1-t0
            models[i] = model 
            print('model', i, 'build time', round(times[i],2), '\n', model)
        rd2[n_part] = (models, times)
    rd1[k_size_mult] = rd2

result = rd1  

                                                                                

d1-size 252137
n-partitions initial 4 



                                                                                

=== size_mult=1;         n_part_req=2; n_part_act=2; df_count=252137 ===



                                                                                

model 0 build time 6.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b30adb956b80, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f2d4a7ccf821, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 6.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_743212ae4e3c, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=4; n_part_act=4; df_count=252137 ===



                                                                                

model 0 build time 7.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c2b28cc4ad08, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 7.71 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1d3589241bea, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 7.55 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7b4a57dc6944, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=8; n_part_act=8; df_count=252137 ===



                                                                                

model 0 build time 10.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fafd2a48ffa6, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_81a33bff7dbb, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4746cb5ff66f, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=16; n_part_act=16; df_count=252137 ===



                                                                                

model 0 build time 15.02 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bb63de6e5e1e, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.46 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dec9eae2a86e, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.35 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0bf1a703dbec, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=2; n_part_act=2; df_count=504274 ===



                                                                                

model 0 build time 10.65 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e51e0270ddc7, depth=14, numNodes=125, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_baa4c4f86761, depth=14, numNodes=125, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b0e737136fad, depth=14, numNodes=125, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=4; n_part_act=4; df_count=504274 ===

model 0 build time 5.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3d34d9c8d75e, depth=14, numNodes=121, numClasses=2, numFeatures=1
model 1 build time 5.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c5894139fe2b, depth=14, numNodes=121, numClasses=2, numFeatures=1


[Stage 1176:>                                                       (0 + 4) / 4]                                                                                

model 2 build time 5.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5fddee4b73e6, depth=14, numNodes=121, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=8; n_part_act=8; df_count=504274 ===





model 0 build time 6.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7a844e922350, depth=14, numNodes=89, numClasses=2, numFeatures=1


                                                                                

model 1 build time 6.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0d2e460916c4, depth=14, numNodes=89, numClasses=2, numFeatures=1




model 2 build time 6.61 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_84cb1b8e06ed, depth=14, numNodes=89, numClasses=2, numFeatures=1


                                                                                

=== size_mult=2;         n_part_req=16; n_part_act=16; df_count=504274 ===



                                                                                

model 0 build time 10.31 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a2ce9aac7b93, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.1 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6ab853483726, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b29378ac7016, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=2; n_part_act=2; df_count=1260685 ===



                                                                                

model 0 build time 15.12 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6b1f7946f80a, depth=14, numNodes=85, numClasses=2, numFeatures=1


                                                                                

model 1 build time 14.77 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6d393d366ab5, depth=14, numNodes=85, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e8fd1dd8f7fa, depth=14, numNodes=85, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=4; n_part_act=4; df_count=1260685 ===



                                                                                

model 0 build time 12.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9c6895908263, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.97 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1e1a789f21d4, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.73 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_782200602465, depth=14, numNodes=55, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=8; n_part_act=8; df_count=1260685 ===



                                                                                

model 0 build time 14.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6ec150418025, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

model 1 build time 8.84 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_14868690f5bd, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.67 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_523532abd53c, depth=14, numNodes=73, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=16; n_part_act=16; df_count=1260685 ===



                                                                                

model 0 build time 22.05 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_79759d89e350, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 1 build time 21.98 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7ebe8798dfdf, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

model 2 build time 22.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c298f23d8799, depth=14, numNodes=83, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=2; n_part_act=2; df_count=2521370 ===



                                                                                

model 0 build time 21.69 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_899c4e6e1cc9, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 1 build time 20.51 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_637de873a934, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.81 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2b8362917607, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=4; n_part_act=4; df_count=2521370 ===



                                                                                

model 0 build time 13.69 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_62fe4b2cd7f9, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 1 build time 12.02 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f2564e3c208f, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.18 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_825412832822, depth=13, numNodes=51, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=8; n_part_act=8; df_count=2521370 ===



                                                                                

model 0 build time 12.22 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1b8d86c22124, depth=14, numNodes=107, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.08 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7d246fc7b247, depth=14, numNodes=107, numClasses=2, numFeatures=1


                                                                                

model 2 build time 8.68 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_47b77c49c70c, depth=14, numNodes=107, numClasses=2, numFeatures=1


                                                                                

=== size_mult=10;         n_part_req=16; n_part_act=16; df_count=2521370 ===



                                                                                

model 0 build time 11.08 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ae798074a56e, depth=14, numNodes=99, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f22d7a0cf851, depth=14, numNodes=99, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9db6bbfe17e7, depth=14, numNodes=99, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=2; n_part_act=2; df_count=3782055 ===



                                                                                

model 0 build time 26.26 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ea6c749a3158, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 1 build time 25.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_45059424d3c1, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 2 build time 25.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_18b2de402b4c, depth=14, numNodes=51, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=4; n_part_act=4; df_count=3782055 ===



                                                                                

model 0 build time 15.65 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_74f707620279, depth=11, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5a2f82da5905, depth=11, numNodes=51, numClasses=2, numFeatures=1


                                                                                

model 2 build time 14.32 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_27bcf1666b4a, depth=11, numNodes=51, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=8; n_part_act=8; df_count=3782055 ===



                                                                                

model 0 build time 9.66 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_093070c2a2a9, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 1 build time 9.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7f47c24ee4df, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

model 2 build time 9.19 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_597d3307b2c9, depth=14, numNodes=69, numClasses=2, numFeatures=1


                                                                                

=== size_mult=15;         n_part_req=16; n_part_act=16; df_count=3782055 ===



                                                                                

model 0 build time 10.01 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d52f9fbdde9d, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.99 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e284d89d7d76, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.79 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e05c579eeb27, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=2; n_part_act=2; df_count=5042740 ===



                                                                                

model 0 build time 31.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ad0f86450a55, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 28.21 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_340aa2538734, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 28.6 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_63accc4464d9, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=4; n_part_act=4; df_count=5042740 ===



                                                                                

model 0 build time 17.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4fdf4e432a3a, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 1 build time 16.52 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e561d9f88f3e, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.73 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1415710e8bc8, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=8; n_part_act=8; df_count=5042740 ===



                                                                                

model 0 build time 12.29 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3246aa146ad3, depth=10, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a7361fdb4d5d, depth=10, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.24 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_53ab2aca6dc6, depth=10, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=20;         n_part_req=16; n_part_act=16; df_count=5042740 ===



                                                                                

model 0 build time 10.93 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8f96a831c5c5, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.52 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e558f940a3d2, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.69 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e679e9be9906, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=2; n_part_act=2; df_count=7564110 ===



                                                                                

model 0 build time 40.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f168066ab26c, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 1 build time 39.47 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dbb8de7b15fd, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

model 2 build time 39.12 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7cc0cdd002c9, depth=14, numNodes=63, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=4; n_part_act=4; df_count=7564110 ===



                                                                                

model 0 build time 22.14 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6bc8843cbd5c, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 1 build time 23.11 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cf692767c433, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

model 2 build time 20.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e1216f868c22, depth=12, numNodes=45, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=8; n_part_act=8; df_count=7564110 ===



                                                                                

model 0 build time 14.05 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6b39bac636ef, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 1 build time 15.34 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3e116dee3661, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 2 build time 15.49 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9af6445514eb, depth=11, numNodes=37, numClasses=2, numFeatures=1


                                                                                

=== size_mult=30;         n_part_req=16; n_part_act=16; df_count=7564110 ===



                                                                                

model 0 build time 15.31 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_225e16aaf894, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0e27aa2869c3, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

model 2 build time 13.95 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b32f4b3de279, depth=14, numNodes=49, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=2; n_part_act=2; df_count=12606850 ===



                                                                                

model 0 build time 55.2 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f18bcbfd66e4, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 57.62 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2dc8e04b8648, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 58.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_87784ba4effd, depth=11, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=4; n_part_act=4; df_count=12606850 ===



                                                                                

model 0 build time 32.42 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cb2fc235b04e, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 1 build time 30.17 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_019ba0d1c5fb, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

model 2 build time 30.03 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_19dd622da30e, depth=12, numNodes=41, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=8; n_part_act=8; df_count=12606850 ===



                                                                                

model 0 build time 19.37 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_81cfb8ff6492, depth=10, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 1 build time 18.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_56ab6fbc1ee5, depth=10, numNodes=37, numClasses=2, numFeatures=1


                                                                                

model 2 build time 18.97 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e230d4169ac8, depth=10, numNodes=37, numClasses=2, numFeatures=1


                                                                                

=== size_mult=50;         n_part_req=16; n_part_act=16; df_count=12606850 ===



                                                                                

model 0 build time 22.75 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_07a0fc94fcb2, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 1 build time 17.53 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d8988de5da84, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

model 2 build time 16.33 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d7b075492f1c, depth=14, numNodes=59, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=2; n_part_act=2; df_count=25213700 ===



                                                                                

model 0 build time 96.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d44f92c68c21, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 1 build time 87.82 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fc56773b25c9, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

model 2 build time 90.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1c318b879874, depth=12, numNodes=39, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=4; n_part_act=4; df_count=25213700 ===



                                                                                

model 0 build time 61.16 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b6c009cf5f62, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 1 build time 51.76 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f8393789a9fe, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

model 2 build time 62.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6582d0a4e22b, depth=10, numNodes=33, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=8; n_part_act=8; df_count=25213700 ===



                                                                                

model 0 build time 36.87 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1126b68fe28a, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 1 build time 34.9 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c1d45c53dc6c, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

model 2 build time 32.25 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_784a5e1b8af4, depth=12, numNodes=47, numClasses=2, numFeatures=1


                                                                                

=== size_mult=100;         n_part_req=16; n_part_act=16; df_count=25213700 ===



                                                                                

model 0 build time 37.88 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5ee2c18a9caa, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

model 1 build time 29.57 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1b84bb484f88, depth=10, numNodes=35, numClasses=2, numFeatures=1




model 2 build time 26.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_07638440d27d, depth=10, numNodes=35, numClasses=2, numFeatures=1


                                                                                

# Results

In [10]:
di = result
df_res_time = pd.DataFrame()
df_res_nodes = pd.DataFrame()
for p1 in di.keys():
    for p2 in di[p1].keys():
        df_res_time.loc[p1, p2] = np.round(np.median(di[p1][p2][1]), 1)
        m_tmp = di[p1][p2][0]
        n_nodes = []
        for i,_ in enumerate(m_tmp):
            tmp = f'{m_tmp[i]}'.split(' ')
            print(i, tmp)
            n_nodes.append([int(x.split('=')[1][:-1]) for x in tmp if x[:4]== 'numN'][0])
        df_res_nodes.loc[p1, p2] = np.round(np.mean(n_nodes), 1)

df_res_time.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
df_res_nodes.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_b30adb956b80,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_f2d4a7ccf821,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_743212ae4e3c,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_c2b28cc4ad08,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_1d3589241bea,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_7b4a57dc6944,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_fafd2a48ffa6,', 'depth=14,', 'numNodes=123,', 'numC

Unnamed: 0,2,4,8,16
1,6.5,7.7,10.1,15.4
2,8.6,5.6,6.6,10.3
5,14.8,11.0,8.8,22.1
10,20.5,12.2,9.1,10.3
15,25.9,15.2,9.3,10.8
20,28.6,16.7,11.2,10.7
30,39.5,22.1,15.3,13.9
50,57.6,30.2,19.0,17.5
100,90.4,61.2,34.9,29.6


Unnamed: 0,2,4,8,16
1,123.0,123.0,123.0,123.0
2,125.0,121.0,89.0,83.0
5,85.0,55.0,73.0,83.0
10,69.0,51.0,107.0,99.0
15,51.0,51.0,69.0,35.0
20,39.0,59.0,41.0,49.0
30,63.0,45.0,37.0,49.0
50,39.0,41.0,37.0,59.0
100,39.0,33.0,47.0,35.0


In [11]:
fn = '4cpu_by_8n__4m_yarn_ssd'

tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
tmp

Unnamed: 0.1,Unnamed: 0,2,4,8,16
0,1,6.5,7.7,10.1,15.4
1,2,8.6,5.6,6.6,10.3
2,5,14.8,11.0,8.8,22.1
3,10,20.5,12.2,9.1,10.3
4,15,25.9,15.2,9.3,10.8
5,20,28.6,16.7,11.2,10.7
6,30,39.5,22.1,15.3,13.9
7,50,57.6,30.2,19.0,17.5
8,100,90.4,61.2,34.9,29.6


In [12]:
fn = '4cpu_by_8n__4m_yarn_ssd'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')
tmp

Unnamed: 0.1,Unnamed: 0,2,4,8,16
0,1,123.0,123.0,123.0,123.0
1,2,125.0,121.0,89.0,83.0
2,5,85.0,55.0,73.0,83.0
3,10,69.0,51.0,107.0,99.0
4,15,51.0,51.0,69.0,35.0
5,20,39.0,59.0,41.0,49.0
6,30,63.0,45.0,37.0,49.0
7,50,39.0,41.0,37.0,59.0
8,100,39.0,33.0,47.0,35.0
