# Library

In [14]:
#import sys
import pandas as pd
import numpy as np
import random
import pyspark
import itertools
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import pickle
import statsmodels.api as sm

from pyspark import SparkContext, SQLContext

from math import sqrt
from time import time as ttt

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier as DTC_spark
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import joblib
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import parallel_backend


# Upload

In [15]:
spark = SparkSession.builder.master("local").appName("spark_app_1234").getOrCreate()
sc = spark.sparkContext
spark
d0 = (
    spark
    .read
    .format("csv") 
    .option("header","true") 
    .option("inferSchema","true") 
    .load("/Users/AM/home-credit-default-risk/application_train.csv")
)

d1 = d0.filter(d0.DAYS_EMPLOYED != 365243).select('TARGET','DAYS_EMPLOYED')

print(d1.rdd.getNumPartitions())



2


                                                                                

# Function

In [16]:
def prepare_spark_data(n_part, k_mult):
    '''
    takes 'DAYS_EMPLOYED and 'TARGET'  from d1 (alreay filtered)
    rearrane in n partitions (if n==0 keeps initial number of partitions)
    prints final shape/ partition
    returns d2 - spark df
    '''
    data = d1
    data_new = data # first step in the cycle
    for i in range(k_mult-1):
        data_tmp = data.select('TARGET', \
                     f.col('DAYS_EMPLOYED')*(f.lit(0.9995) + f.rand()/1000)).\
                   toDF('TARGET','DAYS_EMPLOYED')
        data_tmp = data_tmp.select('TARGET', f.floor('DAYS_EMPLOYED'))
        data_new = data_new.union(data_tmp)

    assembler = VectorAssembler(inputCols=["DAYS_EMPLOYED"], 
                        outputCol="DAYS_EMPLOYED_vect")
    d2 = assembler.transform(data_new)
    if n_part != 0:
        d2 = d2.repartition(n_part, "DAYS_EMPLOYED_vect")       
    #print(f'n-partitions initial: {d2.rdd.getNumPartitions()}; df size: {d2.count()}\n')

    return d2

# Params an run

In [17]:
## !! mind the fn NOT TO rewrite results
fn = 'local'
size_mult = [1, 5]
partitions = [2, 4, 6, 8] 
# initial, 2**i incl n_nodes till n_cpu*n_nodes and n_cpu*n_nodes
n_iter = 3

print('d1-size', d1.count())
n_part_base = d1.rdd.getNumPartitions()
print('n-partitions initial', n_part_base, '\n')

rd1 = {}
for k_size_mult in size_mult:
    rd2 = {}
    for n_part in partitions:
        df = prepare_spark_data(n_part, k_size_mult)
        df.cache()
        print('======================================================================')
        print(f'=== size_mult={k_size_mult}; \
        n_part_req={n_part}; n_part_act={df.rdd.getNumPartitions()}; df_count={df.count()} ===\n')
        times = [0 for i in range(n_iter)]
        models = {}
        for i in range(n_iter):
            dt = DTC_spark(labelCol="TARGET",
                       featuresCol="DAYS_EMPLOYED_vect",
                       minInfoGain=0.0001,
                       impurity='entropy',
                       maxDepth=14, maxBins=2**14, # it differs from scikit learn - it means number of canidate split points
                       #minInstancesPerNode = 1,
                       #checkpointInterval = 10
                       )
           
            t0 = ttt()
            model = dt.fit(df)
            t1 = ttt()
            times[i] = t1-t0
            models[i] = model 
            print('model', i, 'build time', round(times[i],2), '\n', model)
        rd2[n_part] = (models, times)
    rd1[k_size_mult] = rd2

result = rd1  

                                                                                

d1-size 252137
n-partitions initial 2 



                                                                                

=== size_mult=1;         n_part_req=2; n_part_act=2; df_count=252137 ===



                                                                                

model 0 build time 10.41 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8520c350c004, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 10.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f1f264be0b37, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 10.5 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8909a9d4a951, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=4; n_part_act=4; df_count=252137 ===



21/11/24 23:35:33 WARN BlockManager: Asked to remove block broadcast_3894_piece0, which does not exist
                                                                                

model 0 build time 11.91 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_16b4aca40c7c, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 11.06 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_51bef95b4cec, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 11.42 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a9931982c69d, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=6; n_part_act=6; df_count=252137 ===



                                                                                

model 0 build time 13.03 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_43ad24fb5f93, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 12.86 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f8c55491782d, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 12.89 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4d7bb9132227, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=1;         n_part_req=8; n_part_act=8; df_count=252137 ===



                                                                                

model 0 build time 14.2 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_57552a4ac81c, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 13.83 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0d1d3a399ae9, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 14.27 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_191b17ae1e65, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=2; n_part_act=2; df_count=1260685 ===



                                                                                

model 0 build time 24.58 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_300a0a7b2896, depth=14, numNodes=75, numClasses=2, numFeatures=1


                                                                                

model 1 build time 24.74 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e8813a4fcb90, depth=14, numNodes=75, numClasses=2, numFeatures=1


                                                                                

model 2 build time 25.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_57c2d4e18670, depth=14, numNodes=75, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=4; n_part_act=4; df_count=1260685 ===



                                                                                

model 0 build time 29.0 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_79798525e05e, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

model 1 build time 27.48 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_835037865a6b, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

model 2 build time 27.45 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_315472a02320, depth=14, numNodes=97, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=6; n_part_act=6; df_count=1260685 ===



                                                                                

model 0 build time 30.66 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b129d778a26f, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 1 build time 31.56 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_46a3eff10789, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

model 2 build time 29.44 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_90ba8174e7d8, depth=14, numNodes=123, numClasses=2, numFeatures=1


                                                                                

=== size_mult=5;         n_part_req=8; n_part_act=8; df_count=1260685 ===



                                                                                

model 0 build time 31.65 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9235917104ef, depth=14, numNodes=149, numClasses=2, numFeatures=1


                                                                                

model 1 build time 31.59 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bd07a45527ab, depth=14, numNodes=149, numClasses=2, numFeatures=1




model 2 build time 32.3 
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4ab89cd548d2, depth=14, numNodes=149, numClasses=2, numFeatures=1


                                                                                

# Results

In [11]:
di = result
df_res_time = pd.DataFrame()
df_res_nodes = pd.DataFrame()
for p1 in di.keys():
    for p2 in di[p1].keys():
        df_res_time.loc[p1, p2] = np.round(np.mean(di[p1][p2][1]), 1)
        m_tmp = di[p1][p2][0]
        n_nodes = []
        for i,_ in enumerate(m_tmp):
            tmp = f'{m_tmp[i]}'.split(' ')
            print(i, tmp)
            n_nodes.append([int(x.split('=')[1][:-1]) for x in tmp if x[:4]== 'numN'][0])
        df_res_nodes.loc[p1, p2] = np.round(np.mean(n_nodes), 1)

df_res_time.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
df_res_nodes.to_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_157eb890c4b6,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_ebd445b49b05,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_306f5e621e41,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_3f7be4f90863,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
1 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_6a6ede5d47f2,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
2 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_8a2b9e28684b,', 'depth=14,', 'numNodes=123,', 'numClasses=2,', 'numFeatures=1']
0 ['DecisionTreeClassificationModel:', 'uid=DecisionTreeClassifier_0e1f9b85ff8d,', 'depth=14,', 'numNodes=123,', 'numC

ImportError: Please install gcsfs to access Google Storage

In [13]:
df_res_time.to_csv(f'obj/{fn}_t.csv')
df_res_nodes.to_csv(f'obj/{fn}_n.csv')

display(df_res_time)      
display(df_res_nodes)     

Unnamed: 0,0,2,4,6,8,16
1,11.2,10.1,11.8,13.5,14.6,20.1
2,15.7,14.8,17.4,16.5,20.8,25.7
5,30.9,23.6,28.0,23.2,26.1,39.7
10,56.9,30.6,34.1,34.9,41.3,51.2


Unnamed: 0,0,2,4,6,8,16
1,123.0,123.0,123.0,123.0,123.0,123.0
2,155.0,137.0,129.0,121.0,117.0,127.0
5,75.0,101.0,109.0,89.0,93.0,109.0
10,63.0,101.0,69.0,85.0,101.0,77.0


In [6]:
fn = 'local'

tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_t.csv')
tmp

Unnamed: 0.1,Unnamed: 0,0,2,4,6,8,16,24
0,1,8.2,6.7,7.9,9.1,10.2,15.6,20.8
1,2,14.3,8.9,9.8,5.4,6.4,17.4,27.8
2,5,45.5,13.8,10.5,8.0,6.6,5.4,6.3
3,10,48.4,20.0,12.3,8.8,8.9,6.6,7.6
4,15,61.2,24.5,14.6,11.8,9.8,7.8,8.6
5,20,84.0,29.3,17.2,13.5,11.8,7.7,8.1
6,30,135.2,39.0,22.2,16.3,14.0,9.5,11.4
7,50,226.8,49.5,29.2,21.3,18.0,12.7,14.0
8,100,463.1,91.3,56.2,39.5,33.5,21.7,24.8


In [7]:
fn = 'local'
tmp = pd.read_csv(f'gs://mas-a5-storage-1/notebooks/jupyter/obj/{fn}_n.csv')
tmp

Unnamed: 0.1,Unnamed: 0,0,2,4,6,8,16,24
0,1,123.0,123.0,123.0,123.0,123.0,123.0,123.0
1,2,149.0,159.0,143.0,97.0,121.0,127.0,131.0
2,5,91.0,97.0,123.0,121.0,93.0,95.0,75.0
3,10,85.0,93.0,69.0,71.0,81.0,61.0,77.0
4,15,63.0,43.0,39.0,65.0,43.0,53.0,47.0
5,20,63.0,61.0,59.0,47.0,63.0,65.0,37.0
6,30,57.0,31.0,37.0,39.0,45.0,45.0,53.0
7,50,31.0,43.0,35.0,65.0,37.0,55.0,37.0
8,100,35.0,39.0,39.0,53.0,37.0,31.0,41.0
