In [1]:
from pyspark.sql.functions import col, avg, stddev, sum, row_number, lit
from pyspark.sql.functions import radians, sin, cos, sqrt, atan2
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.window import Window

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from pyspark.sql import SparkSession
import pyspark
 
print('pandas version: %s' % pd.__version__)
print('numpy version: %s' % np.__version__)
print('pyspark version: %s' % pyspark.__version__)
import dask
print('dask version: %s' % dask.__version__)
 
import time
 
def benchmark(f, df, benchmarks, name, **kwargs):
    """Benchmark the given function against the given DataFrame.
    
    Parameters
    ----------
    f: function to benchmark
    df: data frame
    benchmarks: container for benchmark results
    name: task name
    
    Returns
    -------
    Duration (in seconds) of the given operation
    """
    start_time = time.time()
    ret = f(df, **kwargs)
    benchmarks['duration'].append(time.time() - start_time)
    benchmarks['task'].append(name)
    print(f"{name} took: {benchmarks['duration'][-1]} seconds")
    return benchmarks['duration'][-1]
 
def get_results(benchmarks):
    """Return a pandas DataFrame containing benchmark results."""
    return pd.DataFrame.from_dict(benchmarks)

pandas version: 1.4.4
numpy version: 1.22.4
pyspark version: 3.3.2
dask version: 2022.01.1


In [3]:
from dask.distributed import Client
from dask_yarn import YarnCluster

cluster = YarnCluster()
client = Client(cluster)

cluster.adapt() # Dynamically scale Dask resources

  from distributed.utils import (
  from distributed.utils import (
24/06/03 23:07:41 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at cluster-multi-nodes-m.europe-west2-c.c.nd-project-bdcc-up202310061.internal./10.154.0.16:8032
24/06/03 23:07:41 INFO client.AHSProxy: Connecting to Application History server at cluster-multi-nodes-m.europe-west2-c.c.nd-project-bdcc-up202310061.internal./10.154.0.16:10200
24/06/03 23:07:41 INFO skein.Driver: Driver started, listening on 33881
24/06/03 23:07:42 INFO conf.Configuration: resource-types.xml not found
24/06/03 23:07:42 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
24/06/03 23:07:42 INFO skein.Driver: Uploading application resources to hdfs://cluster-multi-nodes-m/user/root/.skein/application_1717455815209_0002
24/06/03 23:07:43 INFO skein.Driver: Submitting application...
24/06/03 23:07:43 INFO impl.YarnClientImpl: Submitted application application_1717455815209_0002


In [4]:
dask_data = dd.read_parquet("gs://bucket-for-cluster-dataproc/data/*.parquet")
 
dask_benchmarks = {
    'duration': [],  # in seconds
    'task': [],
}
dask_data

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,Int32,datetime64[ns],datetime64[ns],Int64,float64,Int64,object,Int32,Int32,Int64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
def read_file_parquet(df=None):
    return dd.read_parquet("gs://bucket-for-cluster-dataproc/data/*.parquet")
  
def count(df=None):
    return len(df)
 
def mean(df):
    return df.fare_amount.mean().compute()
 
def standard_deviation(df):
    return df.fare_amount.std().compute()
 
def mean_of_sum(df):
    return (df.fare_amount + df.tip_amount).mean().compute()
 
def sum_columns(df):
    return (df.fare_amount + df.tip_amount).compute()
 
def mean_of_product(df):
    return (df.fare_amount * df.tip_amount).mean().compute()
 
def product_columns(df):
    return (df.fare_amount * df.tip_amount).compute()
  
def value_counts(df):
    return df.fare_amount.value_counts().compute()
  
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df.PULocationID
    phi_1 = df.PULocationID
    theta_2 = df.DOLocationID
    phi_2 = df.DOLocationID
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return ret.mean().compute()
  
def complicated_arithmetic_operation(df):
    theta_1 = df.PULocationID
    phi_1 = df.PULocationID
    theta_2 = df.DOLocationID
    phi_2 = df.DOLocationID
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return ret.compute()
  
def groupby_statistics(df):
    return df.groupby(by='passenger_count').agg(
      {
        'fare_amount': ['mean', 'std'], 
        'tip_amount': ['mean', 'std']
      }
    ).compute()
  
other = groupby_statistics(dask_data)
other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
 
def join_count(df, other):
    return len(dd.merge(df, other, left_index=True, right_index=True))
 
def join_data(df, other):
    return dd.merge(df, other, left_index=True, right_index=True).compute()

In [6]:
benchmark(read_file_parquet, df=None, benchmarks=dask_benchmarks, name='read file')
benchmark(count, df=dask_data, benchmarks=dask_benchmarks, name='count')
benchmark(mean, df=dask_data, benchmarks=dask_benchmarks, name='mean')
benchmark(standard_deviation, df=dask_data, benchmarks=dask_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=dask_data, benchmarks=dask_benchmarks, name='mean of columns addition')
benchmark(sum_columns, df=dask_data, benchmarks=dask_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=dask_data, benchmarks=dask_benchmarks, name='mean of columns multiplication')
benchmark(product_columns, df=dask_data, benchmarks=dask_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=dask_data, benchmarks=dask_benchmarks, name='value counts')
benchmark(mean_of_complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='mean of complex arithmetic ops')
benchmark(complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='complex arithmetic ops')
benchmark(groupby_statistics, df=dask_data, benchmarks=dask_benchmarks, name='groupby statistics')
benchmark(join_count, dask_data, benchmarks=dask_benchmarks, name='join count', other=other)
benchmark(join_data, dask_data, benchmarks=dask_benchmarks, name='join', other=other)

read file took: 0.20809412002563477 seconds
count took: 2.9065091609954834 seconds
mean took: 1.825331687927246 seconds
standard deviation took: 1.6517345905303955 seconds
mean of columns addition took: 1.8503732681274414 seconds
addition of columns took: 2.8408076763153076 seconds
mean of columns multiplication took: 1.7823243141174316 seconds
multiplication of columns took: 2.5357179641723633 seconds
value counts took: 2.0391838550567627 seconds
mean of complex arithmetic ops took: 3.149811029434204 seconds
complex arithmetic ops took: 3.4084455966949463 seconds
groupby statistics took: 8.384632110595703 seconds
join count took: 9.47532033920288 seconds
join took: 8.958397150039673 seconds


8.958397150039673

In [7]:
expr_filter = (dask_data.tip_amount >= 1) & (dask_data.tip_amount <= 5)
 
def filter_data(df):
    return df[expr_filter]
  
dask_filtered = filter_data(dask_data)

removed the count_index_lenght operation since the spark does not make use of these indexes in parquet files. So I have decided to remove it both from Dask and Pyspark. 

In [8]:
benchmark(count, dask_filtered, benchmarks=dask_benchmarks, name='filtered count')
benchmark(mean, dask_filtered, benchmarks=dask_benchmarks, name='filtered mean')
benchmark(standard_deviation, dask_filtered, benchmarks=dask_benchmarks, name='filtered standard deviation')
benchmark(mean_of_sum, dask_filtered, benchmarks=dask_benchmarks, name ='filtered mean of columns addition')
benchmark(sum_columns, df=dask_filtered, benchmarks=dask_benchmarks, name='filtered addition of columns')
benchmark(mean_of_product, dask_filtered, benchmarks=dask_benchmarks, name ='filtered mean of columns multiplication')
benchmark(product_columns, df=dask_filtered, benchmarks=dask_benchmarks, name='filtered multiplication of columns')
benchmark(mean_of_complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered mean of complex arithmetic ops')
benchmark(complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered complex arithmetic ops')
benchmark(value_counts, dask_filtered, benchmarks=dask_benchmarks, name ='filtered value counts')
benchmark(groupby_statistics, dask_filtered, benchmarks=dask_benchmarks, name='filtered groupby statistics')
 
other = groupby_statistics(dask_filtered)
other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
 
benchmark(join_count, dask_filtered, benchmarks=dask_benchmarks, name='filtered join count', other=other)
benchmark(join_data, dask_filtered, benchmarks=dask_benchmarks, name='filtered join', other=other)

filtered count took: 19.202646017074585 seconds
filtered mean took: 8.536171674728394 seconds
filtered standard deviation took: 7.854693651199341 seconds
filtered mean of columns addition took: 8.639937162399292 seconds
filtered addition of columns took: 8.870735883712769 seconds
filtered mean of columns multiplication took: 7.799517393112183 seconds
filtered multiplication of columns took: 9.387236833572388 seconds
filtered mean of complex arithmetic ops took: 8.640625953674316 seconds
filtered complex arithmetic ops took: 8.963932037353516 seconds
filtered value counts took: 8.066583633422852 seconds
filtered groupby statistics took: 8.773380994796753 seconds
filtered join count took: 8.929194688796997 seconds
filtered join took: 9.067906856536865 seconds


9.067906856536865

In [9]:
pyspark_data = spark.read.parquet("gs://bucket-for-cluster-dataproc/data/*.parquet")


pyspark_benchmarks = {
    'duration': [],  # in seconds
    'task': [],
}
pyspark_data.count()

                                                                                

12931345

In [10]:
def read_file_parquet(df=None):
    return spark.read.parquet("gs://bucket-for-cluster-dataproc/data/*.parquet")
  
def count(df=None):
    return df.count()
 

def mean(df):
    return df.agg(avg(df.fare_amount)).collect()
 
def standard_deviation(df):
    return df.agg(stddev(df.fare_amount)).collect()
 
def mean_of_sum(df):
    return df.select((col("fare_amount") + col("tip_amount")).alias("total_amount")).agg(avg("total_amount")).collect()
 
def sum_columns(df):
    result = df.select((col("fare_amount") + col("tip_amount")).alias("total_amount")).collect()
    return result
 
def mean_of_product(df):
    return df.select((col("fare_amount") * col("tip_amount")).alias("total_amount")).agg(avg("total_amount")).collect()
 
def product_columns(df):
    result = df.select((col("fare_amount") * col("tip_amount")).alias("total_prod")).agg(avg("total_prod")).collect()
    return result
 
def value_counts(df):
    val_counts = df.groupBy("fare_amount").count()
    return val_counts.collect()

def complicated_arithmetic_operation(df):
    temp = (
        (sin(radians(df['DOLocationID'] - df['PULocationID']) / 2) ** 2) +
        (cos(radians(df['PULocationID'])) * cos(radians(df['DOLocationID'])) * (sin(radians(df['DOLocationID'] - df['PULocationID']) / 2) ** 2))
    )
    ret = 2 * atan2(sqrt(temp), sqrt(1 - temp))
    return df.withColumn('result', ret).select('result').collect()

def mean_of_complicated_arithmetic_operation(df):
    temp = (
        (sin(radians(df['DOLocationID'] - df['PULocationID']) / 2) ** 2) +
        (cos(radians(df['PULocationID'])) * cos(radians(df['DOLocationID'])) * (sin(radians(df['DOLocationID'] - df['PULocationID']) / 2) ** 2))
    )
    ret = 2 * atan2(sqrt(temp), sqrt(1 - temp))
    return df.withColumn('result', ret).agg(F.mean('result')).collect()[0][0]


def groupby_statistics(df):
    gb = df.groupBy('passenger_count').agg(
        avg("fare_amount"), stddev("fare_amount"),
        avg("tip_amount"), stddev("tip_amount")
    )
    return gb.toPandas()


windowSpec = Window.orderBy(lit(1))
other_spark = spark.createDataFrame(groupby_statistics(pyspark_data))
other_spark = other_spark.withColumn("index", row_number().over(windowSpec))
#other_spark.columns = pd.Index([e[0]+'_' + e[1] for e in other_spark.columns.tolist()])
pyspark_data_with_index = pyspark_data.withColumn("index", row_number().over(windowSpec))

def join_count(df, other):
    joined_df = df.join(broadcast(other), on="index")
    # Count the number of rows in the joined DataFrame
    count = joined_df.count()
    return count

def join_data(df, other):
    # Use broadcast hint to optimize the join
    ret = df.join(broadcast(other), on="index")
    return ret

                                                                                

In [7]:
benchmark(read_file_parquet, df=None, benchmarks=pyspark_benchmarks, name='read file')
benchmark(count, df=pyspark_data, benchmarks=pyspark_benchmarks, name='count')
benchmark(mean, df=pyspark_data, benchmarks=pyspark_benchmarks, name='mean')
benchmark(standard_deviation, df=pyspark_data, benchmarks=pyspark_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=pyspark_data, benchmarks=pyspark_benchmarks, name='mean of columns addition')
benchmark(sum_columns, df=pyspark_data, benchmarks=pyspark_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=pyspark_data, benchmarks=pyspark_benchmarks, name='mean of columns multiplication')
benchmark(product_columns, df=pyspark_data, benchmarks=pyspark_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=pyspark_data, benchmarks=pyspark_benchmarks, name='value counts')
benchmark(complicated_arithmetic_operation, df=pyspark_data, benchmarks=pyspark_benchmarks, name='complex arithmetic ops')
benchmark(mean_of_complicated_arithmetic_operation, df=pyspark_data, benchmarks=pyspark_benchmarks, name='mean of complex arithmetic ops')
benchmark(groupby_statistics, df=pyspark_data, benchmarks=pyspark_benchmarks, name='groupby statistics')
benchmark(join_data, pyspark_data_with_index, benchmarks=pyspark_benchmarks, name='join', other=other_spark)
benchmark(join_count, pyspark_data_with_index, benchmarks=pyspark_benchmarks, name='join count', other=other_spark)

read file took: 0.4556393623352051 seconds


                                                                                

count took: 4.926044702529907 seconds


                                                                                

mean took: 2.0580999851226807 seconds


                                                                                

standard deviation took: 1.4237465858459473 seconds


                                                                                

mean of columns addition took: 1.921140432357788 seconds


                                                                                

addition of columns took: 38.33495020866394 seconds


                                                                                

mean of columns multiplication took: 1.4750266075134277 seconds


                                                                                

multiplication of columns took: 1.218735694885254 seconds


                                                                                

value counts took: 3.777167558670044 seconds


                                                                                

complex arithmetic ops took: 37.43865418434143 seconds


                                                                                

mean of complex arithmetic ops took: 2.130606174468994 seconds


24/05/30 15:19:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


groupby statistics took: 2.949803590774536 seconds
join took: 0.046562910079956055 seconds


24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:19:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 1

join count took: 8.639317512512207 seconds


                                                                                

8.639317512512207

In [16]:
expr_filter = (pyspark_data.tip_amount >= 1) & (pyspark_data.tip_amount <= 5)
 
def filter_data(df):
    return df[expr_filter]
 
pyspark_filtered = filter_data(pyspark_data)

In [9]:
pyspark_data.unpersist()

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: bigint, trip_distance: double, RatecodeID: bigint, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, Airport_fee: double]

In [10]:
benchmark(count, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered count')
benchmark(mean, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered mean')
benchmark(standard_deviation, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered standard deviation')
benchmark(mean_of_sum, pyspark_filtered, benchmarks=pyspark_benchmarks, name ='filtered mean of columns addition')
benchmark(sum_columns, df=pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered addition of columns')
benchmark(mean_of_product, pyspark_filtered, benchmarks=pyspark_benchmarks, name ='filtered mean of columns multiplication')
benchmark(product_columns, df=pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered multiplication of columns')
benchmark(mean_of_complicated_arithmetic_operation, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered mean of complex arithmetic ops')
benchmark(complicated_arithmetic_operation, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered complex arithmetic ops')
benchmark(value_counts, pyspark_filtered, benchmarks=pyspark_benchmarks, name ='filtered value counts')
benchmark(groupby_statistics, pyspark_filtered, benchmarks=pyspark_benchmarks, name='filtered groupby statistics')
 
other_spark = spark.createDataFrame(groupby_statistics(pyspark_filtered))
other_spark = other_spark.withColumn("index", row_number().over(windowSpec))
#other_spark.columns = pd.Index([e[0]+'_' + e[1] for e in other_spark.columns.tolist()])
pyspark_data_with_index_filtered = pyspark_filtered.withColumn("index", row_number().over(windowSpec))
    
benchmark(join_data, pyspark_data_with_index_filtered, benchmarks=pyspark_benchmarks, name='filtered join', other=other_spark)
benchmark(join_count, pyspark_data_with_index_filtered, benchmarks=pyspark_benchmarks, name='filtered join count', other=other_spark)

                                                                                

filtered count took: 1.8178718090057373 seconds


                                                                                

filtered mean took: 2.0907418727874756 seconds


                                                                                

filtered standard deviation took: 1.5138132572174072 seconds


                                                                                

filtered mean of columns addition took: 1.594895839691162 seconds


                                                                                

filtered addition of columns took: 20.493835926055908 seconds


                                                                                

filtered mean of columns multiplication took: 1.5635192394256592 seconds


                                                                                

filtered multiplication of columns took: 1.4159443378448486 seconds


                                                                                

filtered mean of complex arithmetic ops took: 2.129653215408325 seconds


                                                                                

filtered complex arithmetic ops took: 20.972954988479614 seconds


                                                                                

filtered value counts took: 2.6124513149261475 seconds


                                                                                

filtered groupby statistics took: 3.0877175331115723 seconds


24/05/30 15:21:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


filtered join took: 0.014917135238647461 seconds


24/05/30 15:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 15:21:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/30 1

filtered join count took: 6.593547582626343 seconds


                                                                                

6.593547582626343

In [16]:
pyspark_res_temp = get_results(pyspark_benchmarks).set_index('task')
dask_res_temp = get_results(dask_benchmarks).set_index('task')
pyspark_res_temp

Unnamed: 0_level_0,duration
task,Unnamed: 1_level_1
read file,0.455639
count,4.926045
mean,2.0581
standard deviation,1.423747
mean of columns addition,1.92114
addition of columns,38.33495
mean of columns multiplication,1.475027
multiplication of columns,1.218736
value counts,3.777168
complex arithmetic ops,37.438654


In [17]:
df = pd.concat([pyspark_res_temp.duration, dask_res_temp.duration],axis=1,keys=['pyspark', 'dask'])
df

Unnamed: 0_level_0,pyspark,dask
task,Unnamed: 1_level_1,Unnamed: 2_level_1
read file,0.455639,0.198511
count,4.926045,9.503813
mean,2.0581,1.560066
standard deviation,1.423747,1.66532
mean of columns addition,1.92114,1.953651
addition of columns,38.33495,2.698317
mean of columns multiplication,1.475027,1.79345
multiplication of columns,1.218736,2.633554
value counts,3.777168,1.733099
complex arithmetic ops,37.438654,5.752048


In [18]:
from datetime import datetime
from os import getcwd
 
filename = "gs://bucket-for-cluster-dataproc/multi_node_results_" + datetime.now().strftime("%H%M%S") + "_4files"
print(filename)
 
df.to_parquet(path=filename)

gs://bucket-for-cluster-dataproc/multi_node_results_152751_4files


In [12]:
!pip install yappi

Collecting yappi
  Downloading yappi-1.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading yappi-1.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yappi
Successfully installed yappi-1.6.0
[0m

In [13]:
import yappi

In [17]:
yappi.start()
sum_columns(pyspark_filtered)

# Stop profiling
yappi.stop()

# Get thread stats
thread_stats = yappi.get_thread_stats()
print("Thread Stats:")
thread_stats.print_all()

# Get function stats
func_stats = yappi.get_func_stats()
print("Function Stats:")
func_stats.print_all()

                                                                                

Thread Stats:

name           id     tid              ttot      scnt        
_MainThread    0      139925494961984  571.3029  9049      
Thread         4      139923745007360  1.924969  1867      
Thread         3      139923155826432  0.830498  6541      
Thread         7      139924740839168  0.160554  1631      
Thread         5      139923147433728  0.135160  827       
Thread         6      139923139041024  0.128763  826       
Thread         1      139925429380864  0.094156  157       
..lizerWorker  8      139924732446464  0.082299  895       
..tPollerUnix  9      139924749231872  0.064662  826       
Thread         11     139925185877760  0.006549  40        
..avingThread  2      139925169092352  0.004412  12        
ControlThread  10     139925177485056  0.002049  4         
Function Stats:

Clock type: CPU
Ordered by: totaltime, desc

name                                  ncall  tsub      ttot      tavg      
..:3511 ZMQInteractiveShell.run_code  10     0.000324  571.2600  

In [18]:
yappi.start()
sum_columns(pyspark_filtered)

# Stop profiling
yappi.stop()

# Get thread stats
thread_stats = yappi.get_thread_stats()
print("Thread Stats:")
thread_stats.print_all()

# Get function stats
func_stats = yappi.get_func_stats()
print("Function Stats:")
func_stats.print_all()

                                                                                

Thread Stats:

name           id     tid              ttot      scnt        
_MainThread    0      139925494961984  673.3808  10710     
Thread         4      139923745007360  2.286550  2161      
Thread         3      139923155826432  1.010102  7493      
Thread         7      139924740839168  0.193503  1841      
Thread         5      139923147433728  0.163319  934       
Thread         6      139923139041024  0.152540  933       
Thread         1      139925429380864  0.111959  177       
..lizerWorker  8      139924732446464  0.097895  1017      
..tPollerUnix  9      139924749231872  0.077950  933       
Thread         11     139925185877760  0.007684  48        
..avingThread  2      139925169092352  0.005690  15        
ControlThread  10     139925177485056  0.002690  5         
Function Stats:

Clock type: CPU
Ordered by: totaltime, desc

name                                  ncall  tsub      ttot      tavg      
..dataframe.py:806 DataFrame.collect  5      1.301324  669.3740  

In [19]:
yappi.start()
complicated_arithmetic_operation(pyspark_filtered)

# Stop profiling
yappi.stop()

# Get thread stats
thread_stats = yappi.get_thread_stats()
print("Thread Stats:")
thread_stats.print_all()

# Get function stats
func_stats = yappi.get_func_stats()
print("Function Stats:")
func_stats.print_all()

                                                                                

Thread Stats:

name           id     tid              ttot      scnt        
_MainThread    0      139925494961984  776.3276  12325     
Thread         4      139923745007360  2.595524  2464      
Thread         3      139923155826432  1.150161  8489      
Thread         7      139924740839168  0.220164  2072      
Thread         5      139923147433728  0.183880  1052      
Thread         6      139923139041024  0.174444  1051      
Thread         1      139925429380864  0.132161  203       
..lizerWorker  8      139924732446464  0.112540  1145      
..tPollerUnix  9      139924749231872  0.087713  1051      
Thread         11     139925185877760  0.009249  59        
..avingThread  2      139925169092352  0.006942  18        
ControlThread  10     139925177485056  0.003455  6         
Function Stats:

Clock type: CPU
Ordered by: totaltime, desc

name                                  ncall  tsub      ttot      tavg      
..dataframe.py:806 DataFrame.collect  6      1.492420  771.6394  

In [20]:
yappi.start()
sum_columns(pyspark_data)

# Stop profiling
yappi.stop()

# Get thread stats
thread_stats = yappi.get_thread_stats()
print("Thread Stats:")
thread_stats.print_all()

# Get function stats
func_stats = yappi.get_func_stats()
print("Function Stats:")
func_stats.print_all()

                                                                                

Thread Stats:

name           id     tid              ttot      scnt        
_MainThread    0      139925494961984  960.8450  15184     
Thread         4      139923745007360  3.066508  2970      
Thread         3      139923155826432  1.367913  10188     
Thread         7      139924740839168  0.268446  2438      
Thread         5      139923147433728  0.224657  1238      
Thread         6      139923139041024  0.215175  1237      
Thread         1      139925429380864  0.152333  233       
..lizerWorker  8      139924732446464  0.136904  1344      
..tPollerUnix  9      139924749231872  0.109808  1237      
Thread         11     139925185877760  0.010859  68        
..avingThread  2      139925169092352  0.008117  21        
ControlThread  10     139925177485056  0.005514  11        
Function Stats:

Clock type: CPU
Ordered by: totaltime, desc

name                                  ncall  tsub      ttot      tavg      
..dataframe.py:806 DataFrame.collect  7      1.824382  955.0524  

In [None]:
yappi.start()
complicated_arithmetic_operation(pyspark_data)

# Stop profiling
yappi.stop()

# Get thread stats
thread_stats = yappi.get_thread_stats()
print("Thread Stats:")
thread_stats.print_all()

# Get function stats
func_stats = yappi.get_func_stats()
print("Function Stats:")
func_stats.print_all()

                                                                                

Thread Stats:

name           id     tid              ttot      scnt        
_MainThread    0      139925494961984  1147.645  18122     
Thread         4      139923745007360  3.570822  3505      
Thread         3      139923155826432  1.588714  11955     
Thread         7      139924740839168  0.314094  2832      
Thread         5      139923147433728  0.263440  1438      
Thread         6      139923139041024  0.252491  1437      
Thread         1      139925429380864  0.179275  268       
..lizerWorker  8      139924732446464  0.164029  1587      
..tPollerUnix  9      139924749231872  0.128515  1437      
Thread         11     139925185877760  0.012554  79        
..avingThread  2      139925169092352  0.009224  24        
ControlThread  10     139925177485056  0.005986  12        
Function Stats:

Clock type: CPU
Ordered by: totaltime, desc

name                                  ncall  tsub      ttot      tavg      
..dataframe.py:806 DataFrame.collect  8      2.155828  1140.746  