In [None]:
# basic imports 

import os # OS e.g directory structure
import sys
import numpy as np # linear algebra
import scipy as sc  # scientific computing
import pandas as pd # data processing, file I/O
import seaborn as sns  # visualization
import matplotlib.pyplot as plt # visualization
import math
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Spark related imports

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.fpm import PrefixSpan
from pyspark.ml.evaluation import RegressionEvaluator

# Data Exploration

In [None]:
! echo "Oct-2019"
! head -n 5 dataset/2019-Oct.csv
! tail -n 5 dataset/2019-Oct.csv
! echo "Nov-2019"
! head -n 5 dataset/2019-Nov.csv
! tail -n 5 dataset/2019-Nov.csv

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("abd_recommendation").getOrCreate()

In [None]:
def barPlot(columns, values):
    fig = plt.figure()
    ax = fig.add_axes([0,0,2,1])
    ax.bar(columns, values)
    for i, d in enumerate(values):
        plt.text(i - int(math.log10(d+1))*0.05, d + 0.01, str(d))
    plt.show()

Importing data, reading all csv files into one dataframe

In [None]:
sales = spark.read.csv("dataset/2019-Nov-small.csv", header="true", inferSchema="true", sep=",").union(spark.read.csv("dataset/2019-Oct-small.csv", header="true", inferSchema="true", sep=","))

In [None]:
sales.printSchema()
sales.count()

In [None]:
b = sales.where(col("category_code").isNull()).groupBy("category_id").count()
c = sales.where(col("category_code").isNotNull()).groupBy("category_id").count()
a = b.join(c, b.category_id == c.category_id, 'full')
a.show()


Count null values in all columns

In [None]:
df_nulls = sales.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sales.columns])
df_nulls.show()
barPlot(df_nulls.columns ,[r for r in df_nulls.first()])


In [None]:
# Eliminar nulls

df_sales = sales.select("event_type", "product_id", "category_id", "category_code", "brand", "user_id", "user_session").where(col("brand").isNotNull() & col("category_code").isNotNull())
df_sales.printSchema()
df_sales.count()


In [None]:
products = df_sales.select("product_id").distinct()
brands = df_sales.select("brand").distinct()
categories = df_sales.select("category_id").distinct()

products.write.mode("overwrite").parquet("products")
brands.write.mode("overwrite").parquet("brands")
categories.write.mode("overwrite").parquet("categories")



In [None]:
df_session_product = df_sales.select("user_session","product_id").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_session').agg(collect_list('product_id').alias('products'))

df_user_product = df_sales.where(col("event_type") == "purchase").groupBy('user_id').agg(collect_list('product_id').alias('products'))
#df_user_product.show(truncate=False)

df_user_brand = df_sales.where(col("event_type") == "purchase").groupBy('user_id').agg(collect_list('brand').alias('brands'))
#df_user_brand.show(truncate=False)

df_user_category = df_sales.where(col("event_type") == "purchase").groupBy('user_id').agg(collect_list('category_id').alias('categories'))
#df_user_category.show(truncate=False)

df_user_product_v = df_sales.where(col("event_type") == "view").groupBy('user_id').agg(collect_list('product_id').alias('products'))
#df_user_product_v.show(truncate=False)

df_user_brand_v = df_sales.where(col("event_type") == "view").groupBy('user_id').agg(collect_list('brand').alias('brands'))
#df_user_brand_v.show(truncate=False)

df_user_category_v = df_sales.where(col("event_type") == "view").groupBy('user_id').agg(collect_list('category_id').alias('categories'))
#df_user_category_v.show(truncate=False)

df_user_product.write.mode("overwrite").parquet("user_products")
df_user_brand.write.mode("overwrite").parquet("user_brands")
df_user_category.write.mode("overwrite").parquet("user_categories")
df_user_product_v.write.mode("overwrite").parquet("user_products_v")
df_user_brand_v.write.mode("overwrite").parquet("user_brands_v")
df_user_category_v.write.mode("overwrite").parquet("user_categories_v")

In [None]:
df_session_product = df_sales.where(col("event_type") == "purchase").groupBy('user_session').agg(collect_list('product_id').alias('products'))
#df_session_product.show(truncate=False)

df_session_brand = df_sales.where(col("event_type") == "purchase").groupBy('user_session').agg(collect_list('brand').alias('brands'))
#df_session_brand.show(truncate=False)

df_session_category = df_sales.where(col("event_type") == "purchase").groupBy('user_session').agg(collect_list('category_id').alias('categories'))
#df_session_category.show(truncate=False)

df_session_product_v = df_sales.where(col("event_type") == "view").groupBy('user_session').agg(collect_list('product_id').alias('products'))
#df_session_product_v.show(truncate=False)

df_session_brand_v = df_sales.where(col("event_type") == "view").groupBy('user_session').agg(collect_list('brand').alias('brands'))
#df_session_brand_v.show(truncate=False)

df_session_category_v = df_sales.where(col("event_type") == "view").groupBy('user_session').agg(collect_list('category_id').alias('categories'))
#df_session_category_v.show(truncate=False)

df_session_product.write.mode("overwrite").parquet("session_products")
df_session_brand.write.mode("overwrite").parquet("session_brands")
df_session_category.write.mode("overwrite").parquet("session_categories")
df_session_product_v.write.mode("overwrite").parquet("session_products_v")
df_session_brand_v.write.mode("overwrite").parquet("session_brands_v")
df_session_category_v.write.mode("overwrite").parquet("session_categories_v")

In [None]:
# Count purchase brands

limit = 15
df_count_purchase_brands = df_sales.where(col("event_type") == "purchase").groupBy("brand").count().orderBy(col("count").desc())
df_count_purchase_brands.limit(limit).show()
barPlot([col["brand"] for col in df_count_purchase_brands.select("brand").limit(limit).collect()], [row["count"] for row in df_count_purchase_brands.take(limit)])

In [None]:
# Count view brands

limit = 15
df_count_view_brands = df_sales.where(col("event_type") == "view").groupBy("brand").count().orderBy(col("count").desc())
df_count_view_brands.limit(limit).show()
barPlot([col["brand"] for col in df_count_view_brands.select("brand").limit(limit).collect()], [row["count"] for row in df_count_view_brands.take(limit)])

In [None]:
# brands viewed vs brands purchased

limit = 15
purchase_view_brands = df_count_purchase_brands.select(col("brand").alias("brand_p"), col("count").alias("count_p")).join(df_count_view_brands.select(col("brand").alias("brand_v"), col("count").alias("count_v")), col("brand_p") == col("brand_v"), 'full').where(col("brand_p").isNotNull()).withColumn("views_per_purchase", col("count_v").cast('int')/col("count_p").cast('int')).orderBy(col("count_v").desc())
purchase_view_brands.show()
barPlot([col["brand_p"] for col in purchase_view_brands.select("brand_p").limit(limit).collect()], [int(col["views_per_purchase"]) for col in purchase_view_brands.select("views_per_purchase").limit(limit).collect()])


In [None]:
# produto x purchase

limit = 15
products_purchased = df_sales.where(col("event_type") == "purchase").groupBy("product_id", "brand").count().orderBy(col("count").desc())
products_purchased.show()
barPlot([str(col["product_id"]) for col in products_purchased.select("product_id").limit(limit).collect()], [row["count"] for row in products_purchased.take(limit)])

In [None]:
# produto x view

limit = 15
products_viewed = df_sales.where(col("event_type") == "view").groupBy("product_id", "brand").count().orderBy(col("count").desc())
products_viewed.show()
barPlot([str(col["product_id"]) for col in products_viewed.select("product_id").limit(limit).collect()], [row["count"] for row in products_viewed.take(limit)])

In [None]:
# products viewed vs products purchased

limit = 15
purchase_view_products = products_purchased.select(col("product_id").alias("product_p"), col("count").alias("count_p")).join(products_viewed.select(col("product_id").alias("product_v"), col("count").alias("count_v")), col("product_p") == col("product_v"), 'full').where(col("product_p").isNotNull()).withColumn("views_per_purchase", col("count_v").cast('int')/col("count_p").cast('int')).orderBy(col("count_v").desc())
purchase_view_products.show()
barPlot([str(col["product_p"]) for col in purchase_view_products.select("product_p").limit(limit).collect()], [int(col["views_per_purchase"]) for col in purchase_view_products.select("views_per_purchase").limit(limit).collect()])


In [None]:
# category x purchase

limit = 5
category_purchased = df_sales.where(col("event_type") == "purchase").groupBy("category_id", "category_code").count().orderBy(col("count").desc())
category_purchased.show(truncate=False)
barPlot([str(col["category_code"]) for col in category_purchased.select("category_code").limit(limit).collect()], [row["count"] for row in category_purchased.take(limit)])

In [None]:
# category x view

limit = 5
category_viewd = df_sales.where(col("event_type") == "view").groupBy("category_id", "category_code").count().orderBy(col("count").desc())
category_viewd.show(truncate=False)
barPlot([str(col["category_code"]) for col in category_viewd.select("category_code").limit(limit).collect()], [row["count"] for row in category_viewd.take(limit)])

In [None]:
# categories viewed vs categories purchased

limit = 5
purchase_view_categories = category_purchased.select(col("category_code").alias("category_p"), col("count").alias("count_p")).join(category_viewd.select(col("category_code").alias("category_v"), col("count").alias("count_v")), col("category_p") == col("category_v"), 'full').where(col("category_p").isNotNull()).withColumn("views_per_purchase", col("count_v").cast('int')/col("count_p").cast('int')).orderBy(col("count_v").desc())
purchase_view_categories.show()
barPlot([str(col["category_p"]) for col in purchase_view_categories.select("category_p").limit(limit).collect()], [int(col["views_per_purchase"]) for col in purchase_view_categories.select("views_per_purchase").limit(limit).collect()])

In [None]:
# Delete nulls (brand and category_code)

df_sales = sales.select("event_type", "product_id", "category_id", "brand", "user_id", "user_session").where(col("brand").isNotNull())
df_sales.printSchema()
df_sales.count()

Create a new column by joint the value os cols brand and category_id 

In [None]:
def concat_brand_category_id(b,c):
    return b+"."+str(c)

concat = udf(concat_brand_category_id, StringType())

In [None]:
df_sales_new = df_sales.withColumn("brand_category", concat(col("brand"), col("category_id")))

df_sales_new.show()





category_code x purchase

brands x category_code

category_id




In [None]:
# Create parquet for product, brand and category

df_sales_new.select("product_id").distinct().write.mode("overwrite").parquet("products")
df_sales_new.select("brand").distinct().write.mode("overwrite").parquet("brands")
df_sales_new.select("category_id").distinct().write.mode("overwrite").parquet("categories")



# Create user parquet for purchase and view (_v) events

df_sales_new.select("user_id","product_id").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_id').agg(collect_list('product_id').alias('items')).write.mode("overwrite").parquet("user_products")
df_sales_new.select("user_id","brand").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_id').agg(collect_list('brand').alias('items')).write.mode("overwrite").parquet("user_brands")
df_sales_new.select("user_id","category_id").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_id').agg(collect_list('category_id').alias('items')).write.mode("overwrite").parquet("user_categories")
df_sales_new.select("user_id","brand_category").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_id').agg(collect_list('brand_category').alias('items')).write.mode("overwrite").parquet("user_category_brand")

df_sales_new.select("user_id","product_id").where(col("event_type") == "view").dropDuplicates().groupBy('user_id').agg(collect_list('product_id').alias('items')).write.mode("overwrite").parquet("user_products_v")
df_sales_new.select("user_id","brand").where(col("event_type") == "view").dropDuplicates().groupBy('user_id').agg(collect_list('brand').alias('items')).write.mode("overwrite").parquet("user_brands_v")
df_sales_new.select("user_id","category_id").where(col("event_type") == "view").dropDuplicates().groupBy('user_id').agg(collect_list('category_id').alias('items')).write.mode("overwrite").parquet("user_categories_v")
df_sales_new.select("user_id","brand_category").where(col("event_type") == "view").dropDuplicates().groupBy('user_id').agg(collect_list('brand_category').alias('items')).write.mode("overwrite").parquet("user_category_brand_v")

In [None]:
# Create session parquet for purchase and view (_v) events

df_sales_new.select("user_session","product_id").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_session').agg(collect_list('product_id').alias('items')).write.mode("overwrite").parquet("session_products")
df_sales_new.select("user_session","brand").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_session').agg(collect_list('brand').alias('items')).write.mode("overwrite").parquet("session_brands")
df_sales_new.select("user_session","category_id").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_session').agg(collect_list('category_id').alias('items')).write.mode("overwrite").parquet("session_categories")
df_sales_new.select("user_session","brand_category").where(col("event_type") == "purchase").dropDuplicates().groupBy('user_session').agg(collect_list('brand_category').alias('items')).write.mode("overwrite").parquet("session_category_brand")

df_sales_new.select("user_session","product_id").where(col("event_type") == "view").dropDuplicates().groupBy('user_session').agg(collect_list('product_id').alias('items')).write.mode("overwrite").parquet("session_products_v")
df_sales_new.select("user_session","brand").where(col("event_type") == "view").dropDuplicates().groupBy('user_session').agg(collect_list('brand').alias('items')).write.mode("overwrite").parquet("session_brands_v")
df_sales_new.select("user_session","category_id").where(col("event_type") == "view").dropDuplicates().groupBy('user_session').agg(collect_list('category_id').alias('items')).write.mode("overwrite").parquet("session_categories_v")
df_sales_new.select("user_session","brand_category").where(col("event_type") == "view").dropDuplicates().groupBy('user_session').agg(collect_list('brand_category').alias('items')).write.mode("overwrite").parquet("session_category_brand_v")

In [None]:
def split_brand_category_id(s):
    return s.split(".")[1]

split = udf(split_brand_category_id, StringType())

In [51]:
df_sales_new.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- brand_category: string (nullable = true)



In [None]:
#df_sales_new = df_sales.withColumn("brand_category", concat(col("brand"), col("category_id")))
df_sales_new2 = df_sales_new.withColumn("category_id", split(col("brand_category")))

In [57]:
def ifNull(ColIn, ReplaceVal):
    return(when(ColIn == "null", ReplaceVal).otherwise(ColIn))

nvl = udf(ifNull, StringType())

In [58]:
a = sales.select("category_id", "category_code").distinct().withColumn("cat_code", nvl(col("category_code"), lit("uncategorized")))
a.show()

Py4JJavaError: An error occurred while calling o555.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 30.0 failed 1 times, most recent failure: Lost task 0.0 in stage 30.0 (TID 69) (FosquitoMiGamingLaptop executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:70)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:551)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:519)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
	... 24 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at jdk.internal.reflect.GeneratedMethodAccessor112.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:70)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:551)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:519)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
	... 24 more
