In [34]:
from time import sleep

from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *


spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

Spark cluster parallelism 
executors_num
memory_per_ex
cores_per_execut
s = executors_num * cores_per_execut = 400 slotes
20 block => 20 slotes ~ 95%

# 1. HOW TO CREATE RDD
# we can build RDDs out of local collections

In [35]:
def split_row(row):
    return row.split(",")

numbers = range(1, 1000000)
numbers_parent_rdd = sc.parallelize(numbers, 4)
numbers_child_rdd = numbers_parent_rdd.map(lambda x: x + 1)
# Dependency: numbers_rdd => numbers_rdd_2
# Linage: partition: block => numbers_rdd => numbers_rdd_2
numbers_child_rdd.take(10)


[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

How to read a file in parallel

In [36]:
stocks_rdd_v2 = sc.textFile("data/stocks/aapl.csv", 4). \
    map(lambda row: row.split(",")). \
    filter(lambda tokens: float(tokens[2]) > 15)

stocks_rdd_v2.take(10)

# protected def getPartitions: Array[Partition]


[['AAPL', 'Jan 1 2000', '25.94'],
 ['AAPL', 'Feb 1 2000', '28.66'],
 ['AAPL', 'Mar 1 2000', '33.95'],
 ['AAPL', 'Apr 1 2000', '31.01'],
 ['AAPL', 'May 1 2000', '21'],
 ['AAPL', 'Jun 1 2000', '26.19'],
 ['AAPL', 'Jul 1 2000', '25.41'],
 ['AAPL', 'Aug 1 2000', '30.47'],
 ['AAPL', 'Jun 1 2004', '16.27'],
 ['AAPL', 'Jul 1 2004', '16.17']]

In [37]:
# read from a DF
stocks_df = spark.read.csv("data/stocks"). \
    withColumnRenamed("_c0", "company"). \
    withColumnRenamed("_c1", "date"). \
    withColumnRenamed("_c2", "price")

stocks_rdd_v3 = stocks_df.rdd

prices_rdd = stocks_rdd_v3.map(lambda row: row.price)
prices_rdd.toDebugString()
prices_rdd.take(10)

['25.94',
 '28.66',
 '33.95',
 '31.01',
 '21',
 '26.19',
 '25.41',
 '30.47',
 '12.88',
 '9.78']

In [38]:
# RDD to DF
# condition: the RDD must contain Spark Rows (data structures conforming to a schema)
stocks_df_v2 = spark.createDataFrame(stocks_rdd_v3)
stocks_df_v2.take(10)

[Row(company='AAPL', date='Jan 1 2000', price='25.94'),
 Row(company='AAPL', date='Feb 1 2000', price='28.66'),
 Row(company='AAPL', date='Mar 1 2000', price='33.95'),
 Row(company='AAPL', date='Apr 1 2000', price='31.01'),
 Row(company='AAPL', date='May 1 2000', price='21'),
 Row(company='AAPL', date='Jun 1 2000', price='26.19'),
 Row(company='AAPL', date='Jul 1 2000', price='25.41'),
 Row(company='AAPL', date='Aug 1 2000', price='30.47'),
 Row(company='AAPL', date='Sep 1 2000', price='12.88'),
 Row(company='AAPL', date='Oct 1 2000', price='9.78')]

    Use cases for RDDs
    - the computations that cannot work on DFs/Spark SQL API
    - very custom perf optimizations

In [39]:
# RDD transformations
# map, filter, flatMap

# distinct
company_names_rdd = stocks_rdd_v3 \
    .map(lambda row: row.company) \
    .distinct()
company_names_rdd.take(10)


['AAPL', 'AMZN', 'MSFT', 'IBM', 'GOOG']

In [40]:
# counting
total_entries = stocks_rdd_v3.count()  # action - the RDD must be evaluated
total_entries

560

In [41]:
# min and max
aapl_stocks_rdd = stocks_rdd_v3 \
    .filter(lambda row: row.company == "AAPL") \
    .map(lambda row: float(row.price))
max_aapl = aapl_stocks_rdd.max()
min_aapl = aapl_stocks_rdd.min()
print(max_aapl)
print(min_aapl)

223.02
7.07


In [42]:
# reduce
sum_prices = aapl_stocks_rdd \
    .reduce(lambda x, y: x + y)  # can use ANY Python function here  1,2,3,4 => 1+2 = 3 + 3 = 6 + 4
print(sum_prices)

7961.850000000001


In [43]:
# grouping
grouped_stocks_rdd = stocks_rdd_v3 \
    .groupBy(lambda row: row.company)  # can use ANY grouping criterion as a Python function
# grouping is expensive - involves a shuffle
grouped_stocks_rdd.take(10)

[('AAPL', <pyspark.resultiterable.ResultIterable at 0x7fc7ee623250>),
 ('AMZN', <pyspark.resultiterable.ResultIterable at 0x7fc7ee58afa0>),
 ('MSFT', <pyspark.resultiterable.ResultIterable at 0x7fc7ef74d2e0>),
 ('IBM', <pyspark.resultiterable.ResultIterable at 0x7fc7ee59b820>),
 ('GOOG', <pyspark.resultiterable.ResultIterable at 0x7fc7ee59d040>)]

In [44]:
# partitioning
repartitioned_stocks_rdd = stocks_rdd_v3 \
    .repartition(4) \
    .coalesce(2)

repartitioned_stocks_rdd.getNumPartitions()

# TODO: SLIDE
# .repartition(30)  # involves a shuffle
# involves a shuffle
#  .repartition(5) 100
#  part1 => |||||| 20           20 2  =>
#  part2 => |||||||||||||| 40   20 2  => |||||||||||||| 40 + |||||| 20 = 60
#  part3 => ||||| 10            20 2
#  part4 => |||||||||| 30       20 2  => |||||||||| 30 + ||||| 10 = 40
#  part5 =>                     20 2


2

Exercises

1. Read the movies dataset as an RDD

2. Show the distinct genres as an RDD

3. Print all the movies in the Drama genre with IMDB rating > 6


In [45]:
movies_df = spark.read.json("data/movies")
movies_rdd = movies_df.rdd

movies_rdd.take(5)

[Row(Creative_Type=None, Director=None, Distributor='Gramercy', IMDB_Rating=6.1, IMDB_Votes=1071, MPAA_Rating='R', Major_Genre=None, Production_Budget=8000000, Release_Date='12-Jun-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='The Land Girls', US_DVD_Sales=None, US_Gross=146083, Worldwide_Gross=146083),
 Row(Creative_Type=None, Director=None, Distributor='Strand', IMDB_Rating=6.9, IMDB_Votes=207, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=300000, Release_Date='7-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='First Love, Last Rites', US_DVD_Sales=None, US_Gross=10876, Worldwide_Gross=10876),
 Row(Creative_Type=None, Director=None, Distributor='Lionsgate', IMDB_Rating=6.8, IMDB_Votes=865, MPAA_Rating=None, Major_Genre='Comedy', Production_Budget=250000, Release_Date='28-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='I Married a Strange Person', US_DVD_Sales=None, US_Gross=20313

# 2. HOW TO SAVE AND PERSIST RDD

In [47]:
r = [1, 2, 3, 4, 5, 6, 7, 8]
ints = sc.parallelize(r, 4).repartition(5)

# ints = sc.parallelize(r).coalesce(1)
ints.coalesce(1)\
    .saveAsTextFile("data/output/ints")

In [50]:
cachedInts = sc.textFile("data/output/ints") \
    .map(lambda x: int(x)) \
    .persist(StorageLevel.MEMORY_AND_DISK) \
    # .cache()
cachedInts.count()

#  very important to count() after cashing
# cachedInts.first()
# cachedInts.count()

8

In [55]:
cachedInts.unpersist()

unpersisted = cachedInts.map(lambda x: x + 1).take(5)

36


In [57]:
reduced = cachedInts.reduce(lambda x, y: x + y)
unpersisted

[2, 3, 6, 7, 8]

In [60]:
# How to show plan 

doubles = cachedInts.map(lambda x: x * 2)

even = cachedInts.filter(lambda x: x % 2 == 0)
print(even.take(5))


even.setName("Even numbers")
print("Name is " + even.name() + " id is " + str(even.id()))

plan = even.toDebugString().decode("utf-8")

print(plan)
print(doubles.take(5))

[2, 6, 8, 4]
Name is Even numbers id is 223
(1) Even numbers PythonRDD[223] at RDD at PythonRDD.scala:53 []
 |  data/output/ints MapPartitionsRDD[202] at textFile at NativeMethodAccessorImpl.java:0 []
 |  data/output/ints HadoopRDD[201] at textFile at NativeMethodAccessorImpl.java:0 []
[2, 4, 10, 12, 14]


# 3. HOW TO GROUP AND JOIN RDD

In [62]:
data = [("Ivan", 240), ("Petr", 39), ("Elena", 290), ("Elena", 300)]
codeRows = sc.parallelize(data)
codeRows.collect()

[('Ivan', 240), ('Petr', 39), ('Elena', 290), ('Elena', 300)]

In [63]:
# how to reduce
reduced = codeRows.reduceByKey(lambda x, y: x + y)
print(reduced.collect())

[('Ivan', 240), ('Petr', 39), ('Elena', 590)]
[('Ivan', 240), ('Petr', 39), ('Elena', 300)]


In [64]:
# how to deduplicate
deduplicated = codeRows.reduceByKey(lambda x, y: x if (x > y) else y)
print(deduplicated.collect())

[('Ivan', 240), ('Petr', 39), ('Elena', 300)]


In [67]:
# how to fold by key
folded = codeRows.foldByKey(1000, lambda x, y: x + y)

# TODO Sliding
#     How to foldByKey works
#     part1 (k1:2, k2:2, k3:2, k1:2) shufle => (k1:2, k1:2, k1:2) => k1:6
#     part2 (k2:2, k2:2, k3:2, k1:2) shufle => (k2:2, k2:2, k2:2) => k2:6, (k3:2, k3:2) => k3:4

folded.collect()

[('Ivan', 1240), ('Petr', 1039), ('Elena', 1590)]

In [None]:
# Aggregated
aggregated = codeRows.aggregateByKey(500, lambda x, y: x + y, lambda x, y: x + y)
aggregated.collect()

#     How to aggregateByKey works, shuffle less
#     part1 (k1:2, k2:2, k3:2, k1:2) => (k1:4, k2:2, k3:2) =>  shuffle => (k1:2, k1:2, k1:2) => k1:6
#     part2 (k2:2, k2:2, k3:2, k1:2) => (k1:4, k2:2, k3:2) => shuffle => (k2:2, k2:2, k2:2) => k2:6, (k3:2, k3:2) => k3:4

In [72]:
# # groupByKey works
grouped = codeRows.groupByKey()
# TODO show the inner array
print(grouped.collect())

grouped.toDebugString().decode("utf-8")

# b'(1) PythonRDD[19] at collect at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:208 []\n |
# MapPartitionsRDD[18] at mapPartitions at PythonRDD.scala:145 []\n |
# ShuffledRDD[17] at partitionBy at NativeMethodAccessorImpl.java:0 []
# \n +-(1) PairwiseRDD[16] at groupByKey at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:207 []
# \n    |  PythonRDD[15] at groupByKey at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:207 []
# \n    |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []'
# # Don't forget about joins with preferred languages
#

[('Ivan', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee232430>), ('Petr', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee2322b0>), ('Elena', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee2323a0>)]


'(1) PythonRDD[290] at collect at /tmp/ipykernel_109/2934942739.py:3 []\n |  MapPartitionsRDD[289] at mapPartitions at PythonRDD.scala:145 []\n |  ShuffledRDD[288] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(1) PairwiseRDD[287] at groupByKey at /tmp/ipykernel_109/2934942739.py:2 []\n    |  PythonRDD[286] at groupByKey at /tmp/ipykernel_109/2934942739.py:2 []\n    |  ParallelCollectionRDD[235] at readRDDFromFile at PythonRDD.scala:274 []'

# Joining

In [76]:
profileData = [("Ivan", "Java"), ("Elena", "Scala"), ("Petr", "Scala")]
programmerProfiles = sc.parallelize(profileData)
programmerProfiles.collect()

[('Ivan', 'Java'), ('Elena', 'Scala'), ('Petr', 'Scala')]

In [77]:
# RDD joining possible for only tuples

joined = programmerProfiles.join(codeRows)
print(joined.toDebugString().decode("utf-8"))
joined.collect()

(2) PythonRDD[301] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[300] at mapPartitions at PythonRDD.scala:145 []
 |  ShuffledRDD[299] at partitionBy at <unknown>:0 []
 +-(2) PairwiseRDD[298] at join at /tmp/ipykernel_109/3472480033.py:3 []
    |  PythonRDD[297] at join at /tmp/ipykernel_109/3472480033.py:3 []
    |  UnionRDD[296] at union at NativeMethodAccessorImpl.java:0 []
    |  PythonRDD[294] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[293] at readRDDFromFile at PythonRDD.scala:274 []
    |  PythonRDD[295] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[235] at readRDDFromFile at PythonRDD.scala:274 []


[('Elena', ('Scala', 290)),
 ('Elena', ('Scala', 300)),
 ('Ivan', ('Java', 240)),
 ('Petr', ('Scala', 39))]

In [92]:
# cogroup is fullouter join with dividing array

data = [("Ivan", 240), ("Petr", 39), ("Elena", 290), ("Elena", 300)]
codeRows = sc.parallelize(data)
codeRows = programmerProfiles.cogroup(codeRows)
codeRows.take(5)

[('Elena',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee2a42e0>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ee2a4400>)),
 ('Ivan',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee2a3460>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ee2a3280>)),
 ('Petr',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee2a3430>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ee2a3340>))]

In [81]:
# Sorting
programmerProfiles.cogroup(codeRows).sortByKey(False).collect()

# TODO Write code to show inner arry

[('Petr',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee280eb0>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ed8d5ca0>)),
 ('Ivan',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee232940>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ee232370>)),
 ('Elena',
  (<pyspark.resultiterable.ResultIterable at 0x7fc7ee2afaf0>,
   <pyspark.resultiterable.ResultIterable at 0x7fc7ee2afd60>))]

In [82]:
print("== CountByKey")
print(str(joined.countByKey()))

== CountByKey
defaultdict(<class 'int'>, {'Elena': 2, 'Ivan': 1, 'Petr': 1})


In [None]:
# or get all values by specific key
print("== Lookup")
print(str(joined.lookup("Elena")))

In [91]:
# codeRows keys only
print("== Keys")
codeRows.keys().collect()

== Keys


['Elena', 'Ivan', 'Petr']

In [97]:
# Print values only
print("== Value")
codeRows.values().take(4)

== Value


[(<pyspark.resultiterable.ResultIterable at 0x7fc7ee28fca0>,
  <pyspark.resultiterable.ResultIterable at 0x7fc7ee2a9040>),
 (<pyspark.resultiterable.ResultIterable at 0x7fc7ed8d5e20>,
  <pyspark.resultiterable.ResultIterable at 0x7fc7ed8d5f70>),
 (<pyspark.resultiterable.ResultIterable at 0x7fc7edf6b460>,
  <pyspark.resultiterable.ResultIterable at 0x7fc7ee2a46a0>)]

In [109]:
df = spark.read.option("inferSchema", "true").json("data/movies")
movies_rdd = df.rdd

print(movies_rdd.take(5))

[Row(Creative_Type=None, Director=None, Distributor='Gramercy', IMDB_Rating=6.1, IMDB_Votes=1071, MPAA_Rating='R', Major_Genre=None, Production_Budget=8000000, Release_Date='12-Jun-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='The Land Girls', US_DVD_Sales=None, US_Gross=146083, Worldwide_Gross=146083), Row(Creative_Type=None, Director=None, Distributor='Strand', IMDB_Rating=6.9, IMDB_Votes=207, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=300000, Release_Date='7-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='First Love, Last Rites', US_DVD_Sales=None, US_Gross=10876, Worldwide_Gross=10876), Row(Creative_Type=None, Director=None, Distributor='Lionsgate', IMDB_Rating=6.8, IMDB_Votes=865, MPAA_Rating=None, Major_Genre='Comedy', Production_Budget=250000, Release_Date='28-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='I Married a Strange Person', US_DVD_Sales=None, US_Gross=203134,

In [114]:
dist_movies = movies_rdd.map(lambda row: row.Major_Genre).distinct()
dist_movies.take(5)

[None, 'Drama', 'Comedy', 'Musical', 'Thriller/Suspense']

In [118]:
# spark_dsl_only_df = col("Major_Genre") == "Drama" && col("IMDB_Rating") > 6
python_lambda_rdd = lambda movie: movie.Major_Genre & movie.IMDB_Rating > 6

s_movies = movies_rdd.filter(python_lambda_rdd)
print(s_movies.take(5))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 170.0 failed 1 times, most recent failure: Lost task 0.0 in stage 170.0 (TID 173) (8c82e8c1a1d9 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 611, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1562, in takeUpToNumLeft
    yield next(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_109/2136324967.py", line 2, in <lambda>
TypeError: unsupported operand type(s) for &: 'NoneType' and 'float'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:555)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:713)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:695)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:508)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor88.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 611, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1562, in takeUpToNumLeft
    yield next(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_109/2136324967.py", line 2, in <lambda>
TypeError: unsupported operand type(s) for &: 'NoneType' and 'float'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:555)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:713)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:695)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:508)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
