# Creating RDDs

In [2]:
import os
os.environ["SPARK_HOME"] = "/opt/homebrew/Cellar/apache-spark/3.5.2/libexec"

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/22 19:28:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/22 19:28:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/22 19:28:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/09/22 19:28:26 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [4]:
# Creating a PySpark dataframe from a list of rows: (without schema information)
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2024, 5, 11), e=datetime(2024, 5, 11, 18, 50, 51)),
    Row(a=2, b=3., c='string2', d=date(2024, 5, 12), e=datetime(2024, 5, 12, 12, 22, 45)),
    Row(a=3, b=4., c='string3', d=date(2024, 5, 13), e=datetime(2024, 5, 13, 22, 1, 51)),
    Row(a=4, b=5., c='string4', d=date(2024, 5, 14), e=datetime(2024, 5, 14, 0, 0)),
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
# Creating a PySpark dataframe with schema:

df = spark.createDataFrame([
    (1, 2., 'string1', date(2024, 5, 11), datetime(2024, 5, 11, 18, 50, 51)),
    (2, 3., 'string2', date(2024, 5, 12), datetime(2024, 5, 12, 12, 22, 45)),
    (3, 4., 'string3', date(2024, 5, 13), datetime(2024, 5, 13, 22, 1, 51)),
    (4, 5., 'string4', date(2024, 5, 14), datetime(2024, 5, 14, 0, 0)),
],
    schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
# Creating a PySpark dataframe from pandas dataframe:

pandas_df = pd.DataFrame({
    'a': [1, 2, 3, 5],
    'b': [5., 6., 7., 8.],
    'c': ['string1', 'string2', 'string3', 'string4'],
    'd': [date(2024, 5, 15), date(2024, 5, 16), date(2024, 5, 17), date(2024, 5, 18)],
    'e': [datetime(2024, 5, 15, 18, 15, 51), datetime(2024, 5, 13, 22, 1, 51), datetime(2024, 5, 22, 11, 2, 2), datetime(2024, 5, 31, 3, 13, 3)]
})
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [7]:
# Create a PySpark dataframe from an RDD:

rdd = spark.sparkContext.parallelize([
    (1, 2., 'string1', date(2024, 5, 11), datetime(2024, 5, 11, 18, 50, 51)),
    (2, 3., 'string2', date(2024, 5, 12), datetime(2024, 5, 12, 12, 22, 45)),
    (3, 4., 'string3', date(2024, 5, 13), datetime(2024, 5, 13, 22, 1, 51)),
    (4, 5., 'string4', date(2024, 5, 14), datetime(2024, 5, 14, 0, 0)),
])
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df


                                                                                

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [8]:
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2024-05-11|2024-05-11 18:50:51|
|  2|3.0|string2|2024-05-12|2024-05-12 12:22:45|
|  3|4.0|string3|2024-05-13|2024-05-13 22:01:51|
|  4|5.0|string4|2024-05-14|2024-05-14 00:00:00|
+---+---+-------+----------+-------------------+



In [9]:
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [10]:
# Configuration of PySpark dataframes representation:

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.maxNumRows', 3)
df

a,b,c,d,e
1,2.0,string1,2024-05-11,2024-05-11 18:50:51
2,3.0,string2,2024-05-12,2024-05-12 12:22:45
3,4.0,string3,2024-05-13,2024-05-13 22:01:51


In [11]:
df.show(2, vertical=True)  # one can show longer rows in vertical view

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2024-05-11          
 e   | 2024-05-11 18:50:51 
-RECORD 1------------------
 a   | 2                   
 b   | 3.0                 
 c   | string2             
 d   | 2024-05-12          
 e   | 2024-05-12 12:22:45 
only showing top 2 rows



In [12]:
df.columns

['a', 'b', 'c', 'd', 'e']

In [13]:
df.select(['a', 'b', 'c']).describe().show()  # statistics

+-------+------------------+------------------+-------+
|summary|                 a|                 b|      c|
+-------+------------------+------------------+-------+
|  count|                 4|                 4|      4|
|   mean|               2.5|               3.5|   NULL|
| stddev|1.2909944487358056|1.2909944487358056|   NULL|
|    min|                 1|               2.0|string1|
|    max|                 4|               5.0|string4|
+-------+------------------+------------------+-------+



In [14]:
df.collect()  # this can throw out-of-memory error

[Row(a=1, b=2.0, c='string1', d=datetime.date(2024, 5, 11), e=datetime.datetime(2024, 5, 11, 18, 50, 51)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2024, 5, 12), e=datetime.datetime(2024, 5, 12, 12, 22, 45)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2024, 5, 13), e=datetime.datetime(2024, 5, 13, 22, 1, 51)),
 Row(a=4, b=5.0, c='string4', d=datetime.date(2024, 5, 14), e=datetime.datetime(2024, 5, 14, 0, 0))]

In [15]:
# so .take() or .tail() are recommended instead:

df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2024, 5, 11), e=datetime.datetime(2024, 5, 11, 18, 50, 51))]

In [16]:
# converting PySpark dataframe to Pandas also uses .collect() under the hood, so the memory issues apply

df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2024-05-11,2024-05-11 18:50:51
1,2,3.0,string2,2024-05-12,2024-05-12 12:22:45
2,3,4.0,string3,2024-05-13,2024-05-13 22:01:51
3,4,5.0,string4,2024-05-14,2024-05-14 00:00:00


### Selecting data:

In [17]:
df.b  # lazily evaluated, does not return values

Column<'b'>

In [18]:
type(df.b)

pyspark.sql.column.Column

In [19]:
from pyspark.sql import Column
from pyspark.sql.functions import upper

type(df.c) == type(upper(df.c)) == type(df.c.isNull())  # all are of types Column

True

In [20]:
df.select(df.b).show()

+---+
|  b|
+---+
|2.0|
|3.0|
|4.0|
|5.0|
+---+



In [21]:
# assigning new column:

df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2024-05-11|2024-05-11 18:50:51|STRING1|
|  2|3.0|string2|2024-05-12|2024-05-12 12:22:45|STRING2|
|  3|4.0|string3|2024-05-13|2024-05-13 22:01:51|STRING3|
|  4|5.0|string4|2024-05-14|2024-05-14 00:00:00|STRING4|
+---+---+-------+----------+-------------------+-------+



In [22]:
df.filter(df.a==2).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  2|3.0|string2|2024-05-12|2024-05-12 12:22:45|
+---+---+-------+----------+-------------------+



In [23]:
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_1(series: pd.Series) -> pd.Series:
    return series + 1

df.select(pandas_plus_1(df.a)).show()

                                                                                

+----------------+
|pandas_plus_1(a)|
+----------------+
|               2|
|               3|
|               4|
|               5|
+----------------+



In [24]:
def pandas_filter_func(iterator):
    for pandas_df in iterator:
        yield pandas_df[pandas_df.a == 2]

df.mapInPandas(pandas_filter_func, schema = df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  2|3.0|string2|2024-05-12|2024-05-12 12:22:45|
+---+---+-------+----------+-------------------+



### Grouping data

In [25]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [26]:
df.groupBy('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [27]:
def minus_mean(pandas_df):
    return pandas_df.assign(v1=pandas_df.v1 - pandas_df.v1.mean())

df.groupBy('color').applyInPandas(minus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [28]:
# grouping:

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1'))

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2'))

In [29]:
def asof_join(left, right):
    return pd.merge_asof(left, right, on='time', by='id')

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    asof_join, schema='time int, id int, v1 double, v2 string'
).show()

+--------+---+---+---+
|    time| id| v1| v2|
+--------+---+---+---+
|20000101|  1|1.0|  x|
|20000102|  1|3.0|  x|
|20000101|  2|2.0|  y|
|20000102|  2|4.0|  y|
+--------+---+---+---+



### I/O file formats

In [30]:
import os


if not os.path.exists('foo.csv'):
    df.write.csv('foo.csv', header = True)
    spark.read.csv('foo.csv', header = True).show()

In [58]:
if not os.path.exists('Exercise Files/var.parquet'):
    df.write.parquet('Exercise Files/var.parquet')
    spark.read.parquet('Exercise Files/var.parquet').show()

24/09/22 19:42:53 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/09/22 19:42:53 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/09/22 19:42:53 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
24/09/22 19:42:54 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/09/22 19:42:54 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|carrot|  5| 50|
| blue| grape|  4| 40|
|  red|banana|  7| 70|
|  red|banana|  1| 10|
|  red|carrot|  3| 30|
|  red| grape|  8| 80|
+-----+------+---+---+



In [57]:
if not os.path.exists('Exercise Files/loo.orc'):
    df.write.orc('Exercise Files/loo.orc')
    spark.read.orc('Exercise Files/loo.orc').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  5| 50|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



### Working with SQL

DataFrame and Spark SQL share the same execution engine so they can be interchangeably used seamlessly.

In [33]:
df.createOrReplaceTempView('table1')
spark.sql('SELECT COUNT(*) FROM table1').show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



In [34]:
@pandas_udf('integer')
def add_two(series: pd.Series) -> pd.Series:
    return series + 2

spark.udf.register('add_two', add_two)
spark.sql('SELECT add_two(v1) FROM table1').show()

+-----------+
|add_two(v1)|
+-----------+
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
|         10|
+-----------+



In [35]:
from pyspark.sql.functions import expr

df.selectExpr('add_two(v1)').show()

+-----------+
|add_two(v1)|
+-----------+
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
|         10|
+-----------+



In [36]:
df.select(expr('COUNT(*)') > 1).show()

+--------------+
|(count(1) > 1)|
+--------------+
|          true|
+--------------+



In [37]:
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession

s = pd.Series([3, 2, 1, 6, np.nan, 3, 67])
s



0     3.0
1     2.0
2     1.0
3     6.0
4     NaN
5     3.0
6    67.0
dtype: float64

Creating pandas-on-Spark DataFrame

In [38]:
psdf = ps.DataFrame({
    'a': [1, 2, 3, 4, 5, 5, 6, 7, 8],
    'b': [10, 20, 30, 40, 50, 50, 60, 70, 80],
    'c': ['one', 'two', 'three', 'four', 'five', 'five', 'six', 'seven', 'eight']},
    index=[100, 200, 300, 400, 500, 550, 600, 700, 800])
psdf

CodeCache: size=131072Kb used=36455Kb max_used=36455Kb free=94616Kb
 bounds [0x00000001071d8000, 0x00000001095a8000, 0x000000010f1d8000]
 total_blobs=13485 nmethods=12534 adapters=861
 compilation: disabled (not enough contiguous free space left)




Unnamed: 0,a,b,c
100,1,10,one
200,2,20,two
300,3,30,three
400,4,40,four
500,5,50,five
550,5,50,five
600,6,60,six
700,7,70,seven
800,8,80,eight


In [39]:
dates = pd.date_range('20240101', periods=9)
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09'],
              dtype='datetime64[ns]', freq='D')

In [40]:
pdf = pd.DataFrame(np.random.randn(9, 4), index=dates, columns=list('ABCD'))
pdf

Unnamed: 0,A,B,C,D
2024-01-01,-0.435276,-1.286075,0.195799,0.766595
2024-01-02,0.005854,-0.755499,0.056252,0.376563
2024-01-03,-0.092601,1.211071,-0.979399,-1.311218
2024-01-04,-0.85165,0.546247,0.337879,0.284074
2024-01-05,-1.117155,-1.425612,0.948037,0.093289
2024-01-06,-0.307171,-0.043208,0.96458,0.545044
2024-01-07,-0.705668,2.290857,0.392779,-0.584824
2024-01-08,-0.275698,1.819392,0.033441,0.271112
2024-01-09,0.74158,-1.570105,-0.300179,0.595464


In [41]:
psdf = ps.from_pandas(pdf)
psdf

Unnamed: 0,A,B,C,D
2024-01-01,-0.435276,-1.286075,0.195799,0.766595
2024-01-02,0.005854,-0.755499,0.056252,0.376563
2024-01-03,-0.092601,1.211071,-0.979399,-1.311218
2024-01-04,-0.85165,0.546247,0.337879,0.284074
2024-01-05,-1.117155,-1.425612,0.948037,0.093289
2024-01-06,-0.307171,-0.043208,0.96458,0.545044
2024-01-07,-0.705668,2.290857,0.392779,-0.584824
2024-01-08,-0.275698,1.819392,0.033441,0.271112
2024-01-09,0.74158,-1.570105,-0.300179,0.595464


Another possible way:

In [42]:
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pdf)
sdf.show()

+--------------------+--------------------+--------------------+-------------------+
|                   A|                   B|                   C|                  D|
+--------------------+--------------------+--------------------+-------------------+
| -0.4352763088755493| -1.2860745894213137| 0.19579899623374467| 0.7665950661427242|
|0.005854488363323174| -0.7554991669723404| 0.05625240940718462|  0.376562885949065|
| -0.0926013177751083|   1.211071256175591| -0.9793991054856823|-1.3112178140091402|
| -0.8516499967336203|  0.5462470431668168|  0.3378787722697738| 0.2840740234248561|
| -1.1171554454449573| -1.4256121826494943|  0.9480372013614239|0.09328863662185087|
| -0.3071710997750796|-0.04320818153859925|  0.9645801553001758| 0.5450441562641292|
| -0.7056681172799276|  2.2908570606642193|  0.3927786080536894|-0.5848239605837317|
|-0.27569809343643403|   1.819391817141453|0.033441395582421256| 0.2711124720633295|
|  0.7415801517379037|  -1.570104570803742|-0.30017894434508413| 

In [43]:
psdf = sdf.pandas_api()
psdf

Unnamed: 0,A,B,C,D
0,-0.435276,-1.286075,0.195799,0.766595
1,0.005854,-0.755499,0.056252,0.376563
2,-0.092601,1.211071,-0.979399,-1.311218
3,-0.85165,0.546247,0.337879,0.284074
4,-1.117155,-1.425612,0.948037,0.093289
5,-0.307171,-0.043208,0.96458,0.545044
6,-0.705668,2.290857,0.392779,-0.584824
7,-0.275698,1.819392,0.033441,0.271112
8,0.74158,-1.570105,-0.300179,0.595464


### Missing Data

In [44]:
pdf1 = pdf.reindex(index=dates[:4], columns=list(pdf.columns) + ['E'])
pdf1.loc[dates[0]:dates[1], 'E'] = 7
pdf1

Unnamed: 0,A,B,C,D,E
2024-01-01,-0.435276,-1.286075,0.195799,0.766595,7.0
2024-01-02,0.005854,-0.755499,0.056252,0.376563,7.0
2024-01-03,-0.092601,1.211071,-0.979399,-1.311218,
2024-01-04,-0.85165,0.546247,0.337879,0.284074,


In [45]:
psdf1 = ps.from_pandas(pdf1)
psdf1

Unnamed: 0,A,B,C,D,E
2024-01-01,-0.435276,-1.286075,0.195799,0.766595,7.0
2024-01-02,0.005854,-0.755499,0.056252,0.376563,7.0
2024-01-03,-0.092601,1.211071,-0.979399,-1.311218,
2024-01-04,-0.85165,0.546247,0.337879,0.284074,


In [46]:
psdf1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2024-01-01,-0.435276,-1.286075,0.195799,0.766595,7.0
2024-01-02,0.005854,-0.755499,0.056252,0.376563,7.0


In [47]:
psdf1.fillna(value=0)

Unnamed: 0,A,B,C,D,E
2024-01-01,-0.435276,-1.286075,0.195799,0.766595,7.0
2024-01-02,0.005854,-0.755499,0.056252,0.376563,7.0
2024-01-03,-0.092601,1.211071,-0.979399,-1.311218,0.0
2024-01-04,-0.85165,0.546247,0.337879,0.284074,0.0


In [48]:
psdf1.mean()

A   -0.343418
B   -0.071064
C   -0.097367
D    0.029004
E    7.000000
dtype: float64

Spark Configurations

In [49]:
import warnings

prev = spark.conf.get('spark.sql.execution.arrow.pyspark.enabled')
ps.set_option('compute.default_index_type', 'distributed')

warnings.filterwarnings('ignore')

In [50]:
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
%timeit ps.range(300000).to_pandas()

92.8 ms ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)
%timeit ps.range(300000).to_pandas()

676 ms ± 34.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
ps.reset_option('compute.default_index_type')
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', prev)  # returning to previous value

### Grouping PSDF

In [53]:
psdf = ps.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

In [54]:
type(psdf)

pyspark.pandas.frame.DataFrame

In [55]:
psdf.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,4.601896,-2.412322
bar,-0.811366,-2.455529


In [56]:
psdf.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,1.411328,-0.72249
bar,one,-0.818752,-0.99895
foo,two,0.638034,0.257881
bar,three,0.131684,-0.369456
bar,two,-0.124298,-1.087122
foo,three,0.503172,-1.483104
