## https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html

In [1]:
# setup spark on linux
from pyspark.sql import *

if SparkSession.getActiveSession() == None:
    print("Active spark session not found")
else:
    print("Active spark session found")

spark = (SparkSession.builder
         #.master("local") # local- no parallelizm at all, local[2] - 2 cores, local[*] - as many cores as local logical cores
         .appName("SparkSession#1")
         .enableHiveSupport() # enableHiveSupport() needed to make data persistent... 
         #.config("spark.driver.allowMultipleContexts", "true")
         .config("spark.sql.cbo.enabled", "true")
         .config("spark.sql.cbo.optimizer", "true")
         .getOrCreate())

print('spark version:', spark.version)

spark.sql("Show databases").show()
spark.sql("select current_database()").show()
#spark.sql("use mytestdb")
spark.sql("Show tables").show()
spark.sql("Show views").show()

print('Done.')

Active spark session not found


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/25 12:37:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/25 12:37:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/25 12:37:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


spark version: 3.5.5


25/04/25 12:37:55 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/04/25 12:37:55 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/04/25 12:37:57 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/04/25 12:37:57 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jovyan@172.17.0.2


+---------+
|namespace|
+---------+
|  default|
| mytestdb|
|   testdb|
+---------+

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+

+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
+---------+--------+-----------+

Done.


25/04/25 12:37:59 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


In [2]:
# stop the session if needed (for another notebook)
spark.stop()
print('Done.')

Done.


In [4]:
# create database
spark.sql("create database if not exists mytestdb")
spark.sql("Show databases").show()
spark.sql("use mytestdb")
spark.sql("select current_database()").show()

25/04/03 14:13:05 WARN ObjectStore: Failed to get database mytestdb, returning NoSuchObjectException
25/04/03 14:13:05 WARN ObjectStore: Failed to get database mytestdb, returning NoSuchObjectException
25/04/03 14:13:05 WARN ObjectStore: Failed to get database mytestdb, returning NoSuchObjectException


+---------+
|namespace|
+---------+
|  default|
| mytestdb|
+---------+

+------------------+
|current_database()|
+------------------+
|          mytestdb|
+------------------+



In [164]:
# create dataframes

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# from list, just column names
l = [
    (1, 'name_1', 21),
    (2, 'name_2', 22),
    (3, 'name_3', 23),
     ]

df = spark.createDataFrame(l, schema= ["id", "name", "age"])
df.show()

# from list, separate schema
l = [
    (1, 'name_1', 21),
    (2, 'name_2', 22),
    (3, 'name_3', 23),
     ]
df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
df = spark.createDataFrame(l, df_schema)
df.show()

# from rows implicite schema
df = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21),
    Row(id = 12, name = "name_12", age = 32),
    Row(id = 13, name = "name_13", age = 33),
])
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|name_1| 21|
|  2|name_2| 22|
|  3|name_3| 23|
+---+------+---+

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|name_1| 21|
|  2|name_2| 22|
|  3|name_3| 23|
+---+------+---+

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| name_1| 21|
| 12|name_12| 32|
| 13|name_13| 33|
+---+-------+---+



In [123]:
###########
# overview
###########
# df --> temp view
df = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21, ts = datetime.now()),
    Row(id = 12, name = "name_12", age = 32, ts = datetime.now()),
    Row(id = 13, name = "name_13", age = 33, ts = datetime.now()),
])
df.createOrReplaceTempView('v_df')
df_rslt = spark.sql('select * from v_df where id > 2')
print('v_df:')
df_rslt.show()
spark.catalog.dropTempView('v_df')

# df --> table
df = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21, ts = datetime.now()),
    Row(id = 12, name = "name_12", age = 32, ts = datetime.now()),
    Row(id = 13, name = "name_13", age = 33, ts = datetime.now()),
])

# save as managed table (as no path given)
(
    df.write
    .mode('overwrite')
    .format('parquet')
    .saveAsTable('df_table')
)

# table --> df
df_rslt = spark.sql('select * from df_table')
print('df_rslt:')
df_rslt.show()
spark.sql('drop table if exists df_table')

v_df:
+---+-------+---+--------------------+
| id|   name|age|                  ts|
+---+-------+---+--------------------+
| 12|name_12| 32|2025-03-19 19:19:...|
| 13|name_13| 33|2025-03-19 19:19:...|
+---+-------+---+--------------------+

df_rslt:
+---+-------+---+--------------------+
| id|   name|age|                  ts|
+---+-------+---+--------------------+
|  1| name_1| 21|2025-03-19 19:19:...|
| 12|name_12| 32|2025-03-19 19:19:...|
| 13|name_13| 33|2025-03-19 19:19:...|
+---+-------+---+--------------------+



DataFrame[]

In [49]:
# dataframe column manipulations

from pyspark.sql.functions import current_timestamp
df = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21),
    Row(id = 12, name = "name_12", age = None),
    Row(id = 13, name = None, age = 23),
])
df.show()

# handling empty values (fillna() is alias for .na.fill())
df.na.replace(21, 99).show()  # simply replace all values with new one
df.na.replace([21,23], [99, 24]).show()  # replace 21 to 99 and 23 to 24!
df.na.drop().show() # drop all rows having null values in any column
df.na.fill("?").show()  # fill all null values with "?"
df.na.fill(value=0, subset = ['age']).show() # fill null values of age with 0
df=df.na.fill({'name': 'unknown', 'age': 0}) # fill null name with unknown and null age with 0
df.show()

# add column
df_new1 = df.withColumn("ingestion_timestamp", current_timestamp())
df_new1.show()

# rename column
df_new2 = df_new1.withColumnRenamed("ingestion_timestamp", "ingestion_ts")
df_new2.show()

# drop column
df_new3 = df_new1.drop("ingestion_timestamp")
df_new3.show()

# pivot
df_new4 = df_new3.groupby("age").pivot("name").min("id")
df_new4.show()



+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1| name_1|  21|
| 12|name_12|NULL|
| 13|   NULL|  23|
+---+-------+----+

+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1| name_1|  99|
| 12|name_12|NULL|
| 13|   NULL|  23|
+---+-------+----+

+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1| name_1|  99|
| 12|name_12|NULL|
| 13|   NULL|  24|
+---+-------+----+

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|name_1| 21|
+---+------+---+

+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1| name_1|  21|
| 12|name_12|NULL|
| 13|      ?|  23|
+---+-------+----+

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| name_1| 21|
| 12|name_12|  0|
| 13|   NULL| 23|
+---+-------+---+

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| name_1| 21|
| 12|name_12|  0|
| 13|unknown| 23|
+---+-------+---+

+---+-------+---+--------------------+
| id|   name|age| ingestion_timestamp|
+---+-------+---+--------------------+
|  1| n

In [98]:
# create small_table and large_table

spark.sql("USE mytestdb")
spark.sql("""
create table if not exists small_table as
select 0 id, 'name00' name union all
select 1 id, 'name01' name union all
select 2 id, 'name02' name union all
select 3 id, 'name03' name union all
select 4 id, 'name04' name union all
select 5 id, 'name05' name union all
select 6 id, 'name06' name union all
select 7 id, 'name07' name union all
select 8 id, 'name08' name union all
select 9 id, 'name09' name
""")

spark.sql("drop table if exists large_table")

spark.sql("""
create table if not exists large_table as
select row_number() over (order by 1) id,
        'name' || lpad(row_number() over (order by 1), 8, '0') name
from small_table t1
    inner join
    small_table t2
    inner join
    small_table t3
    inner join
    small_table t4
    inner join
    small_table t5
    inner join
    small_table t6
""")

spark.sql("select count(1) as cnt from large_table").show()

25/03/19 18:32:49 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
25/03/19 18:32:49 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
25/03/19 18:32:49 WARN HiveMetaStore: Location: file:/home/jovyan/work/various_tests/spark/spark-warehouse/mytestdb.db/large_table specified for non-external table:large_table
25/03/19 18:32:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19 18:32:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/19

+-------+
|    cnt|
+-------+
|1000000|
+-------+



In [190]:
import datetime

d_now = datetime.datetime.now() 
d_10 = d_now + datetime.timedelta(days=10)
print(d, d_10)

2025-03-19 20:42:51.383042 2025-03-29 20:45:35.335687


In [112]:
# perf test
from pyspark.sql.functions import col

ct = datetime.now()
print('joined as df op:')

# 100000x1000000
df_t1 = spark.sql('select t1.* from small_table t1 cross join small_table t2 cross join small_table t3 cross join small_table t4 cross join small_table t5')
df_t2 = spark.sql('select * from large_table')

df_rst1 = (
    df_t1.alias('t1')
     .join(df_t2.alias('t2'), 
          (col("t1.id") == col("t2.id")) & (col("t1.name") > col("t2.name")), "full")
    .filter(~df_t2.name.like('%e1%')) #  not like 
    .groupBy()
    .count().withColumnRenamed('count', 'cnt') # the only way instead of alias?
).show()

print(f"done in {(datetime.now() - ct)}.")

ct = datetime.now()
print('joined as sql op:')
# join as sql using local temp view
df_t1.createOrReplaceTempView('t1')  # df --> temp table
df_t2.createOrReplaceTempView('t2')  # df --> table

# table --> df
df_rst1 = spark.sql('''
select count(1) as cnt
from t1 full outer join t2 
on (t1.id = t2.id)
where t2.name not like '%e1%'
''')
df_rst1.show()
spark.catalog.dropTempView('t1')
spark.catalog.dropTempView('t2')
print(f"done in {(datetime.now() - ct)}.")

joined as df op:
+-------+
|    cnt|
+-------+
|1089991|
+-------+

done in 0:00:01.070806.
joined as sql op:
+-------+
|    cnt|
+-------+
|1089991|
+-------+

done in 0:00:00.628944.


In [10]:
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# sql UDF functions
def f1(s: str) -> str:
    if len(s) < 2:
        return s
    elif len(s) == 2:
        return s[0] + s[1].upper()
    else:
        return s[0] + s[1].upper() + s[2:]

def f2(dt: datetime, n: int) -> TimestampType:
    return dt + datetime.timedelta(days = n)

l = [
    (1, 'name_1', 21, datetime.datetime.now()),
    (2, 'name_2', 22, datetime.datetime.now()),
    (3, 'name_3', 23, datetime.datetime.now()),
     ]
df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("ts", TimestampType(), True)
])

df = spark.createDataFrame(l, df_schema)
df.show()

spark.udf.register("f1", f1)
spark.udf.register("f2", f2, TimestampType())
df.createOrReplaceTempView("udf_test")
spark.sql('''
select id, f1(name) as f1_name, 
    age, 
    f2(ts, 5) as fs_5,
    date_format(f2(ts, 5),"yyyy-MM-dd HH:mm:ss") as fs_5_fmt
from udf_test
''').show()
spark.catalog.dropTempView("udf_test")

+---+------+---+--------------------+
| id|  name|age|                  ts|
+---+------+---+--------------------+
|  1|name_1| 21|2025-03-24 15:35:...|
|  2|name_2| 22|2025-03-24 15:35:...|
|  3|name_3| 23|2025-03-24 15:35:...|
+---+------+---+--------------------+



25/03/24 15:35:23 WARN SimpleFunctionRegistry: The function f1 replaced a previously registered function.
25/03/24 15:35:23 WARN SimpleFunctionRegistry: The function f2 replaced a previously registered function.


+---+-------+---+--------------------+-------------------+
| id|f1_name|age|                fs_5|           fs_5_fmt|
+---+-------+---+--------------------+-------------------+
|  1| nAme_1| 21|2025-03-29 15:35:...|2025-03-29 15:35:23|
|  2| nAme_2| 22|2025-03-29 15:35:...|2025-03-29 15:35:23|
|  3| nAme_3| 23|2025-03-29 15:35:...|2025-03-29 15:35:23|
+---+-------+---+--------------------+-------------------+



True

In [20]:
# joins, filter

from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

spark.sql("use mytestdb")
spark.sql("Show tables").show()
spark.sql("Show views").show()
spark.sql("SELECT count(1) FROM large_table").show()
spark.sql("SELECT count(1) FROM small_table").show()

t1 = [
    (1, 'name_1', 21, datetime.now()),
    (2, 'name_2', 22, datetime.now()),
    (3, 'name_3', 23, datetime.now()),
     ]
t1_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("ts", DateType(), True)
])

# from list
df_t1 = spark.createDataFrame(t1, schema = t1_schema)
df_t1.printSchema()
df_t1.show()

# from rows of data
df_t2 = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21, ts = datetime.now()),
    Row(id = 12, name = "name_12", age = 32, ts = datetime.now()),
    Row(id = 13, name = "name_13", age = 33, ts = datetime.now()),
])

df_t2.printSchema()
df_t2.show()


print('joined as df op:')
# join as df
from pyspark.sql.functions import col
df_rst1 = (
    df_t1.alias('t1')
    .filter(df_t1.age  > 5)  # <, ==, >, !=, isin(), &,|, 
    .filter(df_t1.age.isin([32, 33]))  # <, ==, >, !=, isin(), &,|, startswith(), endswith(), contains(), like('%x%')
    .filter(df_t1.name.like('%am%'))
    .filter(~df_t1.name.like('%xx%')) #  not like
    .filter("t1.age > 5 and t1.age in (32,33) and t1.name like '%am%'") # simple spark sql where condition copy
    .join(df_t2.alias('t2'), 
          (col("t1.id") == col("t2.id")) & (col("t1.name") > col("t2.name")), "full")
     .select('t1.id', 't2.id')
).show()

print('joined as sql op:')
# join as sql using local temp view
df_t1.createTempView('t1')
df_t2.createTempView('t2')
df_rst1 = spark.sql('select * from t1 inner join t2 on (t1.id = t2.id)')
df_rst1.show()
spark.catalog.dropTempView('t1')
spark.catalog.dropTempView('t2')

# join as sql using global temp view
spark.sql('drop view if exists global_temp.t1')
spark.sql('drop view if exists global_temp.t2')
df_t1.createOrReplaceGlobalTempView('t1')
df_t2.createOrReplaceGlobalTempView('t2')
df_rst1 = spark.sql('select * from global_temp.t1 inner join global_temp.t2 on (t1.id = t2.id)')
df_rst1.show()
df_rst2 = spark.sql('select * from global_temp.t1 left outer join global_temp.t2 on (t1.id = t2.id)')
df_rst2.show()
df_rst2 = spark.sql('select * from global_temp.t1 right outer join global_temp.t2 on (t1.id = t2.id)')
df_rst2.show()
df_rst2 = spark.sql('select * from global_temp.t1 full outer join global_temp.t2 on (t1.id = t2.id)')
df_rst2.show()

# join hints (BROADCAST, MERGE, SHUFFLE_HASH and SHUFFLE_REPLICATE_NL)
df_rst1 = spark.sql('select /*+ BROADCAST(global_temp.t1)*/ * from global_temp.t1 inner join global_temp.t2 on (t1.id = t2.id)')

# drop temp views
spark.catalog.dropGlobalTempView('t1')
spark.catalog.dropGlobalTempView('t2')

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
| mytestdb| large_table|      false|
| mytestdb| small_table|      false|
|         |age_district|       true|
+---------+------------+-----------+

+---------+------------+-----------+
|namespace|    viewName|isTemporary|
+---------+------------+-----------+
|         |age_district|       true|
+---------+------------+-----------+

+--------+
|count(1)|
+--------+
| 1001000|
+--------+

+--------+
|count(1)|
+--------+
|      10|
+--------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- ts: date (nullable = true)

+---+------+---+----------+
| id|  name|age|        ts|
+---+------+---+----------+
|  1|name_1| 21|2025-03-30|
|  2|name_2| 22|2025-03-30|
|  3|name_3| 23|2025-03-30|
+---+------+---+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = 

True

In [179]:
# cache the DF to memory
df.cache()


# persist / cache (persist to StorageLevel.MEMORY_AND_DISK) data for optimization
# MEMORY_ONLY,MEMORY_AND_DISK, MEMORY_ONLY_SER, MEMORY_AND_DISK_SER, DISK_ONLY, MEMORY_ONLY_2,MEMORY_AND_DISK_2
df.persist(pyspark.StorageLevel.DISK_ONLY)  
df.show()

# checkpoint: write DF only to disk. by default, checkpoint happens after compute, as a separate job
spark.sparkContext.setCheckpointDir("./checkpoint")
spark.conf.set("spark.cleaner.referenceTracking.cleanCheckpoints", True)  # cleanup checkpoint dir if DF is out of scope
df_r1.checkpoint()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|name_1| 21|
|  2|name_2| 22|
|  3|name_3| 23|
+---+------+---+



25/03/19 20:36:32 WARN CacheManager: Asked to cache already cached data.


In [15]:
# save / remove dataframe as parquet

import os
import shutil
from datetime import datetime

# df --> table
df = spark.createDataFrame([
    Row(id = 1, name = "name_1", age = 21, ts = datetime.now()),
    Row(id = 12, name = "name_12", age = 32, ts = datetime.now()),
    Row(id = 13, name = "name_13", age = 33, ts = datetime.now()),
])
df.show()

# save as parquet file
if not os.path.exists('df_test1'):
    print('saving df as df_test1 parquet')
    df.write.format('parquet').save('df_test1')

# remove parquet file
del df
shutil.rmtree('df_test1', shutil.rmtree)

print('Done.')

+---+-------+---+--------------------+
| id|   name|age|                  ts|
+---+-------+---+--------------------+
|  1| name_1| 21|2025-03-30 20:52:...|
| 12|name_12| 32|2025-03-30 20:52:...|
| 13|name_13| 33|2025-03-30 20:52:...|
+---+-------+---+--------------------+

Done.


In [4]:
# read / select / group by performance tests
# partitioning (hash, range,round-robin)
# use repartition() and coalesce() for dynamic partitioning
# use  partitionBy() partition by multiple columns
# https://sparkbyexamples.com/pyspark/pyspark-partitionby-example/

from datetime import datetime
import random
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType


# creating large DF:
print("creating large list: ")
ct = datetime.now()
l = []
rows = 100
for c in range(rows):
    l.append([c, ('name-' + str(c)),  None if random.random() < 0.1 else random.randint(1, 100),  None if random.random() < 0.1 else random.randint(1000, 1004)])

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("district", IntegerType(), True),
])
print(f"Done in {datetime.now() - ct}")

print("creating large DF from list: ")
ct = datetime.now()
df = spark.createDataFrame(l, df_schema)
df.printSchema()
df.show()
print(f"Done in {datetime.now() - ct}")


# repartitioning DF in memory (weather partitionBy is used or not)
print("repartitioning DF: ")
ct = datetime.now()
#df4 = df.repartition(4, "age")
df4 = df.repartition(1)
print(f"Done ({df4.rdd.getNumPartitions()}) in {datetime.now() - ct}")

# data skew (repartition by maxRecordsPerFile in case partition size varies much)

# us partitionBy (physically partition, not logically as group by) to split data into partitions and write, creating subfolters, eg. age=77/district=1004.
# data file does not include the partitioned columns (they woud be redundant)
print("saving DF to disk: ")
ct = datetime.now()
(
    df4.write
    .mode("overwrite")
    .option("header", True)
    #.option("maxRecordsPerFile", 1000)
    .partitionBy("age", "district")  # test: partition the data (into folders & subfolders)
    .csv("my_partitioned_df")
)
print(f"Done in {datetime.now() - ct}")

# clear cache for perf testing
print("clearing cache: ")
ct = datetime.now()
spark.catalog.clearCache()
print(f"Done in {datetime.now() - ct}")

# reading DF
print("reading DF: ", end="")
ct = datetime.now()
read_df = (
    spark.read.option("header", True)
    #.csv("my_partitioned_df/age=28/district=1003")  # either read single partition (partition columns will not exist)
    .csv("my_partitioned_df")  # or read full DF, including partitioned columns
)
# cannot filter partition columns IF single partition is read. partitions do not exist (not written into datafile)!!!
# if full DF is read and filtered, performance is not bad
#read_df = read_df.filter((read_df.age == 28) & (read_df.district == 1003)).groupby("age").count()
read_df = read_df.groupBy("age").agg(sf.count(read_df.age)).sort(read_df.age.asc())

read_df.printSchema()
read_df.show()
print(f"Done in {datetime.now() - ct}")


# clear cache for perf testing
print("clearing cache: ")
ct = datetime.now()
spark.catalog.clearCache()
print(f"Done in {datetime.now() - ct}")

# reading DF as SQL
print("reading DF from file for SQL: ")
read_df = (
    spark
    .read
    .option("header", True)
    .csv("my_partitioned_df")
)
print(f"Done in {datetime.now() - ct}")

print("reading specific age and district dataset as SQL... ")
ct = datetime.now()
read_df.createOrReplaceTempView("AGE_DISTRICT")
spark.sql("select age, district, count(1) cnt from AGE_DISTRICT  where age=28 and district = 1003 group by age, district order by age, district").show()
print(f"Done in {datetime.now() - ct}")

# removing DF
print("removing DF: ")
ct = datetime.now()
del df
del df4
del read_df
print(f"Done in {datetime.now() - ct}")

creating large list: 
Done in 0:00:00.000399
creating large DF from list: 
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)

+---+-------+---+--------+
| id|   name|age|district|
+---+-------+---+--------+
|  0| name-0| 56|    NULL|
|  1| name-1| 24|    1000|
|  2| name-2| 99|    1003|
|  3| name-3| 40|    1000|
|  4| name-4| 49|    1000|
|  5| name-5| 33|    1000|
|  6| name-6| 40|    1000|
|  7| name-7| 51|    1000|
|  8| name-8| 93|    1001|
|  9| name-9| 92|    1003|
| 10|name-10| 96|    1001|
| 11|name-11| 12|    NULL|
| 12|name-12| 33|    1000|
| 13|name-13| 54|    1003|
| 14|name-14| 18|    1001|
| 15|name-15| 47|    1002|
| 16|name-16| 68|    1003|
| 17|name-17| 71|    1000|
| 18|name-18| 73|    1003|
| 19|name-19| 63|    1004|
+---+-------+---+--------+
only showing top 20 rows

Done in 0:00:00.152793
repartitioning DF: 
Done (1) in 0:00:00.107518
saving DF to disk: 


                                                                                

Done in 0:00:00.797781
clearing cache: 
Done in 0:00:00.000300
reading DF: root
 |-- age: integer (nullable = true)
 |-- count(age): long (nullable = false)

+----+----------+
| age|count(age)|
+----+----------+
|NULL|         0|
|   1|         2|
|   3|         2|
|   4|         1|
|   6|         2|
|   8|         1|
|  11|         1|
|  12|         1|
|  13|         1|
|  14|         1|
|  16|         1|
|  17|         1|
|  18|         2|
|  20|         2|
|  21|         1|
|  22|         1|
|  23|         2|
|  24|         1|
|  25|         1|
|  28|         1|
+----+----------+
only showing top 20 rows

Done in 0:00:01.289620
clearing cache: 
Done in 0:00:00.000285
reading DF from file for SQL: 
Done in 0:00:00.881780
reading specific age and district dataset as SQL... 
+---+--------+---+
|age|district|cnt|
+---+--------+---+
+---+--------+---+

Done in 0:00:00.055287
removing DF: 
Done in 0:00:00.000071


25/03/30 20:31:21 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
25/03/30 20:31:22 ERROR Executor: Exception in task 0.0 in stage 30.0 (TID 682)
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:485)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUti

AnalysisException: [UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY] Unsupported data source type for direct query on files: parquet; line 1 pos 14

In [28]:
# join performance tests DF vs SQL

from datetime import datetime
import random
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType


# creating large DF:
print("creating list: ")
ct = datetime.now()
l1 = []
rows = 10
for c in range(rows):
    l1.append([c, ('name-' + str(c)), None if random.random() < 0.1 else random.randint(1, 100), None if random.random() < 0.1 else random.randint(1000, 1004)])

l2 = []
rows = 10
for c in range(rows):
    l2.append([c, ('name-' + str(c)), None if random.random() < 0.1 else random.randint(1, 100), None if random.random() < 0.1 else random.randint(1000, 1004)])

l3 = []
rows = 10
for c in range(rows):
    l3.append([c, ('name-' + str(c)), None if random.random() < 0.1 else random.randint(1, 100), None if random.random() < 0.1 else random.randint(1000, 1004)])

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("district", IntegerType(), True),
])
print(f"Done in {datetime.now() - ct}")

print("creating DF from list: ")
ct = datetime.now()
df1 = spark.createDataFrame(l1, df_schema)
df1.printSchema()
df1.show()
df2 = spark.createDataFrame(l2, df_schema)
df2.printSchema()
df2.show()
df3 = spark.createDataFrame(l3, df_schema)
df3.printSchema()
df3.show()
print(f"Done in {datetime.now() - ct}")

del l1
del l2
del l3

print("DF join:")
df_query = df1.join(other=df2, on=["id", "name"], how='inner') # equi join on id=id, name=name, all columns retrieved, duplicates cause issues later
df_query = df1.join(other=df2, on=[df1.id == df2.id, df1.name == df2.name], how='inner') # normal join, all columns retrieved, duplicates cause issues later
df_query = (
                df1
                .join(other=df2, on=[df1.id == df2.id, df1.name == df2.name], how='inner')
                .select(df1.id, df1.name, df1.district, df2.id.alias("id_df2")) # normal join, all columns retrieved
            )

df_query = (
                df1
                .join(other=df2, on=[df1.id == df2.id, df1.name == df2.name], how='inner')
                .join(other=df3, on=[df2.id == df3.id, df2.name == df3.name], how='inner')
                .select(df1.id, df1.name, df1.age, df1.district, df2.district.alias("district_df2"), df3.district.alias("district_df3")) # normal join, all columns retrieved
            )
df_query.name = "5"
#df_query.describe().show()
df_query.show()


df1.createOrReplaceTempView('t1')
df2.createOrReplaceTempView('t2')
df3.createOrReplaceTempView('t3')
df_rst1 = spark.sql('''
select t1.*, t2.district district_df2, t3.district district_df3
from t1 inner join t2 on (t1.id = t2.id and t1.name = t2.name)
    inner join t3 on (t2.id = t3.id and t2.name = t3.name)
''')
df_rst1.show()

# removing DF
print("removing DF: ")
ct = datetime.now()
del df1
del df2
del df_query
del df_rst1

print(f"Done in {datetime.now() - ct}")

creating list: 
Done in 0:00:00.000388
creating DF from list: 
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)

+---+------+----+--------+
| id|  name| age|district|
+---+------+----+--------+
|  0|name-0|  79|    1000|
|  1|name-1|  20|    1004|
|  2|name-2|  37|    1003|
|  3|name-3|  30|    1003|
|  4|name-4|NULL|    1000|
|  5|name-5|NULL|    1002|
|  6|name-6|  56|    1004|
|  7|name-7|  60|    NULL|
|  8|name-8|NULL|    1002|
|  9|name-9|  34|    1001|
+---+------+----+--------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)

+---+------+----+--------+
| id|  name| age|district|
+---+------+----+--------+
|  0|name-0|  98|    1003|
|  1|name-1|  64|    1003|
|  2|name-2|NULL|    1001|
|  3|name-3|  64|    1000|
|  4|name-4|  11|    1000|
|  5|name-5|NULL|    1002|
|  6|na

In [12]:
# select, filter
from datetime import datetime
import random
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

l = []
rows = 100
for c in range(rows):
    l.append([c, ('name-' + str(c)), None if random.random() < 0.1 else random.randint(1, 100), random.randint(1000, 1004)])

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("district", IntegerType(), True),
])

df = spark.createDataFrame(l, df_schema)

# 1. select logic result (true/false)
df.select(df.age, (df['age'] + 1).alias('age'), 'age', (df['age'] == 56).alias('age'), df.age + 1).show()

# select count
df.select(count(df.age).alias("count(age)"), count(expr("1")).alias("count(exr('1'))")).show() # not null columns

# 2. real filter
df.filter(df.age == 56).show()

(
df.filter(df.age  > 5)  # <, ==, >, !=, isin(), &,|, 
     .filter(df.age.isin([32, 33]))  # <, ==, >, !=, isin(), &,|, startswith(), endswith(), contains(), like('%x%')
     .filter(df.name.like('%am%'))
     .filter(~df.name.like('%xx%')) #  not like
).show()

+----+----+----+-----+---------+
| age| age| age|  age|(age + 1)|
+----+----+----+-----+---------+
|NULL|NULL|NULL| NULL|     NULL|
|NULL|NULL|NULL| NULL|     NULL|
|  38|  39|  38|false|       39|
|  10|  11|  10|false|       11|
|  12|  13|  12|false|       13|
|  69|  70|  69|false|       70|
|  75|  76|  75|false|       76|
|  70|  71|  70|false|       71|
|  98|  99|  98|false|       99|
|  33|  34|  33|false|       34|
|  91|  92|  91|false|       92|
|  54|  55|  54|false|       55|
|   8|   9|   8|false|        9|
|  50|  51|  50|false|       51|
|   3|   4|   3|false|        4|
|  29|  30|  29|false|       30|
|  82|  83|  82|false|       83|
|  98|  99|  98|false|       99|
|  94|  95|  94|false|       95|
|  44|  45|  44|false|       45|
+----+----+----+-----+---------+
only showing top 20 rows

+----------+---------------+
|count(age)|count(exr('1'))|
+----------+---------------+
|        84|            100|
+----------+---------------+

+---+-------+---+--------+
| id|   n

In [23]:
# sort/orderby, limit, offset, group by, withColumnRenamed, alias
from datetime import datetime
import random
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

l = []
rows = 100
for c in range(rows):
    l.append([c, ('name-' + str(c % (rows//10))), None if random.random() < 0.1 else random.randint(1, 100), None if random.random() < 0.1 else random.randint(1000, 1004)])

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("district", IntegerType(), True),
])

df = spark.createDataFrame(l, df_schema)

print("schema:")
df.printSchema()

print("describe:")
df.describe().show()

print("explain:")
df.explain()

print("group:") # groupBy returns GroupedData
df.groupBy("name", "age").count().show() # GroupedData.count
df.groupBy(df.name, df.age).count().show() # GroupedData.count
df.groupBy(df.name, df.age).min().withColumnRenamed("min(id)","min_id").show() # GroupedData.min of all int columns!
df.groupBy(df.name, df.age).agg(min("age").alias("min_age")).show() # GroupedData.agg(#pyspark.sql.functions aggregate functions), similar to spark sql group by / aggregate selects
df.groupBy(df.name, df.age).agg({"district": "max"}).show() # GroupedData.agg(#dict{col, funct} of pyspark.sql.functions aggregate functions). similar to spark sql group by / aggregate selects

print("sort or orderby:")
df.orderBy(["name", "age"], ascending=[1, 0]).show()
df.orderBy([col("name"), col("age")], ascending=[1, 0]).show()
df.orderBy([df.name.asc(), df.age.desc()]).show()

df.sort(["name", "age"], ascending=[1,0]).show()
df.sort([df.name.asc(), df.age.desc()]).show()

print("group and sort together:")
df.groupBy("age").agg(count(df.age)).sort(df.age.asc()).show() # using count function

print("limit:")
df.limit(5).offset(3).show()

schema:
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)

describe:
+-------+------------------+------+------------------+------------------+
|summary|                id|  name|               age|          district|
+-------+------------------+------+------------------+------------------+
|  count|               100|   100|                93|                90|
|   mean|              49.5|  NULL| 46.69892473118279|1001.8333333333334|
| stddev|29.011491975882016|  NULL|27.957181913961932| 1.326056924227661|
|    min|                 0|name-0|                 1|              1000|
|    max|                99|name-9|                98|              1004|
+-------+------------------+------+------------------+------------------+

explain:
== Physical Plan ==
*(1) Scan ExistingRDD[id#5172,name#5173,age#5174,district#5175]


group:
+------+---+-----+
|  name|age|count|
+------+---+-----+
|n

In [57]:
from datetime import datetime
dt = datetime.now()
print(dt.strftime("%Y%m"))

spark.sql("""
select to_date('20250101','yyyyMMdd') from_date
""").show()

202504
+----------+
| from_date|
+----------+
|2025-01-01|
+----------+



In [23]:
# show explain plan
def print_dict(d: dict, level: int = 0) -> None:

    sp = "".ljust(level * 3)
    for k in d.keys():
        print(f"{sp}{k}: {d[k]}")
        
        if type(d[k]) is dict:
            print_dict(d[k], level + 1)
            
    if level == 0:
        print()

print("plan with partition filter:")
plan = spark.sql("""
explain
select *
FROM t_my_date_partitioned t
WHERE 1 = 1
AND t.to_date_year_month >= to_number('202710','000000')
""")

for row in plan.collect():
    None
    print_dict(row.asDict())

#####################################################################
print("plan with partition filter and extra filter DOES NOT WORK!!!:")
plan = spark.sql("""
explain
select t.id, t.age
FROM t_my_date_partitioned t
WHERE 1 = 1
AND to_date_year_month >= 202710
AND t.from_date < to_date('202710','yyyyMMdd')
AND t.to_date > to_date('202710','yyyyMMdd')
""")

for row in plan.collect():
    print_dict(row.asDict())

plan with partition filter:
plan: == Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet spark_catalog.mytestdb.t_my_date_partitioned[id#212,name#213,age#214,district#215,from_date#216,to_date#217,to_date_year_month#218] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(5 paths)[file:/home/jovyan/work/various_tests/spark/spark-warehouse/mytestdb.db..., PartitionFilters: [isnotnull(to_date_year_month#218), (to_date_year_month#218 >= 202710)], PushedFilters: [], ReadSchema: struct<id:int,name:string,age:int,district:int,from_date:date,to_date:date>



plan with partition filter and extra filter DOES NOT WORK!!!:
plan: == Physical Plan ==
LocalTableScan <empty>, [id#212, age#214]





In [3]:
# partitioning (hash, range, round-robin) test
# CANNOT FIND A WAY to PARTITION a date (to_date) into date ranges!!!

from datetime import datetime, timedelta
import random
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType
import shutil

# creating large DF:
print("Creating list: ")
ct = datetime.now()
l = []
rows = 100
for c in range(rows):
    from_date = datetime.now() + timedelta(days = random.randint(0, 1000))
    to_date = from_date + timedelta(days = random.randint(1, 100))
    l.append([c, 
              ('name-' + str(c)),  
              None if random.random() < 0.1 else random.randint(1, 100),  
              None if random.random() < 0.1 else random.randint(1000, 1004),
              from_date,
              to_date,
              int(to_date.strftime("%Y%m")),
             ])

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("district", IntegerType(), True),
    StructField("from_date", DateType(), True),
    StructField("to_date", DateType(), True),
    # tech columns
    StructField("to_date_year_month", IntegerType(), True),
])
print(f"Done in {datetime.now() - ct}")

print("creating DF from list: ")
ct = datetime.now()
df = spark.createDataFrame(l, df_schema)
df.printSchema()
df.show(5)
print(f"Done in {datetime.now() - ct}")

spark.sql("create database if not exists mytestdb")
spark.sql("Show databases").show()
spark.sql("use mytestdb")
spark.sql("select current_database()").show()
spark.sql("Show tables").show()
spark.sql("Show views").show()

# Drop table
print("Dropping table/view...")
spark.sql('drop table if exists t_my_date_partitioned')
spark.sql('drop view if exists v_my_date_partitioned_simple')
spark.sql('drop view if exists v_my_date_partitioned')

# save as managed 
print("Saving table...")
(
    df.write
    .option("header", True)
    .mode("overwrite")
    .partitionBy("to_date_year_month") 
    .format("parquet") # parquet, csv for testing / readability
    .saveAsTable("t_my_date_partitioned")
)

# Querying table
print("Querying the table...")
spark.sql("""
select from_date, to_date, count(1) cnt
from t_my_date_partitioned
group by from_date, to_date
order by from_date, to_date nulls first
""").show(5)

# create simple view WITHOUT constraint on partition column
print("Creating simple view...")
spark.sql("""
CREATE OR REPLACE VIEW v_my_date_partitioned_simple AS
SELECT id, name, age, district, 
        from_date, to_date,
        to_date_year_month
FROM t_my_date_partitioned t
""")

# create view WITH constraint on partition column
print("Creating simple view...")
spark.sql("""
CREATE OR REPLACE VIEW v_my_date_partitioned AS
SELECT id, name, age, district, 
        from_date, to_date,
        to_date_year_month
FROM t_my_date_partitioned t
WHERE 1 = 1
AND t.to_date_year_month = to_number(date_format(t.to_date, 'yyyyMM'), '000000')
""")

# inserting into view IS NOT SUPPORTED!!!...


# querying the view without specifying the technical column. partition key is NOT really getting used based on the explain plan above
print("Querying the view...")
dt_str = '20250601'
print(f"Date to query: {dt_str}") # create string from date

spark.sql(f"""
select *
FROM v_my_date_partitioned v
WHERE 1 = 1
AND v.from_date <= to_date({dt_str},'yyyyMMdd')
AND v.to_date > to_date({dt_str},'yyyyMMdd')
ORDER BY from_date, to_date DESC NULLS FIRST
""").show(truncate=False)


# Drop table
print("Dropping table/view...")
#spark.sql('drop table if exists t_my_date_partitioned')
#spark.sql('drop view if exists v_my_date_partitioned_simple')
#spark.sql('drop view if exists v_my_date_partitioned')

spark.sql("Show tables").show()
spark.sql("Show views").show()
print("Done.")

Creating list: 
Done in 0:00:00.000978
creating DF from list: 
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)
 |-- from_date: date (nullable = true)
 |-- to_date: date (nullable = true)
 |-- to_date_year_month: integer (nullable = true)



                                                                                

+---+------+----+--------+----------+----------+------------------+
| id|  name| age|district| from_date|   to_date|to_date_year_month|
+---+------+----+--------+----------+----------+------------------+
|  0|name-0|  72|    1001|2026-06-29|2026-08-15|            202608|
|  1|name-1|  77|    1002|2025-06-24|2025-09-15|            202509|
|  2|name-2|  90|    1003|2027-05-18|2027-06-27|            202706|
|  3|name-3|NULL|    1004|2027-02-17|2027-05-01|            202705|
|  4|name-4|  66|    1000|2025-12-03|2025-12-21|            202512|
+---+------+----+--------+----------+----------+------------------+
only showing top 5 rows

Done in 0:00:01.187049
+---------+
|namespace|
+---------+
|  default|
| mytestdb|
+---------+

+------------------+
|current_database()|
+------------------+
|          mytestdb|
+------------------+

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
| mytestdb|t_my_date_parti

25/04/05 19:11:14 WARN DropTableCommand: org.apache.spark.sql.AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `t_my_date_partitioned` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 4 pos 5
org.apache.spark.sql.AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `t_my_date_partitioned` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 4 pos 5
	at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.tableNotFound(package.scala:87)
	at org.apache.spa

Saving table...


25/04/05 19:11:16 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/04/05 19:11:17 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/04/05 19:11:17 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/04/05 19:11:17 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


Querying the table...
+----------+----------+---+
| from_date|   to_date|cnt|
+----------+----------+---+
|2025-04-14|2025-05-21|  1|
|2025-04-30|2025-07-23|  1|
|2025-05-13|2025-08-12|  1|
|2025-05-16|2025-06-07|  1|
|2025-05-17|2025-07-07|  1|
+----------+----------+---+
only showing top 5 rows

Creating simple view...
Creating simple view...
Querying the view...
Date to query: 20250601
+---+-------+---+--------+----------+----------+------------------+
|id |name   |age|district|from_date |to_date   |to_date_year_month|
+---+-------+---+--------+----------+----------+------------------+
|72 |name-72|79 |1004    |2025-04-30|2025-07-23|202507            |
|59 |name-59|34 |1002    |2025-05-13|2025-08-12|202508            |
|7  |name-7 |26 |1003    |2025-05-16|2025-06-07|202506            |
|69 |name-69|65 |1001    |2025-05-17|2025-07-07|202507            |
+---+-------+---+--------+----------+----------+------------------+

Dropping table/view...
+---------+--------------------+--------