In [1]:
# Create SparkSession v1
from pyspark.sql import SparkSession
spark = (SparkSession.builder
            #.master("local[1]")  # local- no parallelizm at all, local[2] - 2 cores, local[*] - as many cores as local logical cores
            .appName('Spark Example App #1')
            .enableHiveSupport()  # if not enabled, tables are not persistent throughout the sessions...!!
            .getOrCreate())

print(spark.sparkContext)
print("Spark App Name : "+ spark.sparkContext.appName)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/29 11:35:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<SparkContext master=local[*] appName=Spark Example App #1>
Spark App Name : Spark Example App #1


In [6]:
# Create RDD (Resilient Distributed Dataset)
rdd = spark.sparkContext.range(0, 10)
print(f"Number of partitions: {rdd.getNumPartitions()}")
print(rdd.collect())

Number of partitions: 4
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [4]:
# stop SparkContext (there can be only one context, otherwise get error: 
# ValueError: Cannot run multiple SparkContexts at once)
spark.stop()

In [2]:
# Create or get Spark Context v2. if previous not stopped, it will return that. 
# if stopped, will create a new context
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.setMaster("local").setAppName("Spark Example App #2")
sc = SparkContext.getOrCreate(conf)
print(sc.appName)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/14 10:42:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/14 10:42:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Example App #2


In [6]:
# stop the context
spark.stop()

In [7]:
# Create SparkContext v3
from pyspark import SparkContext
sc = SparkContext("local", "Spark Example App #3")
print(sc.appName)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Spark Example App #2, master=local) created by getOrCreate at /tmp/ipykernel_6213/3106436231.py:6 

In [12]:
# Create RDD (Resilient Distributed Dataset)
#rdd = spark.sparkContext.range(1, 5)
rdd = sc.range(1, 5)
print(rdd.collect())

[1, 2, 3, 4]


In [11]:
# load some data
sc = spark.sparkContext
print('Application name: ', sc.appName)
lines = sc.textFile('testfile.txt')
parts = lines.map(lambda l: l.split(','))
people = parts.map(lambda p: Row(name=p[0],age=int(p[1].strip())))
people_df = spark.createDataFrame(people)
people_df.show()

Application name:  Spark Example App #2
+-------+---+
|   name|age|
+-------+---+
|Norbert| 15|
|   John| 30|
+-------+---+



In [16]:
# https://spark.apache.org/docs/latest/sql-data-sources-csv.html
# load some data without schema
df = (spark.read
    .option("header","true")
    .option("delimiter",",")
    .option("inferSchema", "true")
    .option("dateFormat", "yyyy-MM-dd")
    .option("timestampFormat","yyyy-MM-dd HH:mm:ss")
    .csv('csv_operations_source.csv')   
)
df.show()
df.printSchema()

# load some data with schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType, BooleanType
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("city", StringType(), True),
    StructField("date", DateType(), True),
    StructField("datetime", TimestampType(), True),
])

df = (spark.read
    .option("header","true")
    .option("delimiter",",")
    .option("ignoreLeadingWhiteSpace", "true") \
    .option("dateFormat", "yyyy-MM-dd")
    .option("timestampFormat","yyyy-MM-dd HH:mm:ss")
    .schema(schema)
    .csv('csv_operations_source.csv')   
)
df.show()
df.printSchema()


# query the data
#df.select('id','city').show()
df.select(df.id, (df.id + 10).alias('id_10')).show()

+---+------+----------+--------------------+
| id|  city|      date|            datetime|
+---+------+----------+--------------------+
|  1|city-1|2025-01-01| 2026-01-01 17:01:59|
|  2|city-2|2025-01-02| 2026-01-01 17:02:59|
|  3|city-3|2025-01-03| 2026-01-01 17:03:59|
+---+------+----------+--------------------+

root
 |-- id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- date: date (nullable = true)
 |-- datetime: string (nullable = true)

+---+------+----------+-------------------+
| id|  city|      date|           datetime|
+---+------+----------+-------------------+
|  1|city-1|2025-01-01|2026-01-01 17:01:59|
|  2|city-2|2025-01-02|2026-01-01 17:02:59|
|  3|city-3|2025-01-03|2026-01-01 17:03:59|
+---+------+----------+-------------------+

root
 |-- id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- date: date (nullable = true)
 |-- datetime: timestamp (nullable = true)

+---+-----+
| id|id_10|
+---+-----+
|  1|   11|
|  2|   12|
|  3|   13

In [94]:
arrayData = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})]
    
df = spark.createDataFrame(data=arrayData, schema = ['name','knownLanguages','properties'])
df.printSchema()
df.show()

# explode the knownLanguages (array) and properties (map) columns
# will only create a record if all columns exist, hence "inner join"
from pyspark.sql.functions import explode
df2 = df.select(df.name, explode(df.knownLanguages), explode(df.properties))
df2.printSchema()
df2.show()

# explode outer will create an "outer join", so a record for any data available
from pyspark.sql.functions import explode_outer
df3 = df.select(df.name, explode_outer(df.knownLanguages), explode_outer(df.properties))
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+--------------------+
|      name|     knownLanguages|          properties|
+----------+-------------------+--------------------+
|     James|      [Java, Scala]|{eye -> brown, ha...|
|   Michael|[Spark, Java, NULL]|{eye -> NULL, hai...|
|    Robert|         [CSharp, ]|{eye -> , hair ->...|
|Washington|               NULL|                NULL|
| Jefferson|             [1, 2]|                  {}|
+----------+-------------------+--------------------+

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+-------+------+----+-----+
|   name|   col| key|value|
+-------+------+----+-----+
|  James|  Java| eye|brown|
|  J

In [6]:
# outer join 2 tables, resolving ambiguous colmuns
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType
from pyspark.sql import functions as F

df_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])

lst1 = [
        (1, 'Aname1'),
        (2, 'Aname2'),
        (3, 'Aname3'),
        ]
df1 = spark.createDataFrame(data=lst1, schema = df_schema)

lst2 = [
        (1, 'Bname1'),
        (2, 'Bname2'),
        ]
df2 = spark.createDataFrame(data=lst2, schema = df_schema)

lst3 = [
        (2, 'Cname2'),
        (3, 'Cname3'),
        ]
df3 = spark.createDataFrame(data=lst3, schema = df_schema)


# join key is id, only id will remain in the result DF. The rest of the columns need to be unique otherwise DF will be ambiguous
df_r1 = df1.alias("df1").join(df2.alias("df2"), F.col("df1.id") == F.col('df2.id'), "left").select(df1.id, df1.name, df2.name.alias('name2'))
df_r1.show()

# 3 tables (df1-+df2, df1-+df3), as the df1.id is projected after the first join and not both df1.id, df2.id(!), the 3rd table is joined to df1!
df_r1 = (
     df1.alias("df1")
    .join(df2.alias("df2"), "id", "left") # df1.id(!) by default!, df1.name, df2.name
    .join(df3.alias("df3"), "id", "left") # df1(!) join to df3
).select(df1.id.alias('id1'), df2.id.alias('id2'), df3.id.alias('id3'), df1.name.alias('name1'), df2.name.alias('name2'), df3.name.alias('name3'))
df_r1.show()

# if we want to join df1-+df2-+df3
df_r1 = (
     df1.alias("df1")
    .join(
        df2.alias("df2").join(df3.alias("df3"), "id", "left"), 
        "id", "left") # 
).select(df1.id.alias('id1'), df2.id.alias('id2'), df3.id.alias('id3'), df1.name.alias('name1'), df2.name.alias('name2'), df3.name.alias('name3'))
df_r1.show()

# otherwise there is a problem:
df_r1 = df1.join(df2, "id", "inner")
df_r1.show() # OK
#df_r1.describe() # FAIL!

# to resolve even after the join, result schema metadata of the join sources are kept under the hood for data lineage (can be inspected from df.explain())
# if the join sources are not yet deleted(!)
df_r2=df1.join(df2,"id", "inner")
df_r2 = df_r2.select(df1.id, df1.name, df2.name.alias('name2'))
df_r2.show()

# if deleted, but we have alias
df_r3 = df1.alias("df1").join(df2.alias("df2"), "id", "left")
del df1, df2
df_r3 = df_r3.select(F.col("df1.id"), F.col("df1.name").alias("name1"), F.col("df2.name").alias("name2"))
df_r3.show()

print("Done.")

+---+------+------+
| id|  name| name2|
+---+------+------+
|  1|Aname1|Bname1|
|  2|Aname2|Bname2|
|  3|Aname3|  NULL|
+---+------+------+

+---+----+----+------+------+------+
|id1| id2| id3| name1| name2| name3|
+---+----+----+------+------+------+
|  1|   1|NULL|Aname1|Bname1|  NULL|
|  2|   2|   2|Aname2|Bname2|Cname2|
|  3|NULL|   3|Aname3|  NULL|Cname3|
+---+----+----+------+------+------+

+---+----+----+------+------+------+
|id1| id2| id3| name1| name2| name3|
+---+----+----+------+------+------+
|  1|   1|NULL|Aname1|Bname1|  NULL|
|  2|   2|   2|Aname2|Bname2|Cname2|
|  3|NULL|NULL|Aname3|  NULL|  NULL|
+---+----+----+------+------+------+

+---+------+------+
| id|  name|  name|
+---+------+------+
|  1|Aname1|Bname1|
|  2|Aname2|Bname2|
+---+------+------+

+---+------+------+
| id|  name| name2|
+---+------+------+
|  1|Aname1|Bname1|
|  2|Aname2|Bname2|
+---+------+------+

+---+------+------+
| id| name1| name2|
+---+------+------+
|  1|Aname1|Bname1|
|  2|Aname2|Bname