In [3]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType


# Obtain the Spark session used by every example in this notebook
spark = (
    SparkSession.builder
    .appName("SparkReader")
    .getOrCreate()
)

# --- 1. DataFrame from plain tuples plus a list of column names ---
data = [("James", 23), ("Ann", 40)]
columns = ["name", "age"]
df = spark.createDataFrame(data, schema=columns)
df.show()

# --- 2. Same records, this time with an explicit StructType schema ---
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
    ]
)
df = spark.createDataFrame(data, schema=schema)
df.show()

# --- 3. Explicit schema with a nested column: an array of (city, state) structs ---
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField(
            "addresses",
            ArrayType(
                StructType(
                    [
                        StructField("city", StringType(), True),
                        StructField("state", StringType(), True),
                    ]
                )
            ),
            True,
        ),
    ]
)
data = [
    ("James", 23, [("New York", "NY"), ("Los Angeles", "CA")]),
    ("Ann", 40, [("Chicago", "IL")]),
]
df = spark.createDataFrame(data, schema=schema)
df.show(truncate=False)


# --- 4. Nested records expressed as Row objects instead of tuples ---
# Brackets already allow line breaks, so no backslash continuations are needed.
row_list = [
    Row(
        name="James",
        age=23,
        addresses=[
            Row(city="New York", state="NY"),
            Row(city="Los Angeles", state="CA"),
        ],
    ),
    Row(
        name="Ann",
        age=40,
        addresses=[Row(city="Chicago", state="IL")],
    ),
]

# No schema argument is passed here; the Row field names become the column names
df = spark.createDataFrame(row_list)
df.show(truncate=False)

# Show the resulting schema (in the captured output `age` appears as long,
# not the IntegerType declared in the explicit schemas above)
df.printSchema()

+-----+---+
| name|age|
+-----+---+
|James| 23|
|  Ann| 40|
+-----+---+

+-----+---+
| name|age|
+-----+---+
|James| 23|
|  Ann| 40|
+-----+---+

+-----+---+-----------------------------------+
|name |age|addresses                          |
+-----+---+-----------------------------------+
|James|23 |[{New York, NY}, {Los Angeles, CA}]|
|Ann  |40 |[{Chicago, IL}]                    |
+-----+---+-----------------------------------+

+-----+---+-----------------------------------+
|name |age|addresses                          |
+-----+---+-----------------------------------+
|James|23 |[{New York, NY}, {Los Angeles, CA}]|
|Ann  |40 |[{Chicago, IL}]                    |
+-----+---+-----------------------------------+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- addresses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- state: string (nullable = true)



24/09/12 02:18:39 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 258420 ms exceeds timeout 120000 ms
24/09/12 02:18:39 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/12 02:27:32 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

Read and Write Methods

In [None]:
# Read/write examples for the common Spark data sources.
# All paths, host names and table names below are placeholders to be replaced.
# Except for the JDBC write (which appends), every write uses the writer's
# default save mode, so it raises an error if the output path already exists.

# Reading a CSV file
df_csv = spark.read.csv("path/to/csvfile.csv", header=True, inferSchema=True)
df_csv.show()

# Writing a CSV file
df_csv.write.csv("path/to/output/csvfile.csv", header=True)

# Reading a JSON file
df_json = spark.read.json("path/to/jsonfile.json")
df_json.show()

# Writing a JSON file
df_json.write.json("path/to/output/jsonfile.json")

# Reading a Parquet file
df_parquet = spark.read.parquet("path/to/parquetfile.parquet")
df_parquet.show()

# Writing a Parquet file
df_parquet.write.parquet("path/to/output/parquetfile.parquet")

# JDBC connection settings shared by the database read and write below.
# Credentials are taken from the environment instead of being hardcoded in the
# notebook: set DB_USER and DB_PASSWORD before starting the kernel, otherwise
# the lookups below raise KeyError.
# NOTE(review): newer MySQL Connector/J releases name the driver class
# com.mysql.cj.jdbc.Driver - confirm against the installed connector version.
jdbc_connection = {
    "url": "jdbc:mysql://hostname:port/dbname",
    "driver": "com.mysql.jdbc.Driver",
    "user": os.environ["DB_USER"],
    "password": os.environ["DB_PASSWORD"],
}

# Reading from a relational database
df_db = (
    spark.read.format("jdbc")
    .options(dbtable="tablename", **jdbc_connection)
    .load()
)
df_db.show()

# Writing to a relational database (rows are appended to the target table)
(
    df_db.write.format("jdbc")
    .options(dbtable="output_tablename", **jdbc_connection)
    .mode('append')
    .save()
)

# Reading a text file
df_text = spark.read.text("path/to/textfile.txt")
df_text.show()

# Writing a text file
df_text.write.text("path/to/output/textfile.txt")

# Reading an ORC file
df_orc = spark.read.orc("path/to/orcfile.orc")
df_orc.show()

# Writing an ORC file
df_orc.write.orc("path/to/output/orcfile.orc")

# Reading an Avro file (requires the external spark-avro package)
df_avro = spark.read.format("avro").load("path/to/avrofile.avro")
df_avro.show()

# Writing an Avro file (requires the external spark-avro package)
df_avro.write.format("avro").save("path/to/output/avrofile.avro")

# Reading an XML file (requires spark-xml package)
df_xml = spark.read.format("com.databricks.spark.xml").options(rowTag="row").load("path/to/xmlfile.xml")
df_xml.show()

# Writing an XML file (requires spark-xml package)
df_xml.write.format("com.databricks.spark.xml").options(rowTag="row").save("path/to/output/xmlfile.xml")

# Reading a Delta file (requires delta package)
df_delta = spark.read.format("delta").load("path/to/deltafile")
df_delta.show()

# Writing a Delta file (requires delta package)
df_delta.write.format("delta").save("path/to/output/deltafile")