In [0]:
import getpass

# Name of the current OS-level user; used below to build per-user output paths
username = getpass.getuser()

# Load the orders dataset as CSV with no reader options; later cells refer to
# columns by positional names such as `_c3`
orders = spark.read.csv('/public/retail_db/orders')

In [0]:
# WRITING DATA (summary of everything shown in the cells below)
#
# Every write in this cell targets the same directory. The default save mode
# ('error') raises when the target already exists, so each call sets
# mode='overwrite' explicitly to keep the cell runnable from top to bottom
# and re-runnable.

# Basics
orders.write.csv(f'/user/{username}/retail_db/orders', header=True, mode='overwrite')
orders.write.format('json').mode('overwrite').save(f'/user/{username}/retail_db/orders')

# Compression
orders.write.csv(f'/user/{username}/retail_db/orders', compression='gzip', mode='overwrite')

# Mode
    # overwrite - delete the directory and create a new one
    # append - add new files to the directory
    # ignore - do nothing if the directory exists
    # error - throw an error if the directory exists (this is the default)
orders.write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

# Coalesce - can only reduce the number of partitions
orders.coalesce(1).write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

# Repartition - fewer or more partitions, with reshuffling
orders.repartition(10).write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

In [0]:
# Remove the output directory together with everything inside it
dbutils.fs.rm(
    f'/user/{username}/retail_db/orders',
    recurse=True,
)

In [0]:
# Basic write (it will create a directory).
# No mode is given, so the default applies and the target must not exist yet;
# the previous cell deletes it.
orders.write.csv(f'/user/{username}/retail_db/orders')

# Equivalent generic form: format(...) + save(...).
# The directory now exists because of the write above, so this second call
# needs mode('overwrite'); with the default mode it would raise an error.
orders.write.format('csv').mode('overwrite').save(f'/user/{username}/retail_db/orders')

In [0]:
# The number of files in the directory was not set explicitly; by default it is
# determined implicitly based on several factors
dbutils.fs.ls(
    f'/user/{username}/retail_db/orders'
)

In [0]:
# Additional options can be passed, just like in the reading methods.
# The target directory already exists at this point, so both calls set the
# 'overwrite' mode; with the default mode they would raise an error.

# Option passed as a keyword argument
orders.write.csv(f'/user/{username}/retail_db/orders', header=True, mode='overwrite')

# Same option passed through the builder-style .option(...) call
orders.write.mode('overwrite').option('header', True).csv(f'/user/{username}/retail_db/orders')
# etc...

In [0]:
# One of the special arguments controls compression
# Compressing reduces file size: 2999944 -> 471106
# Different compression algorithms are available
# Reading methods automatically detect the compression type and decompress

# The target directory already exists, so 'overwrite' is required here;
# with the default mode this write would raise an error.
orders.write.csv(f'/user/{username}/retail_db/orders', compression='gzip', mode='overwrite')
dbutils.fs.ls(f'/user/{username}/retail_db/orders')

In [0]:
# There are 4 writing modes:
#   overwrite - delete the directory and create a new one
#   append    - add new files to the directory
#   ignore    - do nothing if the directory exists
#   error     - throw an error if the directory exists

# Builder-style way of selecting the mode before writing
orders.write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')

In [0]:
# JSON and PARQUET
# Both writes go to the same directory in 'overwrite' mode, so the Parquet
# output replaces the JSON output.

orders.write.mode('overwrite').json(f'/user/{username}/retail_db/orders')
orders.write.mode('overwrite').parquet(f'/user/{username}/retail_db/orders')

In [0]:
# Coalesce and repartition
#
# coalesce    - used to reduce the number of partitions
# repartition - reshuffles the data into more or fewer partitions

# Write from a single partition
orders.coalesce(1).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')

# Write again from 10 partitions, replacing the previous output
orders.repartition(10).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')

# Now we have 10 files
dbutils.fs.ls(
    f'/user/{username}/retail_db/orders'
)


In [0]:
# .repartition also accepts the column(s) to partition by.
# The resulting number of files can be smaller than the number requested when
# the column has fewer unique values than that number.

orders.coalesce(1).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')
orders.repartition(10, '_c3').write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')
dbutils.fs.ls(
    f'/user/{username}/retail_db/orders'
)