In [0]:
import getpass

# Current OS user name; every output path below is built as /user/<username>/...
username = getpass.getuser()

# Source dataset. No reader options are passed, so columns get auto-generated
# names (a later cell refers to one of them as `_c3`).
orders = spark.read.csv('/public/retail_db/orders')

In [0]:
# WRITING DATA - quick reference (each topic is demonstrated in its own cell below)
#
# Every statement in this cell writes to the same directory, so every call sets
# the save mode to 'overwrite'. Without it the writer falls back to its default
# mode, which raises when the target directory already exists - i.e. on the
# second write of this cell, and on the first one whenever the cell is re-run.

# Basics
orders.write.csv(f'/user/{username}/retail_db/orders', header=True, mode='overwrite')
orders.write.format('json').mode('overwrite').save(f'/user/{username}/retail_db/orders')

# Compression
orders.write.csv(f'/user/{username}/retail_db/orders', compression='gzip', mode='overwrite')

# Mode
#   overwrite - delete the directory and create a new one
#   append    - add new files to the directory
#   ignore    - do nothing if the directory exists
#   error     - throw an error if the directory exists
orders.write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

# Coalesce - can only reduce the number of partitions
orders.coalesce(1).write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

# Repartition - fewer or more partitions, with a reshuffle of the data
orders.repartition(10).write.csv(f'/user/{username}/retail_db/orders', mode='overwrite')

In [0]:
# Deleting directory
# recurse=True is passed so the directory is removed together with its contents;
# the next cell then demonstrates a write that creates the directory from scratch.
dbutils.fs.rm(f'/user/{username}/retail_db/orders', recurse=True)

True

In [0]:
# Basic write (it will create a directory).
# This first call uses the default save mode, so it relies on the directory
# having been removed by the previous cell.
orders.write.csv(f'/user/{username}/retail_db/orders')

# Equivalent generic form: format(...) + save(...).
# The directory exists now (the line above just created it) and the default
# save mode raises on an existing path, so this second write must overwrite.
orders.write.format('csv').mode('overwrite').save(f'/user/{username}/retail_db/orders')

In [0]:
# The default number of files in the directory is determined implicitly, based on several factors.
# Besides the part-* data file(s), the listing also contains marker files
# (_SUCCESS, _started_*, _committed_*).
dbutils.fs.ls(f'/user/{username}/retail_db/orders')

[FileInfo(path='dbfs:/user/root/retail_db/orders/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744315383000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_3139015035484414749', name='_committed_3139015035484414749', size=111, modificationTime=1744315383000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_started_3139015035484414749', name='_started_3139015035484414749', size=0, modificationTime=1744315382000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/part-00000-tid-3139015035484414749-3dfeea15-dd45-4fe4-90c7-a2987f623118-6-1-c000.csv', name='part-00000-tid-3139015035484414749-3dfeea15-dd45-4fe4-90c7-a2987f623118-6-1-c000.csv', size=2999944, modificationTime=1744315383000)]

In [0]:
# Additional options can be passed just like in the reading methods:
# either as keyword arguments or through .option(...).
#
# Both writes target a directory that already exists, and the default save mode
# raises in that case, so each call overwrites explicitly.

orders.write.csv(f'/user/{username}/retail_db/orders', header=True, mode='overwrite')
orders.write.option('header', True).mode('overwrite').csv(f'/user/{username}/retail_db/orders')
# etc...

In [0]:
# One of the special arguments controls compression.
# Compressing reduces the file size: 2999944 -> 471106 bytes (plain CSV vs gzip part file)
# Different compression algorithms are available
# Reading methods automatically detect the compression type and decompress
#
# mode='overwrite' is needed because the target directory already exists from
# the earlier writes; with the default save mode this write would raise.

orders.write.csv(f'/user/{username}/retail_db/orders', compression='gzip', mode='overwrite')
dbutils.fs.ls(f'/user/{username}/retail_db/orders')

[FileInfo(path='dbfs:/user/root/retail_db/orders/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744352421000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_5200104112532682271', name='_committed_5200104112532682271', size=114, modificationTime=1744352421000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_started_5200104112532682271', name='_started_5200104112532682271', size=0, modificationTime=1744352421000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/part-00000-tid-5200104112532682271-6cedeeb5-265d-423c-a3e6-4992bc58fe17-2-1-c000.csv.gz', name='part-00000-tid-5200104112532682271-6cedeeb5-265d-423c-a3e6-4992bc58fe17-2-1-c000.csv.gz', size=471106, modificationTime=1744352421000)]

In [0]:
# The save mode decides what happens when the target directory already exists.
# There are 4 of them:
#   overwrite - delete the directory and create a new one
#   append    - add new files to the directory
#   ignore    - do nothing if the directory exists
#   error     - throw an error if the directory exists

orders.write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')

In [0]:
# JSON and PARQUET
# Same writer API, different format method. Both calls overwrite the same
# directory, so the Parquet output replaces the JSON output written just before.

orders.write.mode('overwrite').json(f'/user/{username}/retail_db/orders')
orders.write.mode('overwrite').parquet(f'/user/{username}/retail_db/orders')

In [0]:
# Coalesce and repartition
#
#   coalesce(n)    - reduces the number of partitions
#   repartition(n) - reshuffles the data into more or fewer partitions

orders.coalesce(1).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')
orders.repartition(10).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')

# After the repartition(10) write the directory holds 10 files
dbutils.fs.ls(f'/user/{username}/retail_db/orders')


[FileInfo(path='dbfs:/user/root/retail_db/orders/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744353292000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_5200104112532682271', name='_committed_5200104112532682271', size=114, modificationTime=1744352421000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_6070535406786868790', name='_committed_6070535406786868790', size=209, modificationTime=1744352808000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_7105511656959692079', name='_committed_7105511656959692079', size=212, modificationTime=1744352800000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_8352992320644514807', name='_committed_8352992320644514807', size=997, modificationTime=1744353292000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_started_5200104112532682271', name='_started_5200104112532682271', size=0, modificationTime=1744352421000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_started_6070535406

In [0]:
# repartition also accepts the column(s) to partition by (here `_c3`).
# The resulting number of files can be lower than the number requested when the
# column has fewer unique values than that.

orders.coalesce(1).write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')
orders.repartition(10, '_c3').write.mode('overwrite').csv(f'/user/{username}/retail_db/orders')
dbutils.fs.ls(f'/user/{username}/retail_db/orders')

[FileInfo(path='dbfs:/user/root/retail_db/orders/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744353433000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_1529292230843254032', name='_committed_1529292230843254032', size=991, modificationTime=1744353431000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_2935834174864987587', name='_committed_2935834174864987587', size=639, modificationTime=1744353433000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_3082729863106594042', name='_committed_3082729863106594042', size=987, modificationTime=1744353404000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_3572946910162240780', name='_committed_3572946910162240780', size=991, modificationTime=1744353406000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_4402933881309942011', name='_committed_4402933881309942011', size=991, modificationTime=1744353423000),
 FileInfo(path='dbfs:/user/root/retail_db/orders/_committed_49