<a href="https://colab.research.google.com/github/DenysNunes/data-examples/blob/main/spark/basic/read_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Init spark**

In [1]:
!pip install -q pyspark==3.1.1
!sudo apt install tree
!rm -rf /tmp/read-save-example/

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
# Kryo is set as the serializer, and the Hudi bundle plus spark-avro are passed
# as Maven coordinates through spark.jars.packages.
# NOTE(review): spark-avro is pinned to 3.0.1 while PySpark 3.1.1 is installed
# above — confirm this version mix is intended.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("New Session Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars.packages", "org.apache.hudi:hudi-spark3-bundle_2.12:0.9.0,org.apache.spark:spark-avro_2.12:3.0.1")
    .enableHiveSupport()
    .getOrCreate()
)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tree is already the newest version (1.7.0-5).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [2]:
from pyspark.sql.types import Row
from datetime import datetime
import random

def get_random():
    """Return a random salary: a multiple of 100 in the range [2000, 5000)."""
    lowest, upper_bound, increment = 2000, 5000, 100
    return random.randrange(lowest, upper_bound, increment)

# Seed the generator so every run produces the same salaries; without it the
# sample data changes on each execution.
random.seed(42)

# Eight sample employees, two per day from 2020-01-01 to 2020-01-04.
raw_rows = [
    Row(id=1, name='John', salary=get_random(), date=datetime(2020, 1, 1)),
    Row(id=2, name='Joana', salary=get_random(), date=datetime(2020, 1, 1)),
    Row(id=3, name='Maria', salary=get_random(), date=datetime(2020, 1, 2)),
    Row(id=4, name='Sandra', salary=get_random(), date=datetime(2020, 1, 2)),
    Row(id=5, name='Ben', salary=get_random(), date=datetime(2020, 1, 3)),
    Row(id=6, name='Carl', salary=get_random(), date=datetime(2020, 1, 3)),
    Row(id=7, name='Joseph', salary=get_random(), date=datetime(2020, 1, 4)),
    Row(id=8, name='Oliver', salary=get_random(), date=datetime(2020, 1, 4)),
]

# No explicit schema is given; the columns come from the Row fields.
df = spark.createDataFrame(raw_rows)

df.show()

+---+------+------+-------------------+
| id|  name|salary|               date|
+---+------+------+-------------------+
|  1|  John|  2800|2020-01-01 00:00:00|
|  2| Joana|  4700|2020-01-01 00:00:00|
|  3| Maria|  2200|2020-01-02 00:00:00|
|  4|Sandra|  2600|2020-01-02 00:00:00|
|  5|   Ben|  3400|2020-01-03 00:00:00|
|  6|  Carl|  2400|2020-01-03 00:00:00|
|  7|Joseph|  3600|2020-01-04 00:00:00|
|  8|Oliver|  4200|2020-01-04 00:00:00|
+---+------+------+-------------------+



# **Saving Mode 1**

In [3]:
# Mode 1: every setting is passed as a keyword argument to save().
# mode='overwrite' lets this cell be re-run on its own; the default save mode
# raises an error when the target path already exists.
df.write.save(
    path='/tmp/read-save-example/df1/',
    format='csv',
    mode='overwrite',
    delimiter=',',
    header=True,
)

In [4]:
# List the files written by the df1 save.
!ls /tmp/read-save-example/df1/

part-00000-bb21edaa-1b88-418f-8b90-8f0e25258696-c000.csv  _SUCCESS
part-00001-bb21edaa-1b88-418f-8b90-8f0e25258696-c000.csv


In [5]:
# Concatenate every CSV part file. Each part file carries its own header row,
# so the header line appears once per part in the combined output.
!cat /tmp/read-save-example/df1/*.csv

id,name,salary,date
1,John,2800,2020-01-01T00:00:00.000Z
2,Joana,4700,2020-01-01T00:00:00.000Z
3,Maria,2200,2020-01-02T00:00:00.000Z
4,Sandra,2600,2020-01-02T00:00:00.000Z
id,name,salary,date
5,Ben,3400,2020-01-03T00:00:00.000Z
6,Carl,2400,2020-01-03T00:00:00.000Z
7,Joseph,3600,2020-01-04T00:00:00.000Z
8,Oliver,4200,2020-01-04T00:00:00.000Z


# **Saving Mode 2**

In [6]:
# Mode 2: settings are chained on the writer and the format comes from the
# csv() method itself.
# mode('overwrite') lets this cell be re-run on its own; the default save mode
# raises an error when the target path already exists.
(
    df.write
    .mode('overwrite')
    .option('delimiter', ',')
    .option('header', True)
    .csv(path='/tmp/read-save-example/df2/')
)

In [7]:
# Concatenate every CSV part file written by the df2 save.
!cat /tmp/read-save-example/df2/*.csv

id,name,salary,date
1,John,2800,2020-01-01T00:00:00.000Z
2,Joana,4700,2020-01-01T00:00:00.000Z
3,Maria,2200,2020-01-02T00:00:00.000Z
4,Sandra,2600,2020-01-02T00:00:00.000Z
id,name,salary,date
5,Ben,3400,2020-01-03T00:00:00.000Z
6,Carl,2400,2020-01-03T00:00:00.000Z
7,Joseph,3600,2020-01-04T00:00:00.000Z
8,Oliver,4200,2020-01-04T00:00:00.000Z


# **Saving with different formats**

In [8]:
# Save modes:
#   overwrite -> replace any data already at the path with this dataframe
#   append    -> add this dataframe's rows to the data already at the path

# Parquet output
df.write.format('parquet').mode('overwrite').save('/tmp/read-save-example/otherformat/parquet/')

# ORC output
df.write.format('orc').mode('overwrite').save('/tmp/read-save-example/otherformat/orc/')

In [9]:
# Show the directory layout produced by the parquet and orc saves.
!tree /tmp/read-save-example/otherformat/

/tmp/read-save-example/otherformat/
├── orc
│   ├── part-00000-66670dca-5377-46ed-9e13-3ed4c78d8531-c000.snappy.orc
│   ├── part-00001-66670dca-5377-46ed-9e13-3ed4c78d8531-c000.snappy.orc
│   └── _SUCCESS
└── parquet
    ├── part-00000-13cefc25-f44e-47da-a613-44bb10ebf3dd-c000.snappy.parquet
    ├── part-00001-13cefc25-f44e-47da-a613-44bb10ebf3dd-c000.snappy.parquet
    └── _SUCCESS

2 directories, 6 files


# **Saving with an external format**

**Note that the Hudi jar was already added to the Spark session when it was created!**

More information about Hudi is available [here](https://hudi.apache.org/).

In [10]:
# Hudi writer settings, handed to the writer as datasource options.
hudi_options = {
    # Table identity
    'hoodie.table.name': 'tb_hudi_salaries',
    'hoodie.datasource.write.table.name': 'tb_hudi_salaries',
    # Record key and partition column
    'hoodie.datasource.write.recordkey.field': 'id',
    'hoodie.datasource.write.partitionpath.field': 'date',
    # NOTE(review): df has no 'ts' column (its columns are id, name, salary,
    # date) — confirm which field should be the precombine field.
    'hoodie.datasource.write.precombine.field': 'ts',
    # Write operation and shuffle parallelism for inserts
    'hoodie.datasource.write.operation': 'insert',
    'hoodie.insert.shuffle.parallelism': 2,
}

# Write the dataframe in Hudi format, replacing any previous table contents.
(
    df.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save('/tmp/read-save-example/otherformat/hudi/')
)


In [11]:
# Show the directory layout produced by the Hudi write.
!tree /tmp/read-save-example/otherformat/hudi/

/tmp/read-save-example/otherformat/hudi/
├── 1577836800000000
│   └── a02820a1-7c4d-4fbd-9c86-88c18fd590a7-0_3-12-23_20211112232458.parquet
├── 1577923200000000
│   └── da0c34e1-bdde-48a3-a7ab-b8f8c0341740-0_1-12-21_20211112232458.parquet
├── 1578009600000000
│   └── 73c1b5ec-954b-48da-a970-8424c20d4257-0_2-12-22_20211112232458.parquet
└── 1578096000000000
    └── b56277cd-0d8a-41b7-8bf7-3cade009ad34-0_0-10-20_20211112232458.parquet

4 directories, 4 files


# **Reading mode 1 - All params as arguments**

In [12]:
# Generic loader: format and path are both given as keyword arguments.
df_load = spark.read.load(
    path='/tmp/read-save-example/otherformat/parquet/',
    format='parquet',
)
df_load.show()

+---+------+------+-------------------+
| id|  name|salary|               date|
+---+------+------+-------------------+
|  1|  John|  2800|2020-01-01 00:00:00|
|  2| Joana|  4700|2020-01-01 00:00:00|
|  3| Maria|  2200|2020-01-02 00:00:00|
|  4|Sandra|  2600|2020-01-02 00:00:00|
|  5|   Ben|  3400|2020-01-03 00:00:00|
|  6|  Carl|  2400|2020-01-03 00:00:00|
|  7|Joseph|  3600|2020-01-04 00:00:00|
|  8|Oliver|  4200|2020-01-04 00:00:00|
+---+------+------+-------------------+



# **Reading mode 2 - Implicit format method**

In [13]:
# Format-specific reader: parquet() names the format, so only the path is passed.
df_load2 = spark.read.parquet(
    '/tmp/read-save-example/otherformat/parquet/'
)
df_load2.show()

+---+------+------+-------------------+
| id|  name|salary|               date|
+---+------+------+-------------------+
|  1|  John|  2800|2020-01-01 00:00:00|
|  2| Joana|  4700|2020-01-01 00:00:00|
|  3| Maria|  2200|2020-01-02 00:00:00|
|  4|Sandra|  2600|2020-01-02 00:00:00|
|  5|   Ben|  3400|2020-01-03 00:00:00|
|  6|  Carl|  2400|2020-01-03 00:00:00|
|  7|Joseph|  3600|2020-01-04 00:00:00|
|  8|Oliver|  4200|2020-01-04 00:00:00|
+---+------+------+-------------------+



# **Reading mode 3 - Using wildcards**

In [18]:
# The glob pattern selects the part files ending in .parquet inside the directory.
df_load3 = spark.read.parquet(
    '/tmp/read-save-example/otherformat/parquet/part*.parquet'
)
df_load3.show()

+---+------+------+-------------------+
| id|  name|salary|               date|
+---+------+------+-------------------+
|  1|  John|  2800|2020-01-01 00:00:00|
|  2| Joana|  4700|2020-01-01 00:00:00|
|  3| Maria|  2200|2020-01-02 00:00:00|
|  4|Sandra|  2600|2020-01-02 00:00:00|
|  5|   Ben|  3400|2020-01-03 00:00:00|
|  6|  Carl|  2400|2020-01-03 00:00:00|
|  7|Joseph|  3600|2020-01-04 00:00:00|
|  8|Oliver|  4200|2020-01-04 00:00:00|
+---+------+------+-------------------+



# **Reading mode 4 - Spark SQL direct path**

In [17]:
# Query the directory straight from SQL with the parquet.`<path>` form; this
# notebook registers no table or view for it beforehand.
df_load4 = spark.sql(
    'select * from parquet.`/tmp/read-save-example/otherformat/parquet/`'
)
df_load4.show()

+---+------+------+-------------------+
| id|  name|salary|               date|
+---+------+------+-------------------+
|  1|  John|  2800|2020-01-01 00:00:00|
|  2| Joana|  4700|2020-01-01 00:00:00|
|  3| Maria|  2200|2020-01-02 00:00:00|
|  4|Sandra|  2600|2020-01-02 00:00:00|
|  5|   Ben|  3400|2020-01-03 00:00:00|
|  6|  Carl|  2400|2020-01-03 00:00:00|
|  7|Joseph|  3600|2020-01-04 00:00:00|
|  8|Oliver|  4200|2020-01-04 00:00:00|
+---+------+------+-------------------+

