<a href="https://colab.research.google.com/github/DenysNunes/data-examples/blob/main/spark/basic/read_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Init spark**

In [1]:
# Install the pinned PySpark version; %pip targets the environment of the running kernel.
%pip install -q pyspark==3.1.1
# `tree` is used further down to display output directories.
# apt-get is the script-friendly front end and -y skips the confirmation prompt.
!sudo apt-get install -y tree
# Start from an empty output directory so the notebook can be re-run from the top.
!rm -rf /tmp/read-save-example/

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("New Session Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Extra jars resolved at session start: the Hudi Spark 3 bundle and spark-avro.
    # NOTE(review): spark-avro is 3.0.1 while pyspark is pinned to 3.1.1 in this cell —
    # confirm the version mismatch is intentional.
    .config("spark.jars.packages", "org.apache.hudi:hudi-spark3-bundle_2.12:0.9.0,org.apache.spark:spark-avro_2.12:3.0.1")
    .enableHiveSupport()
    .getOrCreate()
)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tree is already the newest version (1.7.0-5).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [3]:
from pyspark.sql.types import Row
from datetime import date
import random

def get_random():
  """Return a random salary: a multiple of 100 with 2000 <= value < 5000."""
  low, high, step = 2000, 5000, 100
  return random.randrange(low, high, step)

# One Row per employee, built in order from (id, name, hire_date) tuples;
# each salary comes from its own get_random() call.
raw_rows = [
    Row(id=emp_id, name=emp_name, salary=get_random(), hire_date=hired_on)
    for emp_id, emp_name, hired_on in [
        (1, 'John', date(2020, 1, 1)),
        (2, 'Joana', date(2020, 1, 1)),
        (3, 'Maria', date(2020, 1, 2)),
        (4, 'Sandra', date(2020, 1, 2)),
        (5, 'Ben', date(2020, 1, 3)),
        (6, 'Carl', date(2020, 1, 3)),
        (7, 'Joseph', date(2020, 1, 4)),
        (8, 'Oliver', date(2020, 1, 4)),
    ]
]

# Turn the rows into a DataFrame and display it.
df = spark.createDataFrame(raw_rows)

df.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+



# **Saving Mode 1**

In [4]:
# Mode 1: the format and the writer options are all passed as arguments to save().
df.write.save(
    '/tmp/read-save-example/df1/',
    format='csv',
    header=True,
    delimiter=',',
)

In [5]:
# List what was written to the df1 output directory.
!ls /tmp/read-save-example/df1/

part-00000-9ecc7a71-cb4c-4a03-ab29-69a691d92b72-c000.csv  _SUCCESS
part-00001-9ecc7a71-cb4c-4a03-ab29-69a691d92b72-c000.csv


In [6]:
# Print every CSV part file back to back; each part file carries its own header row.
!cat /tmp/read-save-example/df1/*.csv

id,name,salary,hire_date
1,John,3100,2020-01-01
2,Joana,2700,2020-01-01
3,Maria,3100,2020-01-02
4,Sandra,4500,2020-01-02
id,name,salary,hire_date
5,Ben,3100,2020-01-03
6,Carl,4300,2020-01-03
7,Joseph,4200,2020-01-04
8,Oliver,3800,2020-01-04


# **Saving Mode 2**

In [7]:
# Mode 2: options are set one at a time, then the format-specific csv() writer is called.
(
    df.write
    .option('delimiter', ',')
    .option('header', True)
    .csv(path='/tmp/read-save-example/df2/')
)

In [8]:
# Print every CSV part file written to df2; each part file carries its own header row.
!cat /tmp/read-save-example/df2/*.csv

id,name,salary,hire_date
1,John,3100,2020-01-01
2,Joana,2700,2020-01-01
3,Maria,3100,2020-01-02
4,Sandra,4500,2020-01-02
id,name,salary,hire_date
5,Ben,3100,2020-01-03
6,Carl,4300,2020-01-03
7,Joseph,4200,2020-01-04
8,Oliver,3800,2020-01-04


# **Saving Mode 3 - Partitioned**

In [9]:
# Mode 3: write parquet partitioned by the hire_date column.
df.write.save(
    '/tmp/read-save-example/df3/',
    format='parquet',
    partitionBy=['hire_date'],
)

In [10]:
# Show the partitioned layout: one hire_date=<value> directory per distinct date.
!tree /tmp/read-save-example/df3/

/tmp/read-save-example/df3/
├── hire_date=2020-01-01
│   └── part-00000-a942ceed-0881-4277-9e6c-ee0243cc2ab4.c000.snappy.parquet
├── hire_date=2020-01-02
│   └── part-00000-a942ceed-0881-4277-9e6c-ee0243cc2ab4.c000.snappy.parquet
├── hire_date=2020-01-03
│   └── part-00001-a942ceed-0881-4277-9e6c-ee0243cc2ab4.c000.snappy.parquet
├── hire_date=2020-01-04
│   └── part-00001-a942ceed-0881-4277-9e6c-ee0243cc2ab4.c000.snappy.parquet
└── _SUCCESS

4 directories, 5 files


# **Saving Mode 4 - As Table**

In [12]:
# Mode 4: register the data as a table backed by files at an explicit path.
# The table gets its own directory (df4): pointing it at df3 replaces the
# partitioned output written by Mode 3.
# mode='overwrite' keeps the cell re-runnable; the default mode raises an error
# once the table already exists.
df.write.saveAsTable(name='tb_parquet_salaries', path='/tmp/read-save-example/df4/', mode='overwrite')

In [13]:
# Show the files backing the table.
!tree /tmp/read-save-example/df4/

/tmp/read-save-example/df4/
├── part-00000-4cef3893-bc2b-4fa1-8d8d-8bd57356b7ce-c000.snappy.parquet
├── part-00001-4cef3893-bc2b-4fa1-8d8d-8bd57356b7ce-c000.snappy.parquet
└── _SUCCESS

0 directories, 3 files


# **Saving with different formats**

In [14]:
# mode='overwrite' -> replace any data already at the path with this DataFrame
# mode='append'    -> add this DataFrame's rows to the data already at the path

df.write.format('parquet').mode('overwrite').save('/tmp/read-save-example/otherformat/parquet/')

df.write.format('orc').mode('overwrite').save('/tmp/read-save-example/otherformat/orc/')

In [15]:
# Show the files written for each format.
!tree /tmp/read-save-example/otherformat/

/tmp/read-save-example/otherformat/
├── orc
│   ├── part-00000-dbf2d30e-f7a5-44aa-958f-8f832bbe6aaf-c000.snappy.orc
│   ├── part-00001-dbf2d30e-f7a5-44aa-958f-8f832bbe6aaf-c000.snappy.orc
│   └── _SUCCESS
└── parquet
    ├── part-00000-cf491ff4-5e41-41b2-8189-6767f1f29242-c000.snappy.parquet
    ├── part-00001-cf491ff4-5e41-41b2-8189-6767f1f29242-c000.snappy.parquet
    └── _SUCCESS

2 directories, 6 files


# **Saving with an external format**

**Note: the Hudi jar was already added to the session when it was created.**

More information about hudi [here](https://hudi.apache.org/).

In [16]:
# Options passed to the Hudi data source for this write.
# NOTE(review): the precombine field is 'ts', but df has no 'ts' column (its columns
# are id, name, salary, hire_date). Presumably it is not consulted for a plain
# 'insert' — confirm before switching the operation to upsert.
hudi_options = {
    # Table identity
    'hoodie.table.name': 'tb_hudi_salaries',
    'hoodie.datasource.write.table.name': 'tb_hudi_salaries',
    # Record key, partition path and precombine columns
    'hoodie.datasource.write.recordkey.field': 'id',
    'hoodie.datasource.write.partitionpath.field': 'hire_date',
    'hoodie.datasource.write.precombine.field': 'ts',
    # Write behavior
    'hoodie.datasource.write.operation': 'insert',
    'hoodie.insert.shuffle.parallelism': 2,
}

# Write the DataFrame through the "hudi" format, overwriting any existing data at the path.
(
    df.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save('/tmp/read-save-example/otherformat/hudi/')
)


In [17]:
# Show the directory layout of the Hudi output: one folder per hire_date value.
!tree /tmp/read-save-example/otherformat/hudi/

/tmp/read-save-example/otherformat/hudi/
├── 2020-01-01
│   └── 559c358b-b5ea-4995-b964-c27f956d4714-0_0-12-24_20211112234155.parquet
├── 2020-01-02
│   └── af195b55-ce37-4e24-9740-e2402af20a78-0_1-14-25_20211112234155.parquet
├── 2020-01-03
│   └── cdc1e64a-0b5f-49fe-9355-41ed78375a9a-0_2-14-26_20211112234155.parquet
└── 2020-01-04
    └── 2794b72b-99eb-405a-974c-bdabe531f7da-0_3-14-27_20211112234155.parquet

4 directories, 4 files


# **Reading mode 1 - All params as arguments**

In [18]:
# Mode 1: the path and the format are both passed as arguments to load().
df_load = spark.read.load(
    '/tmp/read-save-example/otherformat/parquet/',
    format='parquet',
)
df_load.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+



# **Reading mode 2 - Implicit format method**

In [19]:
# Mode 2: the format-specific reader method (parquet) implies the format.
df_load2 = spark.read.parquet(
    '/tmp/read-save-example/otherformat/parquet/'
)
df_load2.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+



# **Reading mode 3 - Using wildcards**

In [20]:
# Mode 3: a glob pattern selects only the part*.parquet files inside the directory.
df_load3 = spark.read.parquet(
    '/tmp/read-save-example/otherformat/parquet/part*.parquet'
)
df_load3.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+



# **Reading mode 4 - Spark SQL direct path**

In [21]:
# Mode 4: query the files directly from SQL using the parquet.`<path>` syntax.
df_load4 = spark.sql(
    'select * from parquet.`/tmp/read-save-example/otherformat/parquet/`'
)
df_load4.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+



# **Reading mode 5 - Spark SQL Table**



In [22]:
# Query the table registered earlier with saveAsTable by its name.
df_load5 = spark.sql(
    'select * from tb_parquet_salaries'
)
df_load5.show()

+---+------+------+----------+
| id|  name|salary| hire_date|
+---+------+------+----------+
|  1|  John|  3100|2020-01-01|
|  2| Joana|  2700|2020-01-01|
|  3| Maria|  3100|2020-01-02|
|  4|Sandra|  4500|2020-01-02|
|  5|   Ben|  3100|2020-01-03|
|  6|  Carl|  4300|2020-01-03|
|  7|Joseph|  4200|2020-01-04|
|  8|Oliver|  3800|2020-01-04|
+---+------+------+----------+

