In [1]:
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import os
import pyspark.sql.functions as F

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Reuse the running SparkContext if one already exists in this process.
# Calling SparkContext() a second time (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once"; getOrCreate() is safe to repeat.
sc = SparkContext.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 12:50:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/09 12:50:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/09 12:50:50 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Obtain the session through the builder, the documented entry point. It
# attaches to the active SparkContext (`sc`) and returns the existing session
# when this cell is re-run instead of constructing another one.
spark = SparkSession.builder.getOrCreate()

In [18]:
# Relative location of the raw CSV data.
file_path = os.path.join('..', 'src-data', 'AA_data')

# Treat the first row as column names and let Spark infer the column types.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(file_path)
)

### 1. Caching

- Stores the DataFrame in memory or on disk.
- Speeds up subsequent transformations and actions.
- Reduces resource usage when the DataFrame is reused.

**Notes:** Avoid caching when:
- The dataset is very large.
- Caching would fall back to local disk, which might not improve performance.
- The cached objects might not be available.


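Cache a DataFrame with `DF_name.cache()` followed by an action, e.g. `DF_name.cache().count()`. Caching is lazy, so the data is only stored once an action runs.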

In [6]:
# cache() only marks df for caching; running an action such as count()
# is what actually materializes the cached data.
df.cache()
df.count()

                                                                                

583718

Check whether the object is cached.

Use `is_cached`.

In [8]:
# is_cached is True after cache() has been called on df and False after unpersist().
print(f"df.is_cached: {df.is_cached}")

df.is_cached: True


Remove the DataFrame from the cache.

Use `unpersist()`

In [9]:
# Drop df from the cache, then print the flag to confirm it is False again.
df.unpersist()
print(f"df.is_cached: {df.is_cached}")

df.is_cached: False


### 2. Parquet

`Parquet` is an open-source file format built for flat, columnar data storage. `Parquet` works well with complex data in large volumes. It is known both for its efficient data compression and for its ability to handle a wide variety of encoding types.

`Parquet` deploys Google's record-shredding and assembly algorithm that can address complex data structures within data storage. Some 
`Parquet` benefits include:

- Fast queries that can fetch specific column values without reading full row data

- Highly efficient column-wise compression

- High compatibility with OLAP

![plot](https://github.com/DatacollectorVN/PySpark-Tutorial/blob/master/public-imgs/parquet.png?raw=true)

Save DataFrame to Parquet

In [10]:
# Number of partitions in df's underlying RDD.
df.rdd.getNumPartitions()

7

In [19]:
# Replace the raw CSV headers with identifier-safe names in a single pass.
# NOTE(review): presumably needed because Spark's Parquet writer has
# historically rejected column names containing spaces or parentheses —
# confirm for the Spark version in use.
PARQUET_SAFE_NAMES = {
    'Date (MM/DD/YYYY)': 'Date_MM_DD_YYYY',
    'Flight Number': 'Flight_Number',
    'Destination Airport': 'Destination_Airport',
    'Actual elapsed time (Minutes)': 'Actual_elapsed_time_Minutes',
}

for old_name, new_name in PARQUET_SAFE_NAMES.items():
    # withColumnRenamed is a no-op when old_name is not in the schema,
    # so re-running this cell leaves df unchanged.
    df = df.withColumnRenamed(old_name, new_name)

df.show()

+---------------+-------------+-------------------+---------------------------+
|Date_MM_DD_YYYY|Flight_Number|Destination_Airport|Actual_elapsed_time_Minutes|
+---------------+-------------+-------------------+---------------------------+
|     01/01/2015|            5|                HNL|                        526|
|     01/01/2015|            7|                OGG|                        517|
|     01/01/2015|           23|                SFO|                        233|
|     01/01/2015|           27|                LAS|                        165|
|     01/01/2015|           29|                ONT|                          0|
|     01/01/2015|           35|                HDN|                        178|
|     01/01/2015|           37|                SAN|                        187|
|     01/01/2015|           43|                DTW|                          0|
|     01/01/2015|           49|                SAN|                        178|
|     01/01/2015|           51|         

In [27]:
# Write df as a Parquet dataset (a directory of part files).
# The default save mode raises an error when the target path already exists,
# which makes this cell fail on every run after the first; overwrite the
# previous output instead so the notebook can be re-run end to end.
parquet_path = os.path.join('..', 'src-data', 'AA_DFW_ALL.parquet')
df.write.parquet(parquet_path, mode='overwrite')

                                                                                

*Expected output:* Spark writes one part file per partition, so the output directory should contain 7 Parquet part files.