Objective: do a test run of parsing a SAS7BDAT (SAS dataset) binary file.

1. Parse a SAS7BDAT SAS Dataset binary file into a Spark DataFrame `df_spark`
    (from `/data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat`)
    
2. Write the Spark DataFrame `df_spark` into directory `sas_data/*.parquet`.

In [1]:
# Udacity starter code: parse one SAS dataset (.sas7bdat) into a Spark DataFrame.
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session configured to fetch the
# spark-sas7bdat reader package from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

# Load a sample sas7bdat SAS dataset into a Spark DataFrame.
# The file is stored on the Udacity server at:
#   Absolute path: '/data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
#   Relative path (to this notebook): '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
# The absolute path is used here, just to be safe.
df_spark = (
    spark.read
    .format('com.github.saurfang.sas.spark')
    .load('/data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')
)

In [2]:
# Take a peek: fetch the first 5 rows of the DataFrame loaded from the SAS file.
df_spark.take(5)

[Row(cicid=6.0, i94yr=2016.0, i94mon=4.0, i94cit=692.0, i94res=692.0, i94port='XXX', arrdate=20573.0, i94mode=None, i94addr=None, depdate=None, i94bir=37.0, i94visa=2.0, count=1.0, dtadfile=None, visapost=None, occup=None, entdepa='T', entdepd=None, entdepu='U', matflag=None, biryear=1979.0, dtaddto='10282016', gender=None, insnum=None, airline=None, admnum=1897628485.0, fltno=None, visatype='B2'),
 Row(cicid=7.0, i94yr=2016.0, i94mon=4.0, i94cit=254.0, i94res=276.0, i94port='ATL', arrdate=20551.0, i94mode=1.0, i94addr='AL', depdate=None, i94bir=25.0, i94visa=3.0, count=1.0, dtadfile='20130811', visapost='SEO', occup=None, entdepa='G', entdepd=None, entdepu='Y', matflag=None, biryear=1991.0, dtaddto='D/S', gender='M', insnum=None, airline=None, admnum=3736796330.0, fltno='00296', visatype='F1'),
 Row(cicid=15.0, i94yr=2016.0, i94mon=4.0, i94cit=101.0, i94res=101.0, i94port='WAS', arrdate=20545.0, i94mode=1.0, i94addr='MI', depdate=20691.0, i94bir=55.0, i94visa=2.0, count=1.0, dtadfile=

In [3]:
# Remove the parquet output directory `sas_data` if it already exists.
# The leading `!` runs the rest of the line as a shell command from the notebook.
# NOTE(review): presumably done so the parquet write in the next cell does not
# hit an already-existing path - confirm against the writer's save mode.
!if [ -d "sas_data" ]; then rm -r sas_data; fi

In [4]:
# Write the DataFrame to parquet: this creates the `sas_data/` directory
# and writes the `*.parquet` files into it.
df_spark.write.parquet("sas_data")

In [5]:
# Test-read the parquet files back from the `sas_data` directory into a second DataFrame.
df_spark_2=spark.read.parquet("sas_data")

In [6]:
# Take a peek at the first 5 rows read back from parquet.
# Notice that the row order differs from the original DataFrame
# (the order gets shuffled as a result of the unordered parallel write).
df_spark_2.take(5)

[Row(cicid=5748517.0, i94yr=2016.0, i94mon=4.0, i94cit=245.0, i94res=438.0, i94port='LOS', arrdate=20574.0, i94mode=1.0, i94addr='CA', depdate=20582.0, i94bir=40.0, i94visa=1.0, count=1.0, dtadfile='20160430', visapost='SYD', occup=None, entdepa='G', entdepd='O', entdepu=None, matflag='M', biryear=1976.0, dtaddto='10292016', gender='F', insnum=None, airline='QF', admnum=94953870030.0, fltno='00011', visatype='B1'),
 Row(cicid=5748518.0, i94yr=2016.0, i94mon=4.0, i94cit=245.0, i94res=438.0, i94port='LOS', arrdate=20574.0, i94mode=1.0, i94addr='NV', depdate=20591.0, i94bir=32.0, i94visa=1.0, count=1.0, dtadfile='20160430', visapost='SYD', occup=None, entdepa='G', entdepd='O', entdepu=None, matflag='M', biryear=1984.0, dtaddto='10292016', gender='F', insnum=None, airline='VA', admnum=94955622830.0, fltno='00007', visatype='B1'),
 Row(cicid=5748519.0, i94yr=2016.0, i94mon=4.0, i94cit=245.0, i94res=438.0, i94port='LOS', arrdate=20574.0, i94mode=1.0, i94addr='WA', depdate=20582.0, i94bir=29.

In [7]:
# Ensure both df_spark and df_spark_2 have the same number of columns.

df_spark_cols = len(df_spark.columns)
df_spark_2_cols = len(df_spark_2.columns)

# The DataFrame parsed from the SAS file is expected to have 28 columns.
assert df_spark_cols == 28
# Compare the parquet round trip against the original. The previous check
# compared df_spark_2_cols with itself, so it could never fail.
assert df_spark_2_cols == df_spark_cols

In [8]:
# Ensure both df_spark and df_spark_2 have the same number of rows.
# The `.count()` method may take a while to run!
df_spark_rows = df_spark.count()
df_spark_2_rows = df_spark_2.count()

# The DataFrame parsed from the SAS file is expected to have 3096313 rows.
assert df_spark_rows == 3096313
# Compare the parquet round trip against the original. The previous check
# compared df_spark_2_rows with itself, so it could never fail.
assert df_spark_2_rows == df_spark_rows

In [9]:
# Print the schema (column names, types, nullability) of the DataFrame parsed from the SAS file.
df_spark.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

In [10]:
# Print the schema of the DataFrame read back from parquet, for comparison with df_spark's schema.
df_spark_2.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

In [11]:
# Confirm the two Spark DataFrames have the same set of column names
# (converting to sets ignores ordering and duplicates).
assert set(df_spark.columns) == set(df_spark_2.columns)

In [12]:
# Confirm the two Spark DataFrames have the same column names in the same order
# (the column name lists are compared element by element).
assert df_spark.columns == df_spark_2.columns

In [13]:
# Terminate the Spark session gracefully now that all checks are done.
spark.stop()

Mini Conclusion:

* we have a way to parse a SAS dataset (binary file, .sas7bdat) into a Spark DataFrame.
* we have a way to write the Spark DataFrame into parquet files.
* we have a way to read in the parquet files into a Spark DataFrame.
* we confirm that the Spark DataFrames read from the SAS dataset and from the Parquet files
  have the same columns (same set and same ordering) and the same row count.