# State Airports
## 1. ETL Process

In [1]:
import os

from itertools import chain

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import first, count, when, col, create_map, lit, coalesce

In [2]:
# Create (or reuse) the Spark session for this notebook; the hadoop-aws
# package is requested so that s3a:// paths can be used when writing output.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0')
    .appName('us_airports')
    .getOrCreate()
)

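Enter your AWS credentials below and uncomment the cell if the data are to be stored in S3 (prefer reading the keys from environment variables over hard-coding them in the notebook):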

In [3]:
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.access.key', <enter access key>)

# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.secret.key', <enter secret key>)
    
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.endpoint', 's3.amazonaws.com')

# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('mapreduce.fileoutputcommitter.algorithm.version', '2')

A function which creates a map between SAS labels and their values is defined.

In [4]:
def code_mapper(label, path='./I94_SAS_Labels_Descriptions.SAS'):
    """Parse one value-label section of a SAS labels file into a dict.

    The section starts at the first occurrence of ``label`` in the file and
    ends at the next ``;``.  The line holding the label itself is skipped;
    every following line of the form ``'code' = 'description'`` becomes one
    entry.  Tabs and single quotes are removed and both sides are stripped of
    surrounding whitespace.  Lines without ``=`` are ignored.

    Args:
        label: Text marking the start of the section (e.g. ``'i94addrl'``).
        path: Location of the SAS labels file.  Defaults to the file this
            function has always read, so existing calls are unaffected.

    Returns:
        dict mapping each code (str) to its description (str).

    Raises:
        ValueError: If ``label`` does not occur in the file, or if no ``;``
            follows it.
    """
    with open(path) as f:
        content = f.read().replace('\t', '')

    start = content.find(label)
    if start == -1:
        raise ValueError('label {!r} not found in {}'.format(label, path))

    end = content.find(';', start)
    if end == -1:
        raise ValueError(
            "no ';' terminates the {!r} section in {}".format(label, path)
        )

    lines = content[start:end].replace("'", "").split('\n')

    label_dict = {}
    # lines[0] is the remainder of the line containing the label itself.
    for line in lines[1:]:
        # Split on the first '=' only, so a description that itself contains
        # '=' is kept instead of being silently dropped.
        code, sep, description = line.partition('=')
        if sep:
            label_dict[code.strip()] = description.strip()

    return label_dict

A {state_code: state} map is created.

In [5]:
# Build the {code: description} dict from the 'i94addrl' section of the SAS labels file.
i94addr = code_mapper('i94addrl')

The state names in the map are formatted to match the state names used in the other datasets.
- 'DIST. OF' is replaced with 'District of'
- 'S.', 'N.', 'W.' are replaced with 'South', 'North', 'West'
- every word of a state name is capitalized, except 'of'

In [6]:
def format_state(s):
    """Expand abbreviations in a state name and capitalize each word.

    'DIST. OF', 'S.', 'N.' and 'W.' are replaced (in that order, as plain
    substring replacements) by 'District of', 'South', 'North' and 'West'.
    The result is split on whitespace, every word except a lowercase 'of' is
    capitalized, and the words are re-joined with single spaces.
    """
    expansions = (
        ('DIST. OF', 'District of'),
        ('S.', 'South'),
        ('N.', 'North'),
        ('W.', 'West'),
    )
    for abbreviation, expanded in expansions:
        s = s.replace(abbreviation, expanded)

    words = []
    for word in s.split():
        # 'of' (already lowercase) stays as-is so "District of Columbia" reads correctly.
        words.append(word if word == 'of' else word.capitalize())
    return ' '.join(words)

# Re-build the map with formatted state names, leaving out the entry keyed '99'.
i94addr = {
    code: format_state(name)
    for code, name in i94addr.items()
    if code != '99'
}

The data are read from a .csv file into a Spark DataFrame.

In [7]:
# Load the airport codes CSV, using its first line as the column header.
path = 'datasets/airport-codes_csv.csv'
df = spark.read.option('header', True).csv(path)

A new DataFrame is created containing the number of airports of each type in each state. Closed and non-U.S. airports are excluded.

In [8]:
# Register the DataFrame as the temp view 'airports' so it can be queried through spark.sql.
df.createOrReplaceTempView('airports')

In [9]:
# Aggregate the 'airports' view: keep rows whose iso_country is "US" and whose
# type is not "closed", count the rows per (iso_region, type), and expose
# iso_region without its first three characters as state_code.
df = spark.sql("""
    SELECT RIGHT(iso_region, LENGTH(iso_region) - 3) AS state_code, type, COUNT(*) AS airport_count
      FROM airports
     WHERE iso_country == "US" AND type != "closed"
     GROUP BY iso_region, type
     ORDER BY iso_region
""")

A pivot creates one column per airport type for each state; where a state has no airports of a given type, the resulting null is replaced by 0.

In [10]:
# Reshape to one row per state_code with one count column per airport type;
# (state_code, type) combinations that have no row are filled with 0.
df = (
    df.groupBy('state_code')
      .pivot('type')
      .agg(first('airport_count'))
      .orderBy('state_code')
      .na.fill(0)
)

A 'state' column is created by mapping state codes to states.

In [11]:
# Turn the {state_code: state} dict into a Spark map literal: create_map takes
# alternating key/value columns, so the dict items are flattened first.
mapping_expr = create_map([lit(x) for x in chain(*i94addr.items())])

# Look each state_code up in the map. The indexing operator is used because
# passing a Column to Column.getItem is deprecated since Spark 3.0; indexing
# behaves the same on Spark 2.x and 3.x.
df = df.withColumn('state', mapping_expr[col('state_code')])

All rows with invalid states (state codes that have no match in the map) are dropped.

In [12]:
# Remove every row that still holds a null in any column.
df = df.dropna()

Save the DataFrame as .parquet, either in s3a://us-immigration-project or locally.

In [13]:
# write to S3 bucket
# output_data = 's3a://us-immigration-project/'
# df.write.parquet(os.path.join(output_data, 'state_airport'), 'overwrite')

# for local write
# df.write.parquet('./output/state_airport', 'overwrite')

## 2. Dataset Info

DataFrame Schema:

In [14]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- state_code: string (nullable = true)
 |-- balloonport: long (nullable = true)
 |-- heliport: long (nullable = true)
 |-- large_airport: long (nullable = true)
 |-- medium_airport: long (nullable = true)
 |-- seaplane_base: long (nullable = true)
 |-- small_airport: long (nullable = true)
 |-- state: string (nullable = true)



Sample DataFrame records:

In [15]:
# Show the first two rows, one field per line (vertical), without truncating values.
df.show(n=2, truncate=False, vertical=True)

-RECORD 0-----------------
 state_code     | AK      
 balloonport    | 0       
 heliport       | 61      
 large_airport  | 2       
 medium_airport | 90      
 seaplane_base  | 146     
 small_airport  | 497     
 state          | Alaska  
-RECORD 1-----------------
 state_code     | AL      
 balloonport    | 0       
 heliport       | 134     
 large_airport  | 4       
 medium_airport | 11      
 seaplane_base  | 7       
 small_airport  | 183     
 state          | Alabama 
only showing top 2 rows



DataFrame rows and columns:

In [16]:
# Report the DataFrame's row and column counts on one line.
print('rows: {} columns: {}'.format(df.count(), len(df.columns)))

rows: 51 columns: 8


## 3. Data Quality Check

Check for nulls in each column:

In [17]:
# For every column, count the rows in which that column is null.
df.select(*[
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]).show(truncate=False, vertical=True)

-RECORD 0-------------
 state_code     | 0   
 balloonport    | 0   
 heliport       | 0   
 large_airport  | 0   
 medium_airport | 0   
 seaplane_base  | 0   
 small_airport  | 0   
 state          | 0   



## 4. Primary Key & Columns Related to Other Datasets

Total states in DataFrame:

In [18]:
# Number of distinct (state, state_code) pairs. A count does not depend on row
# order, so no sort is requested here.
df.select('state', 'state_code').distinct().count()

51

Display all states and state codes:

In [19]:
# List every distinct (state, state_code) pair ordered by state; 51 matches the
# row count printed earlier in the notebook, so all rows are displayed.
df.select('state', 'state_code').distinct().orderBy('state').show(51, truncate=False)

+--------------------+----------+
|state               |state_code|
+--------------------+----------+
|Alabama             |AL        |
|Alaska              |AK        |
|Arizona             |AZ        |
|Arkansas            |AR        |
|California          |CA        |
|Colorado            |CO        |
|Connecticut         |CT        |
|Delaware            |DE        |
|District of Columbia|DC        |
|Florida             |FL        |
|Georgia             |GA        |
|Hawaii              |HI        |
|Idaho               |ID        |
|Illinois            |IL        |
|Indiana             |IN        |
|Iowa                |IA        |
|Kansas              |KS        |
|Kentucky            |KY        |
|Louisiana           |LA        |
|Maine               |ME        |
|Maryland            |MD        |
|Massachusetts       |MA        |
|Michigan            |MI        |
|Minnesota           |MN        |
|Mississippi         |MS        |
|Missouri            |MO        |
|Montana      