# I. Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
import datetime
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# II. Connect to the data

In [2]:
# Create the Spark session (or reuse one that is already running) that the
# cells below use to read the CSV files.
spark = (
    SparkSession.builder
    .appName("Wrangling covid19 data")
    .getOrCreate()
)

In [3]:
# Location of the raw CSV; reused by the helper functions further down.
path = 'data/jantojun2020.csv'

# The first row of the file holds the column names.
data = spark.read.option('header', True).csv(path)

# III. Assess the data

In [4]:
# Number of rows read from the CSV.
data.count()

2745847

In [5]:
# List the column names of the loaded dataset.
data.columns

['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'MKT_UNIQUE_CARRIER',
 'MKT_CARRIER_FL_NUM',
 'TAIL_NUM',
 'ORIGIN',
 'ORIGIN_CITY_NAME',
 'ORIGIN_STATE_ABR',
 'ORIGIN_STATE_NM',
 'DEST',
 'DEST_CITY_NAME',
 'DEST_STATE_ABR',
 'DEST_STATE_NM',
 'CRS_DEP_TIME',
 'DEP_TIME',
 'DEP_DELAY',
 'DEP_DELAY_NEW',
 'DEP_DEL15',
 'DEP_DELAY_GROUP',
 'DEP_TIME_BLK',
 'TAXI_OUT',
 'WHEELS_OFF',
 'WHEELS_ON',
 'TAXI_IN',
 'CRS_ARR_TIME',
 'ARR_TIME',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'ARR_DEL15',
 'ARR_DELAY_GROUP',
 'ARR_TIME_BLK',
 'CANCELLED',
 'CANCELLATION_CODE',
 'CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'DISTANCE_GROUP',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY']

# IV. Wrangle the data
## 1. Split data

### 1.1 Create airport location dataframe

In [15]:
def create_port_loc_df(path, output_path='data/port_loc.csv'):
    """Extract the distinct origin airports from a CSV and save them as CSV.

    Reads the CSV at ``path`` (its first row is the header), lower-cases every
    column name, keeps the unique ``(origin, origin_city_name,
    origin_state_abr)`` rows, trims ``origin_city_name`` to the text before
    its first comma, and writes the result to ``output_path``.

    Parameters
    ----------
    path : str
        Location of the source CSV file.
    output_path : str, optional
        Destination of the resulting CSV file. Defaults to
        ``'data/port_loc.csv'``.

    Returns
    -------
    None
        The table is only written to disk.
    """
    df = spark.read.csv(path, header=True)
    # Rename all columns in one step instead of stacking one
    # withColumnRenamed call per column.
    df = df.toDF(*[column.lower() for column in df.columns])
    port_loc_df = df.select('origin', 'origin_city_name', 'origin_state_abr').dropDuplicates()
    # Keep only the part of the city name before the first comma
    # (presumably values look like 'Santa Fe, NM' -- verify against the data).
    port_loc_df = port_loc_df.withColumn(
        'origin_city_name',
        split(port_loc_df['origin_city_name'], ',').getItem(0),
    )
    # Collect the de-duplicated rows to the driver so pandas can write a
    # single plain CSV file.
    port_loc_df.toPandas().to_csv(output_path, index=False)

In [16]:
# Write the de-duplicated origin-airport table derived from the source CSV
# to data/port_loc.csv.
create_port_loc_df(path)

In [17]:
# Read the saved table back in to check what was written, then preview it.
df = spark.read.option('header', True).csv('data/port_loc.csv')
df.show()

+------+----------------+----------------+
|origin|origin_city_name|origin_state_abr|
+------+----------------+----------------+
|   SAF|        Santa Fe|              NM|
|   MSP|     Minneapolis|              MN|
|   TUL|           Tulsa|              OK|
|   DBQ|         Dubuque|              IA|
|   LFT|       Lafayette|              LA|
|   ROW|         Roswell|              NM|
|   PIT|      Pittsburgh|              PA|
|   SLN|          Salina|              KS|
|   EAU|      Eau Claire|              WI|
|   DCA|      Washington|              VA|
|   PIA|          Peoria|              IL|
|   MOT|           Minot|              ND|
|   DAL|          Dallas|              TX|
|   ELP|         El Paso|              TX|
|   GSP|           Greer|              SC|
|   TOL|          Toledo|              OH|
|   PWM|        Portland|              ME|
|   MSN|         Madison|              WI|
|   MKG|        Muskegon|              MI|
|   ORF|         Norfolk|              VA|
+------+----------------+----------------+

### 1.2 Create states dataframe

In [21]:
def create_states_df(path, output_path='data/states.csv'):
    """Extract the distinct origin states from a CSV and save them as CSV.

    Reads the CSV at ``path`` (its first row is the header), lower-cases every
    column name, keeps the unique ``(origin_state_abr, origin_state_nm)``
    rows, and writes them to ``output_path``.

    Parameters
    ----------
    path : str
        Location of the source CSV file.
    output_path : str, optional
        Destination of the resulting CSV file. Defaults to
        ``'data/states.csv'``.

    Returns
    -------
    None
        The table is only written to disk.
    """
    df = spark.read.csv(path, header=True)
    # Rename all columns in one step instead of stacking one
    # withColumnRenamed call per column.
    df = df.toDF(*[column.lower() for column in df.columns])
    state_df = df.select('origin_state_abr', 'origin_state_nm').dropDuplicates()
    # Collect the de-duplicated rows to the driver so pandas can write a
    # single plain CSV file.
    state_df.toPandas().to_csv(output_path, index=False)

In [22]:
# Write the de-duplicated origin-state table derived from the source CSV
# to data/states.csv.
create_states_df(path)

In [23]:
# Read the saved states table back in to check what was written, then preview it.
state_df = spark.read.option('header', True).csv('data/states.csv')
state_df.show()

+----------------+--------------------+
|origin_state_abr|     origin_state_nm|
+----------------+--------------------+
|              VI| U.S. Virgin Islands|
|              MT|             Montana|
|              NC|      North Carolina|
|              MD|            Maryland|
|              CO|            Colorado|
|              CT|         Connecticut|
|              IL|            Illinois|
|              WY|             Wyoming|
|              NJ|          New Jersey|
|              LA|           Louisiana|
|              TN|           Tennessee|
|              AR|            Arkansas|
|              AK|              Alaska|
|              CA|          California|
|              NM|          New Mexico|
|              UT|                Utah|
|              MI|            Michigan|
|              TT|U.S. Pacific Trus...|
|              NY|            New York|
|              NH|       New Hampshire|
+----------------+--------------------+
only showing top 20 rows



In [24]:
# Number of rows in the states table reloaded from data/states.csv.
state_df.count()

52