# I. Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
import datetime
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# II. Connect to the data

In [2]:
# Obtain the Spark session (getOrCreate) under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Wrangling covid19 data")
    .getOrCreate()
)

In [3]:
# Location of the raw CSV, relative to the notebook.
path = 'data/jantojun2020.csv'
# The first row of the file supplies the column names.
data = spark.read.option('header', True).csv(path)

# III. Assess the data

In [4]:
# Number of rows in the loaded dataframe.
data.count()

2745847

In [5]:
# Column names exactly as they appear in the CSV header.
data.columns

['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'MKT_UNIQUE_CARRIER',
 'MKT_CARRIER_FL_NUM',
 'TAIL_NUM',
 'ORIGIN',
 'ORIGIN_CITY_NAME',
 'ORIGIN_STATE_ABR',
 'ORIGIN_STATE_NM',
 'DEST',
 'DEST_CITY_NAME',
 'DEST_STATE_ABR',
 'DEST_STATE_NM',
 'CRS_DEP_TIME',
 'DEP_TIME',
 'DEP_DELAY',
 'DEP_DELAY_NEW',
 'DEP_DEL15',
 'DEP_DELAY_GROUP',
 'DEP_TIME_BLK',
 'TAXI_OUT',
 'WHEELS_OFF',
 'WHEELS_ON',
 'TAXI_IN',
 'CRS_ARR_TIME',
 'ARR_TIME',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'ARR_DEL15',
 'ARR_DELAY_GROUP',
 'ARR_TIME_BLK',
 'CANCELLED',
 'CANCELLATION_CODE',
 'CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'DISTANCE_GROUP',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY']

# IV. Wrangling data
## 1. Split data

### 1.1 Create port_loc dataframe

In [6]:
def create_port_loc_df(path):
    """Export the distinct origin airport / city / state rows to CSV.

    Reads the CSV at ``path`` (first row as header) through the notebook's
    ``spark`` session, lower-cases every column name, keeps the distinct
    ``origin``, ``origin_city_name`` and ``origin_state_abr`` combinations,
    trims ``origin_city_name`` to the text before its first comma and
    writes the result to ``data/port_loc.csv`` without an index column.

    Parameters
    ----------
    path : str
        Path of the source CSV file.

    Returns
    -------
    None
    """
    flights = spark.read.csv(path, header=True)
    # Lower-case all headers in a single pass.
    flights = flights.toDF(*[name.lower() for name in flights.columns])

    airports = (
        flights
        .select('origin', 'origin_city_name', 'origin_state_abr')
        .dropDuplicates()
    )
    # Keep only the part of the city name that precedes the first comma.
    city_only = split(airports['origin_city_name'], ',').getItem(0)
    airports = airports.withColumn('origin_city_name', city_only)

    # Collected to pandas on the driver before being written out.
    airports.toPandas().to_csv('data/port_loc.csv', index=False)

In [7]:
# Write data/port_loc.csv from the CSV that `path` currently points to.
create_port_loc_df(path)

In [8]:
# Read the exported file back and display its first rows as a check.
df = spark.read.csv('data/port_loc.csv', header=True)
df.show()

+------+----------------+----------------+
|origin|origin_city_name|origin_state_abr|
+------+----------------+----------------+
|   SAF|        Santa Fe|              NM|
|   MSP|     Minneapolis|              MN|
|   TUL|           Tulsa|              OK|
|   DBQ|         Dubuque|              IA|
|   LFT|       Lafayette|              LA|
|   ROW|         Roswell|              NM|
|   PIT|      Pittsburgh|              PA|
|   SLN|          Salina|              KS|
|   EAU|      Eau Claire|              WI|
|   DCA|      Washington|              VA|
|   PIA|          Peoria|              IL|
|   MOT|           Minot|              ND|
|   DAL|          Dallas|              TX|
|   ELP|         El Paso|              TX|
|   GSP|           Greer|              SC|
|   TOL|          Toledo|              OH|
|   PWM|        Portland|              ME|
|   MSN|         Madison|              WI|
|   MKG|        Muskegon|              MI|
|   ORF|         Norfolk|              VA|
+------+---

### 1.2 Create states dataframe

In [9]:
def create_states_df(path):
    """Export the distinct origin-state abbreviation / name pairs to CSV.

    Reads the CSV at ``path`` (first row as header) through the notebook's
    ``spark`` session, lower-cases every column name, keeps the distinct
    ``origin_state_abr`` / ``origin_state_nm`` combinations and writes them
    to ``data/states.csv`` without an index column.

    Parameters
    ----------
    path : str
        Path of the source CSV file.

    Returns
    -------
    None
    """
    source = spark.read.csv(path, header=True)
    # Lower-case all headers in a single pass.
    lowered = source.toDF(*[name.lower() for name in source.columns])

    state_pairs = lowered.select('origin_state_abr', 'origin_state_nm').dropDuplicates()

    # Collected to pandas on the driver before being written out.
    state_pairs.toPandas().to_csv('data/states.csv', index=False)

In [10]:
# Write data/states.csv from the CSV that `path` currently points to.
create_states_df(path)

In [11]:
# Read the exported file back and display its first rows as a check.
state_df = spark.read.csv('data/states.csv', header=True)
state_df.show()

+----------------+--------------------+
|origin_state_abr|     origin_state_nm|
+----------------+--------------------+
|              VI| U.S. Virgin Islands|
|              MT|             Montana|
|              NC|      North Carolina|
|              MD|            Maryland|
|              CO|            Colorado|
|              CT|         Connecticut|
|              IL|            Illinois|
|              WY|             Wyoming|
|              NJ|          New Jersey|
|              LA|           Louisiana|
|              TN|           Tennessee|
|              AR|            Arkansas|
|              AK|              Alaska|
|              CA|          California|
|              NM|          New Mexico|
|              UT|                Utah|
|              MI|            Michigan|
|              TT|U.S. Pacific Trus...|
|              NY|            New York|
|              NH|       New Hampshire|
+----------------+--------------------+
only showing top 20 rows



In [12]:
# Number of distinct (abbreviation, name) rows that were exported.
state_df.count()

52

### 1.3 Create airline dataframe

In [13]:
def create_airline_code(path, start=10, stop=20, out_path="data/airline.csv"):
    """Extract airline code/name pairs from a text file and save them as CSV.

    Lines ``start`` (inclusive) to ``stop`` (exclusive, 0-based) of the file
    at ``path`` are each expected to look like ``code: name``. Single quotes
    and surrounding whitespace are removed from both parts. Only the first
    colon separates code from name, so a name that itself contains a colon
    is kept whole. Blank lines inside the window are skipped.

    Parameters
    ----------
    path : str
        Text file containing the airline listing.
    start, stop : int, optional
        Slice bounds of the lines holding the airline entries
        (defaults 10 and 20).
    out_path : str, optional
        Destination CSV file (default ``"data/airline.csv"``).

    Returns
    -------
    None
        The value returned by ``DataFrame.to_csv`` when given a path.

    Raises
    ------
    ValueError
        If a non-blank line in the window contains no ``:`` separator.
    """
    with open(path) as f:
        lines = [line.strip() for line in f]

    codes, names = [], []
    for entry in lines[start:stop]:
        if not entry:
            # Tolerate empty lines inside the selected window.
            continue
        code, sep, name = entry.partition(":")
        if not sep:
            raise ValueError(
                "expected a 'code: name' entry, got {!r}".format(entry)
            )
        codes.append(code.replace("'", "").strip())
        names.append(name.replace("'", "").strip())

    airline_df = pd.DataFrame({"c_airline": codes, "airline_name": names})
    return airline_df.to_csv(out_path, index=False)

In [14]:
# NOTE(review): this rebinds `path`, which an earlier cell set to the main CSV
# and which is passed to create_port_loc_df / create_states_df. Re-running
# those cells after this one would hand them the column-description file.
path = 'data/ColumnDescriptions.txt'
create_airline_code(path)

In [15]:
# Check: read the exported airline file back and display it.
spark.read.csv('data/airline.csv', header=True).show()

+---------+------------------+
|c_airline|      airline_name|
+---------+------------------+
|       AA| American Airlines|
|       AS|   Alaska Airlines|
|       B6|           JetBlue|
|       DL|   Delta Air Lines|
|       F9| Frontier Airlines|
|       G4|     Allegiant Air|
|       HA| Hawaiian Airlines|
|       NK|   Spirit Airlines|
|       UA|   United Airlines|
|       WN|Southwest Airlines|
+---------+------------------+



### 1.4 Create distance_group dataframe

In [16]:
# Preview the first 10 distance / distance_group pairs as loaded.
data.select('distance', 'distance_group').show(10)

+--------+--------------+
|distance|distance_group|
+--------+--------------+
|     363|             2|
|     363|             2|
|     333|             2|
|     333|             2|
|     333|             2|
|     333|             2|
|     333|             2|
|     390|             2|
|     390|             2|
|     390|             2|
+--------+--------------+
only showing top 10 rows



The table above shows that some `distance_group` values in this dataframe are wrong: according to the explanation in ColumnDescriptions.txt, the group for these distances should be 1, not 2. We therefore fix these values first.
##### Fix the wrong distance_group values

In [17]:
# Recompute distance_group as the integer part of distance / 250:
# cast the distance column to int, divide by 250, then floor() the quotient.
# NOTE(review): this rebinds `data`, so the distance_group values displayed
# by earlier cells are stale once this cell has run.
data = data.withColumn('distance_group', floor(data['distance'].cast('int')/250))
# Check: show the first 10 rows with the recomputed group.
data.select('distance', 'distance_group').show(10)

+--------+--------------+
|distance|distance_group|
+--------+--------------+
|     363|             1|
|     363|             1|
|     333|             1|
|     333|             1|
|     333|             1|
|     333|             1|
|     333|             1|
|     390|             1|
|     390|             1|
|     390|             1|
+--------+--------------+
only showing top 10 rows



##### Create the wanted dataframe

In [21]:
# Summary statistics of distance_group, used to pick the lookup-table range.
data.select('distance_group').describe().show()

+-------+------------------+
|summary|    distance_group|
+-------+------------------+
|  count|           2745847|
|   mean|  2.50195659117205|
| stddev|2.2411625220911175|
|    min|                 0|
|    max|                20|
+-------+------------------+



Because the minimum and maximum values of `distance_group` are 0 and 20, respectively, we generate groups 0 through 25 for the distance_group dataframe, which covers every value observed in the data.

In [22]:
def create_distance_group(n_groups=26, group_width=250,
                          out_path='data/distance_group.csv'):
    """Write a lookup table mapping each distance group to its mile range.

    Group ``g`` covers ``g * group_width <= distance < (g + 1) * group_width``.
    The table has the columns ``distance_group`` and
    ``distance_range(miles)`` and is saved without an index column.

    Parameters
    ----------
    n_groups : int, optional
        Number of groups to generate, numbered from 0 (default 26).
    group_width : int, optional
        Width of each group in miles (default 250).
    out_path : str, optional
        Destination CSV file (default ``'data/distance_group.csv'``).

    Returns
    -------
    None
    """
    # The local is named `rows` rather than `data` so it cannot be confused
    # with the notebook's global `data` dataframe.
    rows = [
        [group, "{} <= distance < {}".format(group * group_width,
                                             (group + 1) * group_width)]
        for group in range(n_groups)
    ]
    table = pd.DataFrame(data=rows,
                         columns=['distance_group', 'distance_range(miles)'])
    table.to_csv(out_path, index=False)

In [23]:
# Write the distance-group lookup table to disk.
create_distance_group()

In [26]:
# Check: read the lookup table back and display up to 30 rows untruncated.
spark.read.csv('data/distance_group.csv', header=True).show(30,truncate=False)

+--------------+-----------------------+
|distance_group|distance_range(miles)  |
+--------------+-----------------------+
|0             |0 <= distance < 250    |
|1             |250 <= distance < 500  |
|2             |500 <= distance < 750  |
|3             |750 <= distance < 1000 |
|4             |1000 <= distance < 1250|
|5             |1250 <= distance < 1500|
|6             |1500 <= distance < 1750|
|7             |1750 <= distance < 2000|
|8             |2000 <= distance < 2250|
|9             |2250 <= distance < 2500|
|10            |2500 <= distance < 2750|
|11            |2750 <= distance < 3000|
|12            |3000 <= distance < 3250|
|13            |3250 <= distance < 3500|
|14            |3500 <= distance < 3750|
|15            |3750 <= distance < 4000|
|16            |4000 <= distance < 4250|
|17            |4250 <= distance < 4500|
|18            |4500 <= distance < 4750|
|19            |4750 <= distance < 5000|
|20            |5000 <= distance < 5250|
|21            |