# Process data

### Important

In order to process the airport, flight, and weather data, you must already have it available. If you have not already acquired the data, open notebook `01_get_data` and run all its cells.

In [1]:
import os
import re  # regular expressions
from urllib.parse import quote_plus

import pandas as pd

In [2]:
# Make `resources` the working directory, and set `data_dir`.
# The target path is relative, so only change directory when we are not
# already inside `resources`; otherwise re-running this cell would step up
# and out of it.
if os.path.basename(os.getcwd()) != 'resources':
    os.chdir(os.path.join('..','resources'))
data_dir = os.path.join('.','data')

In [3]:
os.listdir(data_dir)

['05-2019.csv',
 '06-2019.csv',
 '07-2019.csv',
 '08-2019.csv',
 '09-2019.csv',
 '10-2019.csv',
 '11-2019.csv',
 '12-2019.csv',
 'GlobalAirportDatabase.txt',
 'GlobalAirportDatabase.zip',
 'historical-flight-and-weather-data.zip',
 'readme.txt']

In [4]:
# Monthly flights-and-weather files are named `MM-2019.csv`.
# - The pattern is a raw string so that the regex escapes (`\d`, `\.`) are not
#   parsed as (invalid) string escapes.
# - `os.listdir` returns names in arbitrary order; sorting makes the order in
#   which the files are loaded (and so the row ids derived from it) reproducible.
faw_source_files = sorted(
    item
    for item in os.listdir(data_dir)
    if re.fullmatch(r'\d{2}-2019\.csv', item, flags=re.I) is not None
)

airports_source_file = 'GlobalAirportDatabase.txt'

In [5]:
def print_shape(df):
    """Print the size of `df` as "<rows> rows × <columns> columns".

    Both counts are formatted with thousands separators. Nothing is returned.
    """
    shape = df.shape
    df_rows, df_cols = shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

## Airports Data Set

In [6]:
# Remove leading and trailing whitespace from the Global Airport Database text
# (with `.strip()`), and assign it to a variable.
with open(os.path.join(data_dir,airports_source_file)) as gadb:
    gadb_text = gadb.read().strip()

In [7]:
# Examine some of the data to see what it looks like
gadb_text[:1000]

'AYGA:GKA:GOROKA:GOROKA:PAPUA NEW GUINEA:006:004:054:S:145:023:030:E:01610:-6.082:145.392\nAYLA:LAE:N/A:LAE:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYMD:MAG:MADANG:MADANG:PAPUA NEW GUINEA:005:012:025:S:145:047:019:E:00007:-5.207:145.789\nAYMH:HGU:MOUNT HAGEN:MOUNT HAGEN:PAPUA NEW GUINEA:005:049:034:S:144:017:046:E:01643:-5.826:144.296\nAYNZ:LAE:NADZAB:NADZAB:PAPUA NEW GUINEA:006:034:011:S:146:043:034:E:00073:-6.570:146.726\nAYPY:POM:PORT MORESBY JACKSONS INTERNATIONAL:PORT MORESBY:PAPUA NEW GUINEA:009:026:036:S:147:013:012:E:00045:-9.443:147.220\nAYRB:RAB:N/A:RABAUL:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYWK:WWK:WEWAK INTERNATIONAL:WEWAK:PAPUA NEW GUINEA:003:035:001:S:143:040:009:E:00006:-3.584:143.669\nBGAM:N/A:N/A:ANGMAGSSALIK:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAS:N/A:N/A:ANGISSOQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAT:N/A:N/A:APUTITEQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.0

In [8]:
# And again at the end of the data
gadb_text[-1000:]

'E:00139:45.623:126.250\nZYHE:N/A:N/A:HEIHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJD:N/A:N/A:JAGDAQI:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJM:N/A:JIAMUSI:JIAMUSI:CHINA:046:050:036:N:130:027:055:E:00080:46.843:130.465\nZYMD:N/A:HAILANG:MUDANJIANG:CHINA:044:031:026:N:129:034:008:E:00270:44.524:129.569\nZYNJ:N/A:N/A:NENJIANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYQQ:N/A:N/A:QIQIHAR:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYRD:CGQ:N/A:CHANGCHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTH:N/A:N/A:TAHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTK:N/A:N/A:SHENYANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTL:DLC:ZHOUSHUIZI:DALIAN:CHINA:038:057:056:N:121:032:018:E:00033:38.966:121.538\nZYXC:N/A:N/A:XIANCHENG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYC:N/A:N/A:YICHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYJ:N/A:YANJI:YANJI:CHINA:042:052:054:N:129:026:054:E:00191:

Rows are separated by newline (`\n`) characters; columns are separated by colons.  
Missing values are indicated in a number of ways, depending on the data type.

According to [the source](https://www.partow.net/miscellaneous/airportdatabase/index.html):

> **Note:** Some tuples may have missing or otherwise unaviable pieces of data. In the event the values are not present, given the data type a default value will be used as follows:
> 
> - String : `N/A`
> - Integer: `0`
> - Char : `U`
> - Floating Point: `0.0`

We will split the data—first by newlines, then by colons—and convert values as appropriate.

In [9]:
# Create a function to convert data types
def process_airport(ap):
    """Convert one raw airport record into a tuple of typed values.

    Parameters
    ----------
    ap : sequence of str
        The fields of one record, in file order. Positions 5, 6, 7, 9, 10, 11
        and 13 are parsed as integers; positions 14 and 15 as floats.

    Returns
    -------
    tuple
        The converted fields, with the source's "missing" placeholders
        (`0`/`0.0`, `'U'`, `'N/A'`) replaced by `None`.

    Raises
    ------
    ValueError
        If a numeric field cannot be parsed.
    IndexError
        If the record has fewer fields than the positions listed above.

    Notes
    -----
    The input sequence is left unmodified.
    A genuine zero (e.g. 0 minutes of latitude) cannot be told apart from the
    placeholder and is therefore also returned as `None`.
    """
    int_positions = (5, 6, 7, 9, 10, 11, 13)
    float_positions = (14, 15)

    # Work on a copy so the caller's list is not changed as a side effect
    fields = list(ap)
    for i in int_positions:
        fields[i] = int(fields[i])
    for i in float_positions:
        fields[i] = float(fields[i])

    # `0.0 == 0`, so float zeros are caught by the same membership test
    return tuple(None if elem in (0, 'U', 'N/A') else elem for elem in fields)

In [10]:
airports_tuples = [process_airport(ap.split(':')) for ap in gadb_text.split('\n')]

In [11]:
len(airports_tuples)

9300

In [12]:
for i in range (5):
    print(airports_tuples[i])

('AYGA', 'GKA', 'GOROKA', 'GOROKA', 'PAPUA NEW GUINEA', 6, 4, 54, 'S', 145, 23, 30, 'E', 1610, -6.082, 145.392)
('AYLA', 'LAE', None, 'LAE', 'PAPUA NEW GUINEA', None, None, None, None, None, None, None, None, None, None, None)
('AYMD', 'MAG', 'MADANG', 'MADANG', 'PAPUA NEW GUINEA', 5, 12, 25, 'S', 145, 47, 19, 'E', 7, -5.207, 145.789)
('AYMH', 'HGU', 'MOUNT HAGEN', 'MOUNT HAGEN', 'PAPUA NEW GUINEA', 5, 49, 34, 'S', 144, 17, 46, 'E', 1643, -5.826, 144.296)
('AYNZ', 'LAE', 'NADZAB', 'NADZAB', 'PAPUA NEW GUINEA', 6, 34, 11, 'S', 146, 43, 34, 'E', 73, -6.57, 146.726)


In [13]:
# Create dataframe

airports_columns = [
    'icao_code',
    'iata_code',
    'name',
    'city',
    'country',
    'lat_deg',
    'lat_min',
    'lat_sec',
    'lat_dir',
    'lon_deg',
    'lon_min',
    'lon_sec',
    'lon_dir',
    'altitude',
    'lat_decimal',
    'lon_decimal'
]

airports_df = pd.DataFrame(
    data=airports_tuples,
    columns=airports_columns
)

print_shape(airports_df)

9,300 rows × 16 columns


In [14]:
airports_df.head()

Unnamed: 0,icao_code,iata_code,name,city,country,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
0,AYGA,GKA,GOROKA,GOROKA,PAPUA NEW GUINEA,6.0,4.0,54.0,S,145.0,23.0,30.0,E,1610.0,-6.082,145.392
1,AYLA,LAE,,LAE,PAPUA NEW GUINEA,,,,,,,,,,,
2,AYMD,MAG,MADANG,MADANG,PAPUA NEW GUINEA,5.0,12.0,25.0,S,145.0,47.0,19.0,E,7.0,-5.207,145.789
3,AYMH,HGU,MOUNT HAGEN,MOUNT HAGEN,PAPUA NEW GUINEA,5.0,49.0,34.0,S,144.0,17.0,46.0,E,1643.0,-5.826,144.296
4,AYNZ,LAE,NADZAB,NADZAB,PAPUA NEW GUINEA,6.0,34.0,11.0,S,146.0,43.0,34.0,E,73.0,-6.57,146.726


In [15]:
# For each column, get data types, number of null values, and whether all (non-NULL) values in the column are unique
def airports_df_details(df=None):
    """Summarize each column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        The frame to summarize. When omitted, the module-level `airports_df`
        is used (looked up at call time, so the current frame is always read).

    Returns
    -------
    pandas.DataFrame
        One row per column of `df`, with the columns:
        `data_type` (the column's dtype), `null_count` (number of missing
        values) and `unique` (whether the non-missing values are all distinct).
    """
    if df is None:
        df = airports_df
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [16]:
airports_df_details()

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,5738,False
name,object,5105,False
city,object,0,False
country,object,0,False
lat_deg,float64,5171,False
lat_min,float64,5168,False
lat_sec,float64,5228,False
lat_dir,object,5112,False
lon_deg,float64,5195,False


`icao_code` is `UNIQUE` and `NON NULL`, and so can serve as the `PRIMARY KEY` of the `airports` table.

The `…_deg`, `…_min`, `…_sec`, and `altitude` columns should be `integer` types.

Expected `iata_code` to be unique, but it isn't.

In [17]:
# Columns that should hold whole numbers: the degree/minute/second parts of
# the coordinates, and the altitude
int_cols = list(filter(
    lambda col: any(x in col for x in ['_deg','_min','_sec','altitude']),
    airports_df.columns
))

In [18]:
airports_df[int_cols] = airports_df[int_cols].astype(pd.Int64Dtype())

In [19]:
airports_df.head()

Unnamed: 0,icao_code,iata_code,name,city,country,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
0,AYGA,GKA,GOROKA,GOROKA,PAPUA NEW GUINEA,6.0,4.0,54.0,S,145.0,23.0,30.0,E,1610.0,-6.082,145.392
1,AYLA,LAE,,LAE,PAPUA NEW GUINEA,,,,,,,,,,,
2,AYMD,MAG,MADANG,MADANG,PAPUA NEW GUINEA,5.0,12.0,25.0,S,145.0,47.0,19.0,E,7.0,-5.207,145.789
3,AYMH,HGU,MOUNT HAGEN,MOUNT HAGEN,PAPUA NEW GUINEA,5.0,49.0,34.0,S,144.0,17.0,46.0,E,1643.0,-5.826,144.296
4,AYNZ,LAE,NADZAB,NADZAB,PAPUA NEW GUINEA,6.0,34.0,11.0,S,146.0,43.0,34.0,E,73.0,-6.57,146.726


In [20]:
airports_df_details()

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,5738,False
name,object,5105,False
city,object,0,False
country,object,0,False
lat_deg,Int64,5171,False
lat_min,Int64,5168,False
lat_sec,Int64,5228,False
lat_dir,object,5112,False
lon_deg,Int64,5195,False


In [21]:
# These are lists rather than sets: pandas >= 2.0 rejects a set as a column
# indexer (`airports_df[c_cols]`), and a list keeps the columns in frame order.

# Columns without `icao_code` (i.e., the primary key)
npk_cols = [col for col in airports_df.columns if col != 'icao_code']

# Non-coordinate columns
nc_cols = ['icao_code','iata_code','name','city','country']

# Coordinate columns (every column that is not listed in `nc_cols`)
c_cols = [col for col in airports_df.columns if col not in nc_cols]

In [22]:
dup_rows = airports_df.duplicated(subset=npk_cols,keep=False)
dup_rows_w_iata = (dup_rows & airports_df.iata_code.notna())

print(f"There are {dup_rows.sum()} duplicate rows (not counting `icao_code`). Of those, {dup_rows_w_iata.sum()} have an IATA code present.")

del dup_rows
del dup_rows_w_iata

There are 162 duplicate rows (not counting `icao_code`). Of those, 0 have an IATA code present.


In [23]:
# Function to update duplicate- and NULL-related rows
def dn_info():
    """Recompute the duplicate/NULL row masks for `airports_df` and print their counts.

    Reads the module-level `airports_df` and `c_cols`, and (re)binds three
    module-level boolean Series:

    - `dup_iata_rows`: rows whose non-NULL IATA code occurs more than once
    - `all_null_rows`: rows that are NULL in every column listed in `c_cols`
    - `dup_iata_nc_rows`: rows for which both of the above are true

    Returns nothing; the three counts are printed.
    """
    global dup_iata_rows     # Rows with non-NULL, duplicate IATA codes
    global all_null_rows     # Rows that are empty in all coordinate columns
    global dup_iata_nc_rows  # Rows with duplicate IATA codes that are NULL in all coordinate columns
    # Index with a list: pandas >= 2.0 raises a TypeError when a set is used
    # as a column indexer, and `c_cols` may be a set.
    coord_cols = list(c_cols)
    dup_iata_rows = airports_df.iata_code.notna() & airports_df.iata_code.duplicated(keep=False)
    all_null_rows = (airports_df[coord_cols].isna().sum(axis=1) == len(coord_cols))
    dup_iata_nc_rows = dup_iata_rows & all_null_rows
    print(f"Not counting NULL values, there are {dup_iata_rows.sum()} rows with duplicate IATA codes.")
    print(f"There are {all_null_rows.sum()} rows that are NULL in all coordinate columns.")
    print(f"There are {dup_iata_nc_rows.sum()} rows that have duplicate IATA codes but no coordinate data.")

In [24]:
dn_info()

Not counting NULL values, there are 102 rows with duplicate IATA codes.
There are 5112 rows that are NULL in all coordinate columns.
There are 38 rows that have duplicate IATA codes but no coordinate data.


In [25]:
# Drop rows with duplicate IATA codes but no coordinate data.
airports_df.drop(index=airports_df.loc[dup_iata_nc_rows].index,inplace=True)

print_shape(airports_df)

9,262 rows × 16 columns


In [26]:
dn_info()

Not counting NULL values, there are 28 rows with duplicate IATA codes.
There are 5074 rows that are NULL in all coordinate columns.
There are 0 rows that have duplicate IATA codes but no coordinate data.


In [27]:
with pd.option_context('display.max_rows',None):
    display(airports_df.loc[dup_iata_rows].sort_values('iata_code'))

Unnamed: 0,icao_code,iata_code,name,city,country,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
5071,LTAG,ADA,INCIRLIK AB,ADANA,TURKEY,37,,7.0,N,35,25,33.0,E,73,37.002,35.426
5070,LTAF,ADA,ADANA,ADANA,TURKEY,36,58.0,55.0,N,35,16,49.0,E,20,36.982,35.28
488,DNAK,AKR,AKURE,AKURE,NIGERIA,7,14.0,48.0,N,5,18,3.0,E,336,7.247,5.301
3390,KAKR,AKR,AKRON FULTON INTERNATIONAL,AKRON,USA,41,2.0,15.0,N,81,28,,W,326,41.038,-81.467
8318,VEDB,DBO,DHANBAD,DHANBAD,INDIA,23,50.0,2.0,N,86,25,30.0,E,259,23.834,86.425
9068,YSDU,DBO,DUBBO,DUBBO,AUSTRALIA,32,13.0,,S,148,34,29.0,E,285,-32.217,148.575
3901,LEGR,GRX,GRANADA,GRANADA,SPAIN,37,11.0,19.0,N,3,46,38.0,W,567,37.189,-3.777
3899,LEGA,GRX,ARMILLA,GRANADA,SPAIN,37,7.0,59.0,N,3,38,8.0,W,701,37.133,-3.636
6138,OEAH,LEA,AL AHSA,AL-AHSA,SAUDI ARABIA,25,17.0,3.0,N,49,29,10.0,E,180,25.284,49.486
9055,YPLM,LEA,LEARMONTH,LEARMONTH,AUSTRALIA,22,14.0,8.0,S,114,5,19.0,E,6,-22.236,114.089


In [28]:
# Index labels of the rows to drop from `airports_df` so that each duplicated
# IATA code is left with a single row. The commented-out labels appear to be
# the rows that are kept, with the reason noted where one was recorded.
# NOTE(review): these are hard-coded labels, so they presumably only match this
# exact version of the source file — confirm before reusing with other data.
ix_drop_list = [
    5071,
#     5070,  # counterpart has NULL in `lat_min`
    488,
#     3390,  # International; also counterpart is non-USA (the flight-and-weather data is all US-based)
#     8318,  # counterpart has NULL in `lat_sec`
    9068,
    3901,
#     3899,  # has more specific `name` value than counterpart
    6138,
#     9055,
    3977,
#     4198,  # has more specific `name` value than counterpart
#     515,
    6518,
    6888,
#     6828,
    6902,
#     6897,  # International
#     7894,  # International
    7887,
#     3669,  # counterpart is non-USA
    5180,
    4549,
#     4553,  # has more specific `name` value than counterpart
    6748,
#     6745,  # has the airport's official name
    7516,
#     6552   # International
]

In [29]:
airports_df.drop(index=ix_drop_list,errors='ignore',inplace=True)

print_shape(airports_df)

9,248 rows × 16 columns


In [30]:
dn_info()

Not counting NULL values, there are 0 rows with duplicate IATA codes.
There are 5074 rows that are NULL in all coordinate columns.
There are 0 rows that have duplicate IATA codes but no coordinate data.


In [31]:
airports_df_details()

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,5738,True
name,object,5067,False
city,object,0,False
country,object,0,False
lat_deg,Int64,5133,False
lat_min,Int64,5129,False
lat_sec,Int64,5189,False
lat_dir,object,5074,False
lon_deg,Int64,5157,False


## Flights and Weather Data Set

In [32]:
# Read every monthly file and stack the results into one frame whose index is
# a fresh 0..n-1 range named `id`
faw_df = pd.concat(
    (pd.read_csv(os.path.join(data_dir,filename)) for filename in faw_source_files),
    ignore_index=True
).rename_axis('id')

print_shape(faw_df)

5,512,903 rows × 35 columns


In [33]:
# Make column labels lowercase
faw_df.rename(columns=str.lower,inplace=True)

In [34]:
faw_df.head()

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,...,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,...,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,...,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,...,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,...,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,...,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0


In [35]:
# Get data types and number of null values for each column
pd.concat(
    [
        faw_df.dtypes,
        faw_df.isna().sum()
    ],
    axis=1,
    keys=['data_type','null_count']
)

Unnamed: 0,data_type,null_count
carrier_code,object,0
flight_number,int64,0
origin_airport,object,0
destination_airport,object,0
date,object,0
scheduled_elapsed_time,int64,0
tail_number,object,13556
departure_delay,int64,0
arrival_delay,int64,0
delay_carrier,int64,0


**Note:** Eventually, `cancelled_code` will be our target column for a machine-learning algorithm.  
Because the column in the source data is `cancelled_code` and not `canceled_code`, the double-l spelling will be used in this work.

In [36]:
# Examine the `carrier_code` column
faw_df.carrier_code.value_counts()

AA    1438798
DL    1207720
UA    1070050
WN     918320
AS     304198
B6     199314
NK     142041
F9      97482
G4      71728
HA      63252
Name: carrier_code, dtype: int64

### What do the codes mean?

According to the United States Department of Transportation Bureau of Transportation Statistics Airlines and Airports data, [Airline Codes](https://www.bts.gov/topics/airlines-and-airports/airline-codes) document:

**AIRLINE CODES:**
- `AA`-American Airlines Inc.
- `AS`-Alaska Airlines Inc.
- `B6`-JetBlue Airways
- `DL`-Delta Air Lines Inc.
- `F9`-Frontier Airlines Inc.
- `G4`-Allegiant Air
- `HA`-Hawaiian Airlines Inc.
- `NK`-Spirit Air Lines
- `UA`-United Air Lines Inc.
- `WN`-Southwest Airlines Co.

In [37]:
# Examine the `flight_number` column
# Are they unique?
faw_df.flight_number.duplicated(keep=False).sum()

5512864

They very much are not unique.

In [38]:
# What about the combination of `carrier_code` and `flight_number`?
faw_df[['carrier_code','flight_number']].duplicated(keep=False).sum()

5512259

Also not unique.

In [39]:
# In order to prevent the `date` column from being confused with the `date` data
# type or any `date` functions, rename the `date` column to `flight_date`
faw_df.rename(columns={'date':'flight_date'}, errors='ignore', inplace=True)

In [40]:
# Combine `year`, `month`, and `day` into a single `string` in the same format as the `date` column
# and check for equality against the actual date column.

# Check only a few rows
(
    faw_df[['year','month','day']][:3]
    .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
    .equals(
        faw_df.flight_date[:3]
    )
)

# Check all the rows
# (
#     faw_df[['year','month','day']]
#     .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
#     .equals(
#         faw_df.flight_date
#     )
# )

True

**Note:** The all-rows check, above, is commented out because it takes a long time, but when run, it does show equality between the entire `flight_date` series and the combined `year`-`month`-`day` series.

Because `year`, `month`, and `day` were originally stored as `int64` values, this also tells us that all the values in `flight_date` are properly formatted (no leading or trailing spaces, *etc*.).

The data is therefore redundant, and we don't need both.

`weekday` is likewise redundant, since it can be calculated from `flight_date`.

In [41]:
# Drop redundant date columns
faw_df.drop(
    columns=['year','month','day','weekday'],
    errors='ignore',
    inplace=True
)

print_shape(faw_df)

5,512,903 rows × 31 columns


In [42]:
# Examine `cancelled_code` column
faw_df.cancelled_code.value_counts()

N    5426150
B      41919
A      23451
C      21370
D         13
Name: cancelled_code, dtype: int64

### What do the codes mean?

According to the United States Department of Transportation Bureau of Transportation Statistics Airlines and Airports data, [Number 14 - On-Time Reporting](https://www.bts.gov/topics/airlines-and-airports/number-14-time-reporting):

**CANCELLATION CODES**
- `A`-Carrier Caused
- `B`-Weather
- `C`-National Aviation System
- `D`-Security

\[`N` is not on the list and represents "None" or "Not cancelled".\]

We are only interested in flights that were cancelled due to weather, so we will keep only rows with `cancelled_code` `B` or `N`.

In [43]:
# Keep only weather cancellations (`B`) and flights that were not cancelled (`N`).
# `.copy()` makes the result an independent frame, so that later column
# assignments do not write into a slice of the unfiltered data
# (which is what triggers pandas' SettingWithCopyWarning).
faw_df = faw_df.loc[faw_df.cancelled_code.isin(['B','N'])].copy()

print_shape(faw_df)

5,468,069 rows × 31 columns


In [44]:
# Check that there are now only `B` and `N` values
faw_df.cancelled_code.value_counts()

N    5426150
B      41919
Name: cancelled_code, dtype: int64

In [45]:
# Convert `cancelled_code` column into boolean `cancelled` column, where
# `B` = True (*was* cancelled) and `N` = False (*was not* cancelled)

# Test for the column explicitly instead of catching `AttributeError`, so that
# re-running the cell is a clean no-op and unrelated errors are not swallowed.
# `assign` + `rename` build a new frame rather than writing into `faw_df`,
# which at this point may be a filtered slice of the loaded data.
if 'cancelled_code' in faw_df.columns:
    print("Converting cancelled_code column to boolean… ", end="")
    faw_df = (
        faw_df
        .assign(cancelled_code=faw_df.cancelled_code == 'B')
        .rename(columns={'cancelled_code':'cancelled'})
    )
    print()
else:
    print("Column has already been processed.")

faw_df.cancelled.value_counts()

Converting cancelled_code column to boolean… 


False    5426150
True       41919
Name: cancelled, dtype: int64

In [46]:
# How many flights were cancelled|not cancelled vs. how many departed|arrived

# A flight counts as departed/arrived when the matching timestamp is present
departed = faw_df.actual_departure_dt.notna()
arrived = faw_df.actual_arrival_dt.notna()

# Row labels: (event, did it happen?)
mult_ix = pd.MultiIndex.from_tuples([
    (event, happened)
    for event in ('departed','arrived')
    for happened in (True, False)
])

# Column labels: ('cancelled', was the flight cancelled?)
mult_cols = pd.MultiIndex.from_tuples([
    ('cancelled', flag) for flag in (False, True)
])

# Each cell is the number of flights matching both its row and its column
pd.DataFrame(
    data=[
        [
            int((cancel_mask & event_mask).sum())
            for cancel_mask in (~faw_df.cancelled, faw_df.cancelled)
        ]
        for event_mask in (departed, ~departed, arrived, ~arrived)
    ],
    index=mult_ix,
    columns=mult_cols
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cancelled,cancelled
Unnamed: 0_level_1,Unnamed: 1_level_1,False,True
departed,True,5426150,1854
departed,False,0,40065
arrived,True,5424261,0
arrived,False,1889,41919


In [47]:
# Does anything stand out for cancelled flights that still departed?
faw_df.loc[faw_df.cancelled & departed].head().transpose()

id,16715,17002,17815,18640,18750
carrier_code,AA,AA,AA,AA,AA
flight_number,1393,346,2761,1271,5821
origin_airport,OKC,DFW,DFW,IAH,DFW
destination_airport,DFW,MSY,STL,DFW,ELP
flight_date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,69,85,105,75,104
tail_number,N751UW,N357PV,N971TW,N898NN,N243LR
departure_delay,176,83,111,113,28
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


In [48]:
# What about non-cancelled flights that didn't arrive?
faw_df.loc[~faw_df.cancelled & ~arrived].head().transpose()

id,5154,12535,13657,16277,17368
carrier_code,AS,AA,WN,WN,UA
flight_number,55,2028,2272,2212,6296
origin_airport,SCC,MEM,PDX,ABQ,IAD
destination_airport,BRW,DFW,DAL,DAL,DFW
flight_date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,45,99,230,105,209
tail_number,N609AS,N749US,N931WN,N788SA,N87353
departure_delay,29,398,-2,-5,212
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


### Ensure Airport Codes Exist in Airports Data

In [49]:
# Every IATA code used as an origin or a destination
faw_airports = set(faw_df.origin_airport.values) | set(faw_df.destination_airport.values)
# ...minus the codes that the airports data knows about
faw_unk_airports = faw_airports.difference(airports_df.iata_code.values)

# How many IATA codes are in `flights_and_weather` but not in `airports`
print(f"There are {len(faw_airports)} distinct IATA codes in the flights and weather data, {len(faw_unk_airports)} of which do not appear in the airports data.")

There are 373 distinct IATA codes in the flights and weather data, 159 of which do not appear in the airports data.


In [50]:
# Boolean Series: True where the flight's origin AND destination airports
# both appear in the `airports` data
both_aps_known = (
    faw_df[['origin_airport','destination_airport']]
    .isin(airports_df.iata_code.values)
    .all(axis=1)
)

In [51]:
# Number of flights with known/unknown airports
print("Number of flights with…")
print(f"Both airports known:  {faw_df.loc[both_aps_known].index.size:>9,}")
print(f"At least one unknown: {faw_df.loc[~both_aps_known].index.size:>9,}")
print(f"Total:                {faw_df.index.size:>9,}")

Number of flights with…
Both airports known:  4,731,434
At least one unknown:   736,635
Total:                5,468,069


### Establish `flights_and_weather` table definition

Steps:
1. Translate Pandas (*i.e.*, Numpy) data types into (default) PostgreSQL data types.
2. Edit data types on specific columns (especially those of type `object`) taking into consideration:
   - the length of data for any `char` or `varchar` columns
      - If values are always the same length, use `char`; if they vary, use `varchar`.
   - <s>`UNIQUE` constraints</s>
      - \[We already examined columns that might potentially have a `UNIQUE` constraint and determined that none do.\]
   - `NOT NULL` constraints
   - `FOREIGN KEY` columns that will `REFERENCE` a column from the `airports` table
3. Generate a `CREATE TABLE` string.

In [52]:
# Dictionary to translate Numpy data types into default PostgreSQL data types
type_repl = {
    'bool':'boolean',
    'float64':'numeric',
    'int64':'integer',
    'object':'text'
}

In [53]:
# One row per `faw_df` column: its name and the default PostgreSQL type that
# `type_repl` gives for its dtype
col_defs = pd.DataFrame({
    'column': list(faw_df.dtypes.index),
    'data_type': [type_repl.get(dt.name) for dt in faw_df.dtypes.values]
})

col_defs.head()

Unnamed: 0,column,data_type
0,carrier_code,text
1,flight_number,integer
2,origin_airport,text
3,destination_airport,text
4,flight_date,text


In [54]:
# Examine columns of type `object`
faw_df.select_dtypes('object').columns.tolist()

['carrier_code',
 'origin_airport',
 'destination_airport',
 'flight_date',
 'tail_number',
 'scheduled_departure_dt',
 'scheduled_arrival_dt',
 'actual_departure_dt',
 'actual_arrival_dt']

- `carrier_code`, `origin_airport`, `destination_airport`, and `tail_number` should have either `char` or `varchar` types (depending on the lengths of their respective values).
  - Although we will be uploading `carrier_code` and `tail_number` into the database, they are for identification purposes, only; they will not be features for the machine-learning model to come later.
  - <s>`origin_airport` and `destination_airport` will serve as `FOREIGN KEY`s that will `REFERENCE` the `iata_code` column from the `airports` table.</s> IATA Codes are strictly three letters, and so the columns should have `char(3)` data types.
    - \[Prior analysis indicated that there are flights in the flights-and-weather data to/from airports for which information does not exist in the airports data. Therefore, the `FOREIGN KEY CONSTRAINT` will not be implemented, here.\]
- `flight_date` should have a `date` data type.
- All columns ending in `_dt` should have `timestamp` data types (PostgreSQL's date-and-time type).
  - Similar to some previously mentioned columns, while `actual_departure_dt` and `actual_arrival_dt` *can* be stored in the database, they absolutely should ***not*** be used as features for the machine learning model, because their presence or absence *defines* what it means for a flight to be cancelled, which is exactly what the model is intended to predict.

In [55]:
for col in ['carrier_code','origin_airport','destination_airport','tail_number']:
    col_length = [len(val) for val in faw_df[col].dropna()]
    print(f"{col} length: ({min(col_length)},{max(col_length)})")
    
del col_length

carrier_code length: (2,2)
origin_airport length: (3,3)
destination_airport length: (3,3)
tail_number length: (5,6)


In [56]:
# Column-specific PostgreSQL types that replace the defaults
change_list = {
    'carrier_code':'char(2)',
    'origin_airport':'char(3)',
    'destination_airport':'char(3)',
    'flight_date':'date',
    'tail_number':'varchar(6)'
}

# Assign through a single `.loc[rows, column]` call. The chained form
# (`col_defs.data_type.loc[...] = ...`) writes to an intermediate Series and is
# not guaranteed to update `col_defs` (under Copy-on-Write it never does).
for col, pg_type in change_list.items():
    col_defs.loc[col_defs.column == col, 'data_type'] = pg_type

In [57]:
# Columns whose names end in `_dt` hold date-times. Assign with one
# `.loc[rows, column]` call; chained assignment through `col_defs.data_type`
# is not guaranteed to modify `col_defs`.
col_defs.loc[col_defs.column.str.endswith('_dt'), 'data_type'] = 'timestamp'

In [58]:
# Set columns to `NOT NULL` if the columns have no missing values in the source
# data and also are not already designated as `NOT NULL`
# (the second condition keeps the suffix from being added twice on a re-run)

nn = (
    (faw_df.isna().sum().values == 0)
    &
    ~col_defs.data_type.str.contains(' NOT NULL')
)

# Single `.loc[rows, column]` assignment; the chained form
# (`col_defs.data_type.loc[nn] = ...`) is not guaranteed to update `col_defs`.
col_defs.loc[nn, 'data_type'] = col_defs.loc[nn, 'data_type'] + ' NOT NULL'

In [59]:
col_defs

Unnamed: 0,column,data_type
0,carrier_code,char(2) NOT NULL
1,flight_number,integer NOT NULL
2,origin_airport,char(3) NOT NULL
3,destination_airport,char(3) NOT NULL
4,flight_date,date NOT NULL
5,scheduled_elapsed_time,integer NOT NULL
6,tail_number,varchar(6)
7,departure_delay,integer NOT NULL
8,arrival_delay,integer NOT NULL
9,delay_carrier,integer NOT NULL


Create the `flights_and_weather` table-creation string by:
1. concatenating `col_defs` values across rows (joined by ` `)
2. concatenating those rows (joined by `,\n    `)
3. inserting the result in between the appropriate table-creation text

In [60]:
# Assemble the statement: a fixed header with the `id` primary key, then one
# "<column> <type>" line per row of `col_defs`, then the closing parenthesis
faw_create = (
    'CREATE TABLE IF NOT EXISTS flights_and_weather (\n    id integer PRIMARY KEY,\n    '
    + ',\n    '.join(' '.join(row) for row in col_defs.itertuples(index=False))
    + '\n);'
)

print(faw_create)

CREATE TABLE IF NOT EXISTS flights_and_weather (
    id integer PRIMARY KEY,
    carrier_code char(2) NOT NULL,
    flight_number integer NOT NULL,
    origin_airport char(3) NOT NULL,
    destination_airport char(3) NOT NULL,
    flight_date date NOT NULL,
    scheduled_elapsed_time integer NOT NULL,
    tail_number varchar(6),
    departure_delay integer NOT NULL,
    arrival_delay integer NOT NULL,
    delay_carrier integer NOT NULL,
    delay_weather integer NOT NULL,
    delay_national_aviation_system integer NOT NULL,
    delay_security integer NOT NULL,
    delay_late_aircarft_arrival integer NOT NULL,
    cancelled boolean NOT NULL,
    scheduled_departure_dt timestamp NOT NULL,
    scheduled_arrival_dt timestamp NOT NULL,
    actual_departure_dt timestamp,
    actual_arrival_dt timestamp,
    station_x numeric,
    hourlydrybulbtemperature_x numeric,
    hourlyprecipitation_x numeric,
    hourlystationpressure_x numeric,
    hourlyvisibility_x numeric,
    hourlywindspeed_x nu

## Some Cleanup

In [61]:
# Free up variables that won't be used later
for _var in [
    'change_list',
    'col_defs',
    'type_repl',
    'both_aps_known',
    'faw_unk_airports',
    'faw_airports',  # the comma belongs outside the quotes; inside, this entry fused with the next one
    'mult_cols',
    'mult_ix',
    'arrived',
    'departed',
    'ix_drop_list',
    'dup_iata_nc_rows',
    'all_null_rows',
    'dup_iata_rows',
    'dn_info',
    'c_cols',
    'nc_cols',
    'npk_cols',
    'int_cols',
    'airports_df_details',
    'airports_columns',
    'airports_tuples',
    'process_airport',
    'gadb_text',
    'gadb'
]:
    # Remove the name if it exists and do nothing otherwise; this avoids
    # building a `del` statement as text and running it through `exec`
    globals().pop(_var, None)

## Create Database Tables

In order to connect to the database, first, make sure you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.

**Note:** because the `flights_and_weather` table has columns that reference columns from the `airports` table, the `airports` table must be created *before* the `flights_and_weather` table. Likewise, if you wish to drop the tables, the `flights_and_weather` table must be dropped *before* attempting to drop the `airports` table (unless `DROP CASCADE` is used).

### Install [Psycopg2](https://pypi.org/project/psycopg2/)

If you do not already have Psycopg2 (and its binary extension) installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use Psycopg2 can be found in its [documentation](https://www.psycopg.org/docs/).

### Install [SQLAlchemy](https://www.sqlalchemy.org/)

If you do not already have SQLAlchemy installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use SQLAlchemy can be found in its [documentation](https://docs.sqlalchemy.org/en/14/).

In [62]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [63]:
# Prompt for the database password; `getpass` does not echo what is typed
password = getpass('Enter database password')

Enter database password········


In [64]:
# Build the connection URL from its parts instead of formatting a string:
# `URL.create` escapes special characters (e.g. `@`, `/`, `:`) in the password,
# which would otherwise corrupt a hand-assembled URL. The repr of the resulting
# URL object also masks the password.
db_string = db.engine.URL.create(
    drivername='postgresql+psycopg2',
    username=cfg.username,
    password=password,
    host=cfg.hostname,
    port=cfg.port,
    database=cfg.database
)

In [65]:
# Create the SQLAlchemy engine using the 2.0-style ("future") API.
# Add `echo=True` to log every emitted SQL statement while debugging.
engine = db.create_engine(db_string, future=True)

In [66]:
# Start from a clean slate: drop both tables if they exist.
# `flights_and_weather` goes first because it references `airports`.
with engine.begin() as conn:
    for drop_statement in (
        'DROP TABLE IF EXISTS flights_and_weather;',
        'DROP TABLE IF EXISTS airports;',
    ):
        conn.execute(db.text(drop_statement))
print('Tables dropped.')

Tables dropped.


In [67]:
# Create the database tables, each in its own transaction.
# `airports` comes first because `flights_and_weather` references it.
for create_statement in (cfg.ap_create, faw_create):
    with engine.begin() as conn:
        conn.execute(db.text(create_statement))
print('Done.')

Done.


#### Recommendation

Check the database (via pgAdmin or some other means) to make sure that the tables exist as expected.

## Pre-Upload Checks

In [68]:
# Metadata container that the reflected table definitions below are registered in
db_meta = db.MetaData()

In [69]:
# Reflect both tables from the live database into `db_meta`
airports_table, faw_table = (
    db.Table(table_name, db_meta, autoload_with=engine)
    for table_name in ('airports', 'flights_and_weather')
)

### Airport Data

In [70]:
# The reflected `airports` table and `airports_df` should have equally many columns
len(airports_table.columns) == len(airports_df.columns)

True

In [71]:
# Verify the `airports_table` column definitions (name, type, key/nullability)
# as reflected from the database
airports_table.columns.values()

[Column('icao_code', VARCHAR(length=4), table=<airports>, primary_key=True, nullable=False),
 Column('iata_code', CHAR(length=3), table=<airports>),
 Column('name', TEXT(), table=<airports>),
 Column('city', TEXT(), table=<airports>),
 Column('country', TEXT(), table=<airports>),
 Column('lat_deg', INTEGER(), table=<airports>),
 Column('lat_min', INTEGER(), table=<airports>),
 Column('lat_sec', INTEGER(), table=<airports>),
 Column('lat_dir', CHAR(length=1), table=<airports>),
 Column('lon_deg', INTEGER(), table=<airports>),
 Column('lon_min', INTEGER(), table=<airports>),
 Column('lon_sec', INTEGER(), table=<airports>),
 Column('lon_dir', CHAR(length=1), table=<airports>),
 Column('altitude', INTEGER(), table=<airports>),
 Column('lat_decimal', NUMERIC(), table=<airports>),
 Column('lon_decimal', NUMERIC(), table=<airports>)]

In [72]:
# Preview the parameterized INSERT statement SQLAlchemy generates for `airports_table`
print(db.insert(airports_table))

INSERT INTO airports (icao_code, iata_code, name, city, country, lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir, altitude, lat_decimal, lon_decimal) VALUES (:icao_code, :iata_code, :name, :city, :country, :lat_deg, :lat_min, :lat_sec, :lat_dir, :lon_deg, :lon_min, :lon_sec, :lon_dir, :altitude, :lat_decimal, :lon_decimal)


In [73]:
# Print (without executing) the EXISTS query used later to test whether
# the `airports` table has any rows
print(db.exists().select_from(airports_table).select())

SELECT EXISTS (SELECT * 
FROM airports) AS anon_1


### Flight and Weather Data

In [74]:
# The reflected `flights_and_weather` table should have one column more than
# `faw_df`: the dataframe keeps `id` as its index rather than as a column
len(faw_table.columns) == len(faw_df.columns) + 1

True

In [75]:
# Verify the `faw_table` column definitions (name, type, key/nullability)
# as reflected from the database
faw_table.columns.values()

[Column('id', INTEGER(), table=<flights_and_weather>, primary_key=True, nullable=False),
 Column('carrier_code', CHAR(length=2), table=<flights_and_weather>, nullable=False),
 Column('flight_number', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('origin_airport', CHAR(length=3), table=<flights_and_weather>, nullable=False),
 Column('destination_airport', CHAR(length=3), table=<flights_and_weather>, nullable=False),
 Column('flight_date', DATE(), table=<flights_and_weather>, nullable=False),
 Column('scheduled_elapsed_time', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('tail_number', VARCHAR(length=6), table=<flights_and_weather>),
 Column('departure_delay', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('arrival_delay', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('delay_carrier', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('delay_weather', INTEGER(), table=<flights_and_weather>, nullable=

In [76]:
# Preview the parameterized INSERT statement SQLAlchemy generates for `faw_table`
print(db.insert(faw_table))

INSERT INTO flights_and_weather (id, carrier_code, flight_number, origin_airport, destination_airport, flight_date, scheduled_elapsed_time, tail_number, departure_delay, arrival_delay, delay_carrier, delay_weather, delay_national_aviation_system, delay_security, delay_late_aircarft_arrival, cancelled, scheduled_departure_dt, scheduled_arrival_dt, actual_departure_dt, actual_arrival_dt, station_x, hourlydrybulbtemperature_x, hourlyprecipitation_x, hourlystationpressure_x, hourlyvisibility_x, hourlywindspeed_x, station_y, hourlydrybulbtemperature_y, hourlyprecipitation_y, hourlystationpressure_y, hourlyvisibility_y, hourlywindspeed_y) VALUES (:id, :carrier_code, :flight_number, :origin_airport, :destination_airport, :flight_date, :scheduled_elapsed_time, :tail_number, :departure_delay, :arrival_delay, :delay_carrier, :delay_weather, :delay_national_aviation_system, :delay_security, :delay_late_aircarft_arrival, :cancelled, :scheduled_departure_dt, :scheduled_arrival_dt, :actual_departure

In [77]:
# Print (without executing) an EXISTS query that tests whether the
# `flights_and_weather` table has any rows
print(db.exists().select_from(faw_table).select())

SELECT EXISTS (SELECT * 
FROM flights_and_weather) AS anon_1


## Upload Data

In [78]:
import csv
from io import StringIO

In [79]:
# Alternative to_sql() *method* for DBs that support COPY FROM
# From <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method>
def psql_insert_copy(table, conn, keys, data_iter):
    """
    Bulk-load rows into a table with ``COPY ... FROM STDIN WITH CSV``.

    Meant to be passed as the ``method`` argument of ``DataFrame.to_sql``.
    The rows are serialized into an in-memory CSV buffer, which is then
    handed to the DBAPI cursor's ``copy_expert`` in one call.

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
        Target table; its ``schema`` (when set) and ``name`` are used.
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
        Its ``connection`` attribute supplies the DBAPI connection.
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # The raw DBAPI connection provides the cursor that performs the COPY
    with conn.connection.cursor() as cursor:
        # Serialize every row into an in-memory CSV document
        csv_buffer = StringIO()
        csv.writer(csv_buffer).writerows(data_iter)
        csv_buffer.seek(0)

        # Double-quote each column name; prefix the table with its schema if set
        quoted_columns = ', '.join(f'"{key}"' for key in keys)
        qualified_name = (
            f'{table.schema}.{table.name}' if table.schema else table.name
        )

        copy_statement = (
            f'COPY {qualified_name} ({quoted_columns}) FROM STDIN WITH CSV'
        )
        cursor.copy_expert(sql=copy_statement, file=csv_buffer)

### Airport Data

In [80]:
# Preview the first three rows of the airport data before uploading
airports_df.head(3)

Unnamed: 0,icao_code,iata_code,name,city,country,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
0,AYGA,GKA,GOROKA,GOROKA,PAPUA NEW GUINEA,6,4,54,S,145,23,30,E,1610,-6.082,145.392
2,AYMD,MAG,MADANG,MADANG,PAPUA NEW GUINEA,5,12,25,S,145,47,19,E,7,-5.207,145.789
3,AYMH,HGU,MOUNT HAGEN,MOUNT HAGEN,PAPUA NEW GUINEA,5,49,34,S,144,17,46,E,1643,-5.826,144.296


In [81]:
# Upload the airport data, but only if the `airports` table has no rows yet
with engine.begin() as conn:
    ap_is_empty = not conn.execute(
        db.exists().select_from(airports_table).select()
    ).scalar()

if not ap_is_empty:
    print('`airports` table already populated.')
else:
    airports_df.to_sql(
        name='airports',
        con=engine,
        if_exists='append',
        index=False,
        method=psql_insert_copy
    )
    print('Done.')

Done.


In [82]:
# Show the dataframe's dimensions (e.g., to compare with the row count of the database table)
print('As a reminder, `airports_df` has:')
print_shape(airports_df)

As a reminder, `airports_df` has:
9,248 rows × 16 columns


### Flight and Weather Data

In [83]:
# Release `airports_df` to free some memory; a no-op if it is already gone
globals().pop('airports_df', None)

In [84]:
# Collect the ids of any rows already stored in the `flights_and_weather` table
with engine.begin() as conn:
    existing_faw_ids = conn.execute(db.select(faw_table.c.id)).scalars().all()

existing_faw_ids

[]

In [85]:
# Drop existing IDs from the upload dataframe so they are not uploaded twice.
# `errors='ignore'` skips IDs that are not in the dataframe's index, and
# `inplace=True` modifies `faw_df` itself rather than returning a new frame.
faw_df.drop(index=existing_faw_ids,errors='ignore',inplace=True)

In [86]:
# Preview the first rows of the flight and weather data before uploading
faw_df.head()

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,...,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,...,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,...,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,...,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,...,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,...,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0


In [87]:
# Upload the flight and weather data via COPY, 100,000 rows per chunk.
# `index=True` writes the dataframe index as a column of the table.
faw_df.to_sql(
    name='flights_and_weather',
    con=engine,
    if_exists='append',
    index=True,
    chunksize=100_000,
    method=psql_insert_copy,
)
print('Done.')

Done.


In [88]:
# Show the dataframe's dimensions (e.g., to compare with the row count of the database table)
print('As a reminder, `faw_df` has:')
print_shape(faw_df)

As a reminder, `faw_df` has:
5,468,069 rows × 31 columns


#### Recommendation

Check the database (via pgAdmin or some other means) to make sure that the tables exist and are populated as expected.