# Process data

### Important

In order to process the airport, flight, and weather data, you must already have it available. If you have not already acquired the data, open notebook `01_get_data` and run all its cells.

In [1]:
import os
import pandas as pd
import re  # regular expressions

In [2]:
# Make `resources` the working directory, and set `data_dir`
os.chdir(os.path.join('..','resources'))
data_dir = os.path.join('.','data')

In [3]:
os.listdir(data_dir)

['05-2019.csv',
 '06-2019.csv',
 '07-2019.csv',
 '08-2019.csv',
 '09-2019.csv',
 '10-2019.csv',
 '11-2019.csv',
 '12-2019.csv',
 'GlobalAirportDatabase.txt',
 'GlobalAirportDatabase.zip',
 'historical-flight-and-weather-data.zip',
 'readme.txt']

In [4]:
primary_source_files = list(filter(
                            lambda item: re.fullmatch(
                                '\d{2}\-2019\.csv',
                                item,
                                flags=re.I
                            ) is not None,
                            os.listdir(data_dir)
                        ))

# secondary_source_file = 'gadb_postgresql_create_airports_table.sql'
secondary_source_file = 'GlobalAirportDatabase.txt'

## Primary Data Set

In [5]:
primary_df = pd.concat([
    pd.read_csv(os.path.join(data_dir,filename))
    for filename in primary_source_files
])

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,512,903 rows × 35 columns


In [6]:
primary_df.head()

Unnamed: 0,carrier_code,flight_number,origin_airport,destination_airport,date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,...,HourlyPrecipitation_x,HourlyStationPressure_x,HourlyVisibility_x,HourlyWindSpeed_x,STATION_y,HourlyDryBulbTemperature_y,HourlyPrecipitation_y,HourlyStationPressure_y,HourlyVisibility_y,HourlyWindSpeed_y
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,...,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,...,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,...,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,...,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,...,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0


In [7]:
# Get data types and number of null values for each column
pd.concat(
    [
        primary_df.dtypes,
        primary_df.isna().sum()
    ],
    axis=1,
    keys=['data_type','null_count']
)

Unnamed: 0,data_type,null_count
carrier_code,object,0
flight_number,int64,0
origin_airport,object,0
destination_airport,object,0
date,object,0
scheduled_elapsed_time,int64,0
tail_number,object,13556
departure_delay,int64,0
arrival_delay,int64,0
delay_carrier,int64,0


**Note:** Eventually, `cancelled_code` will be our target column for a machine-learning algorithm.  
Because the column in the source data is `cancelled_code` and not `canceled_code`, the double-l spelling will be used in this work.

In [8]:
# Examine the `carrier_code` column
primary_df.carrier_code.value_counts()

AA    1438798
DL    1207720
UA    1070050
WN     918320
AS     304198
B6     199314
NK     142041
F9      97482
G4      71728
HA      63252
Name: carrier_code, dtype: int64

### What do the codes mean?

According to the United States Department of Transportation Bureau of Transportation Statistics Airlines and Airports data, [Airline Codes](https://www.bts.gov/topics/airlines-and-airports/airline-codes) document:

**AIRLINE CODES:**
- `AA`-American Airlines Inc.
- `AS`-Alaska Airlines Inc.
- `B6`-JetBlue Airways
- `DL`-Delta Air Lines Inc.
- `F9`-Frontier Airlines Inc.
- `G4`-Allegiant Air
- `HA`-Hawaiian Airlines Inc.
- `NK`-Spirit Air Lines
- `UA`-United Air Lines Inc.
- `WN`-Southwest Airlines Co.

In [9]:
# Examine the `flight_number` column
# Are they unique?
primary_df.flight_number.duplicated(keep=False).sum()

5512864

They very much are not unique.

In [10]:
# What about the combination of `carrier_code` and `flight_number`?
primary_df[['carrier_code','flight_number']].duplicated(keep=False).sum()

5512259

Also not unique.

In [11]:
# In order to prevent the `date` column from being confused with the `date` data
# type or any `date` functions, rename the `date` column to `flight_date`
primary_df.rename(columns={'date':'flight_date'}, errors='ignore', inplace=True)

In [12]:
# Combine `year`, `month`, and `day` into a single `string` in the same format as the `date` column
# and check for equality against the actual date column.

# Check only a few rows
(
    primary_df[['year','month','day']][:3]
    .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
    .equals(
        primary_df.flight_date[:3]
    )
)

# Check all the rows
# (
#     primary_df[['year','month','day']]
#     .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
#     .equals(
#         primary_df.flight_date
#     )
# )

True

**Note:** The all-rows check, above, is commented out because it takes a long time, but when run, it does show equality between the entire `flight_date` series and the combined `year`-`month`-`day` series.

Because `year`, `month`, and `day` were originally stored as `int64` values, this also tells us that all the values in `flight_date` are properly formatted (no leading or trailing spaces, *etc*.).

The data is therefore redundant, and we don't need both.

`weekday` is likewise redundant, since it can be calculated from `flight_date`.

In [13]:
# Drop redundant date columns
primary_df.drop(
    columns=['year','month','day','weekday'],
    errors='ignore',
    inplace=True
)

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,512,903 rows × 31 columns


In [14]:
# Examine `cancelled_code` column
primary_df.cancelled_code.value_counts()

N    5426150
B      41919
A      23451
C      21370
D         13
Name: cancelled_code, dtype: int64

### What do the codes mean?

According to the United States Department of Transportation Bureau of Transportation Statistics Airlines and Airports data, [Number 14 - On-Time Reporting](https://www.bts.gov/topics/airlines-and-airports/number-14-time-reporting):

**CANCELLATION CODES**
- `A`-Carrier Caused
- `B`-Weather
- `C`-National Aviation System
- `D`-Security

\[`N` is not on the list and represents "None" or "Not cancelled".\]

We are only interested in flights that were cancelled due to weather, so we will keep only rows with `cancelled_code` `B` or `N`.

In [15]:
primary_df = primary_df.loc[primary_df.cancelled_code.isin(['B','N'])]

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,468,069 rows × 31 columns


In [16]:
# Check that there are now only `B` and `N` values
primary_df.cancelled_code.value_counts()

N    5426150
B      41919
Name: cancelled_code, dtype: int64

In [17]:
# Convert `cancelled_code` column into boolean `cancelled` column, where
# `B` = True (*was* cancelled) and `N` = False (*was not* cancelled)

try:
    print("Converting cancelled_code column to boolean… ", end="")
    primary_df.cancelled_code = (primary_df.cancelled_code == 'B')
    primary_df.rename(columns={'cancelled_code':'cancelled'},inplace=True)
    print()
except AttributeError:
    print("Column has already been processed.")

primary_df.cancelled.value_counts()

Converting cancelled_code column to boolean… 


False    5426150
True       41919
Name: cancelled, dtype: int64

In [18]:
# How many flights were cancelled|not cancelled vs. how many departed|arrived

departed = ~primary_df.actual_departure_dt.isna()
arrived = ~primary_df.actual_arrival_dt.isna()

mult_ix = pd.MultiIndex.from_tuples([
    ('departed',True),
    ('departed',False),
    ('arrived',True),
    ('arrived',False),
])

mult_cols = pd.MultiIndex.from_tuples([
    ('cancelled',False),
    ('cancelled',True)
])

pd.DataFrame(
    data=[
        [
            primary_df.loc[(~primary_df.cancelled) & (departed)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (departed)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (~departed)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (~departed)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (arrived)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (arrived)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (~arrived)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (~arrived)].shape[0]
        ]
    ],
    index=mult_ix,
    columns=mult_cols
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cancelled,cancelled
Unnamed: 0_level_1,Unnamed: 1_level_1,False,True
departed,True,5426150,1854
departed,False,0,40065
arrived,True,5424261,0
arrived,False,1889,41919


In [19]:
# Does anything stand out for cancelled flights that still departed?
primary_df.loc[primary_df.cancelled & departed].head().transpose()

Unnamed: 0,16715,17002,17815,18640,18750
carrier_code,AA,AA,AA,AA,AA
flight_number,1393,346,2761,1271,5821
origin_airport,OKC,DFW,DFW,IAH,DFW
destination_airport,DFW,MSY,STL,DFW,ELP
flight_date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,69,85,105,75,104
tail_number,N751UW,N357PV,N971TW,N898NN,N243LR
departure_delay,176,83,111,113,28
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


In [20]:
# What about non-cancelled flights that didn't arrive?
primary_df.loc[~primary_df.cancelled & ~arrived].head().transpose()

Unnamed: 0,5154,12535,13657,16277,17368
carrier_code,AS,AA,WN,WN,UA
flight_number,55,2028,2272,2212,6296
origin_airport,SCC,MEM,PDX,ABQ,IAD
destination_airport,BRW,DFW,DAL,DAL,DFW
flight_date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,45,99,230,105,209
tail_number,N609AS,N749US,N931WN,N788SA,N87353
departure_delay,29,398,-2,-5,212
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


### Establish `flights_and_weather` table definition

Steps:
1. Translate Pandas (*i.e.*, Numpy) data types into (default) PostgreSQL data types.
2. Edit data types on specific columns (especially those of type `object`) taking into consideration:
   - the length of data for any `char` or `varchar` columns
      - If values are always the same length, use `char`; if they vary, use `varchar`.
   - <s>`UNIQUE` constraints</s>
      - \[We already examined columns that might potentially have a `UNIQUE` constraint and determined that none do.\]
   - `NOT NULL` constraints
   - `FOREIGN KEY` columns that will `REFERENCE` a column from the `airports` table
6. Generate a CREATE TABLE string.

In [21]:
# Dictionary to translate Numpy data types into default PostgreSQL data types
type_repl = {
    'bool':'boolean',
    'float64':'numeric',
    'int64':'integer',
    'object':'text'
}

In [22]:
col_defs = pd.DataFrame(
    columns=['column','data_type'],
    data=zip(primary_df.dtypes.index, map(type_repl.get, [dt.name for dt in primary_df.dtypes.values]))
)

col_defs.head()

Unnamed: 0,column,data_type
0,carrier_code,text
1,flight_number,integer
2,origin_airport,text
3,destination_airport,text
4,flight_date,text


In [23]:
# Examine columns of type `object`
primary_df.select_dtypes('object').columns.tolist()

['carrier_code',
 'origin_airport',
 'destination_airport',
 'flight_date',
 'tail_number',
 'scheduled_departure_dt',
 'scheduled_arrival_dt',
 'actual_departure_dt',
 'actual_arrival_dt']

- `carrier_code`, `origin_airport`, `destination_airport`, and `tail_number` should have either `char` or `varchar` types (depending on the lengths of their respective values).
  - Although we will be uploading `carrier_code` and `tail_number` into the database, they are for identification purposes, only; they will not be features for the machine-learning model to come later.
  - `origin_airport` and `destination_airport` will serve as `FOREIGN KEY`s that will `REFERENCE` the `iata_code` column from the `airports` table. IATA Codes are strictly three letters, and so the columns should have `char(3)` data types.
- `flight_date` should have a `date` data type.
- All columns ending in `_dt` should have `datetime` data types.
  - Similar to some previously mentioned columns, whle `actual_departure_dt` and `actual_arrival_dt` *can* be stored in the database, they absolutely should ***not*** be used as features for the machine learning model, because their presence or absence *defines* what it means for a flight to be cancelled, which is exactly what the model is intended to predict.

In [24]:
# Function to print the shortest and longest values in a column
def min_max_column_length(test_column):
    col_lengths = [len(val) for val in primary_df[test_column].dropna()]
    print(f"{test_column} length: ({min(col_lengths)},{max(col_lengths)})")

In [25]:
min_max_column_length('carrier_code')
min_max_column_length('tail_number')

carrier_code length: (2,2)
tail_number length: (5,6)


In [26]:
change_list = {
    'carrier_code':'char(2)',
    'origin_airport':'char(3)',
    'destination_airport':'char(3)',
    'flight_date':'date',
    'tail_number':'varchar(6)'
}

for col in change_list.keys():
    col_defs.data_type.loc[col_defs.column == col] = change_list[col]

In [27]:
col_defs.data_type.loc[col_defs.column.str.contains('_dt')] = 'timestamp'

In [28]:
# Set columns to `NOT NULL` if the columns…
# 1) have no missing values in the source data
# 2) do not have names ending in '_airport' (those will be handled later as FOREIGN KEYS)
# 3) are not already designated as `NOT NULL`

nn = (
    (primary_df.isna().sum().values == 0)
    &
    ~col_defs.column.str.contains('_airport')
    &
    ~col_defs.data_type.str.contains(' NOT NULL')
)

col_defs.data_type.loc[nn] = (col_defs.data_type.loc[nn] + ' NOT NULL')

In [29]:
# Similar rules for FOREIGN KEY columns

fk = (col_defs.column.str.contains('_airport') & ~col_defs.data_type.str.contains(' NOT NULL'))

col_defs.data_type.loc[fk] = (col_defs.data_type.loc[fk] + ' NOT NULL REFERENCES airports (iata_code)')

In [30]:
col_defs

Unnamed: 0,column,data_type
0,carrier_code,char(2) NOT NULL
1,flight_number,integer NOT NULL
2,origin_airport,char(3) NOT NULL REFERENCES airports (iata_code)
3,destination_airport,char(3) NOT NULL REFERENCES airports (iata_code)
4,flight_date,date NOT NULL
5,scheduled_elapsed_time,integer NOT NULL
6,tail_number,varchar(6)
7,departure_delay,integer NOT NULL
8,arrival_delay,integer NOT NULL
9,delay_carrier,integer NOT NULL


Create the `flights_and_weather` table-creation string by:
1. concatenating `col_def` values across rows (joined by ` `)
2. concatenating those rows (joined by `,\n    `)
3. inserting the result in between the appropriate table-creation text

In [31]:
faw_create = (
    'CREATE TABLE IF NOT EXISTS flights_and_weather (\n    '
    + col_defs.apply(lambda x: ' '.join(x), axis=1).str.cat(sep=',\n    ')
    + '\n);'
)

print(faw_create)

CREATE TABLE IF NOT EXISTS flights_and_weather (
    carrier_code char(2) NOT NULL,
    flight_number integer NOT NULL,
    origin_airport char(3) NOT NULL REFERENCES airports (iata_code),
    destination_airport char(3) NOT NULL REFERENCES airports (iata_code),
    flight_date date NOT NULL,
    scheduled_elapsed_time integer NOT NULL,
    tail_number varchar(6),
    departure_delay integer NOT NULL,
    arrival_delay integer NOT NULL,
    delay_carrier integer NOT NULL,
    delay_weather integer NOT NULL,
    delay_national_aviation_system integer NOT NULL,
    delay_security integer NOT NULL,
    delay_late_aircarft_arrival integer NOT NULL,
    cancelled boolean NOT NULL,
    scheduled_departure_dt timestamp NOT NULL,
    scheduled_arrival_dt timestamp NOT NULL,
    actual_departure_dt timestamp,
    actual_arrival_dt timestamp,
    STATION_x numeric,
    HourlyDryBulbTemperature_x numeric,
    HourlyPrecipitation_x numeric,
    HourlyStationPressure_x numeric,
    HourlyVisibility

## Secondary Data Set

In [32]:
# Remove leading and trailing whitespace from the Global Airport Database text
# (with `.strip()`), and assign it to a variable.
with open(os.path.join(data_dir,secondary_source_file)) as gadb:
    gadb_text = gadb.read().strip()

In [33]:
# Examine some of the data to see what it looks like
gadb_text[:1000]

'AYGA:GKA:GOROKA:GOROKA:PAPUA NEW GUINEA:006:004:054:S:145:023:030:E:01610:-6.082:145.392\nAYLA:LAE:N/A:LAE:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYMD:MAG:MADANG:MADANG:PAPUA NEW GUINEA:005:012:025:S:145:047:019:E:00007:-5.207:145.789\nAYMH:HGU:MOUNT HAGEN:MOUNT HAGEN:PAPUA NEW GUINEA:005:049:034:S:144:017:046:E:01643:-5.826:144.296\nAYNZ:LAE:NADZAB:NADZAB:PAPUA NEW GUINEA:006:034:011:S:146:043:034:E:00073:-6.570:146.726\nAYPY:POM:PORT MORESBY JACKSONS INTERNATIONAL:PORT MORESBY:PAPUA NEW GUINEA:009:026:036:S:147:013:012:E:00045:-9.443:147.220\nAYRB:RAB:N/A:RABAUL:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYWK:WWK:WEWAK INTERNATIONAL:WEWAK:PAPUA NEW GUINEA:003:035:001:S:143:040:009:E:00006:-3.584:143.669\nBGAM:N/A:N/A:ANGMAGSSALIK:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAS:N/A:N/A:ANGISSOQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAT:N/A:N/A:APUTITEQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.0

In [34]:
# And again at the end of the data
gadb_text[-1000:]

'E:00139:45.623:126.250\nZYHE:N/A:N/A:HEIHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJD:N/A:N/A:JAGDAQI:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJM:N/A:JIAMUSI:JIAMUSI:CHINA:046:050:036:N:130:027:055:E:00080:46.843:130.465\nZYMD:N/A:HAILANG:MUDANJIANG:CHINA:044:031:026:N:129:034:008:E:00270:44.524:129.569\nZYNJ:N/A:N/A:NENJIANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYQQ:N/A:N/A:QIQIHAR:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYRD:CGQ:N/A:CHANGCHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTH:N/A:N/A:TAHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTK:N/A:N/A:SHENYANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTL:DLC:ZHOUSHUIZI:DALIAN:CHINA:038:057:056:N:121:032:018:E:00033:38.966:121.538\nZYXC:N/A:N/A:XIANCHENG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYC:N/A:N/A:YICHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYJ:N/A:YANJI:YANJI:CHINA:042:052:054:N:129:026:054:E:00191:

Rows are separated by newline (`\n`) characters; columns are separated by colons.  
Missing values are indicated in a number of ways, depending on the data type.

According to [the source](https://www.partow.net/miscellaneous/airportdatabase/index.html):

> **Note:** Some tuples may have missing or otherwise unaviable pieces of data. In the event the values are not present, given the data type a default value will be used as follows:
> 
> - String : `N/A`
> - Integer: `0`
> - Char : `U`
> - Floating Point: `0.0`

We will split the data—first by newlines, then by colons—and convert values as appropriate.

In [35]:
# Create a function to convert data types
def process_airport(ap):
    # Convert integers
    for i in (5,6,7,9,10,11,13):
        ap[i] = int(ap[i])
    # Convert floats
    for i in (14,15):
        ap[i] = float(ap[i])
    # Convert missing values (0.0 will be coerced to 0, in this case)
    ap = tuple(map(lambda elem: None if elem in [0,'U','N/A'] else elem, ap))
    return ap

In [36]:
airports = [process_airport(ap.split(':')) for ap in gadb_text.split('\n')]

In [37]:
len(airports)

9300

In [38]:
for i in range (5):
    print(airports[i])

('AYGA', 'GKA', 'GOROKA', 'GOROKA', 'PAPUA NEW GUINEA', 6, 4, 54, 'S', 145, 23, 30, 'E', 1610, -6.082, 145.392)
('AYLA', 'LAE', None, 'LAE', 'PAPUA NEW GUINEA', None, None, None, None, None, None, None, None, None, None, None)
('AYMD', 'MAG', 'MADANG', 'MADANG', 'PAPUA NEW GUINEA', 5, 12, 25, 'S', 145, 47, 19, 'E', 7, -5.207, 145.789)
('AYMH', 'HGU', 'MOUNT HAGEN', 'MOUNT HAGEN', 'PAPUA NEW GUINEA', 5, 49, 34, 'S', 144, 17, 46, 'E', 1643, -5.826, 144.296)
('AYNZ', 'LAE', 'NADZAB', 'NADZAB', 'PAPUA NEW GUINEA', 6, 34, 11, 'S', 146, 43, 34, 'E', 73, -6.57, 146.726)


In [39]:
# Create dataframe
# The project is concerned only with US flights. So we'll also include only US
# airports in the data (and skip the country column as a result).

secondary_columns = [
    'icao_code',
    'iata_code',
    'name',
    'city',
    'country',
    'lat_deg',
    'lat_min',
    'lat_sec',
    'lat_dir',
    'lon_deg',
    'lon_min',
    'lon_sec',
    'lon_dir',
    'altitude',
    'lat_decimal',
    'lon_decimal'
]

secondary_df = pd.DataFrame(
    data=airports,
    columns=secondary_columns
).query('country == "USA"').drop(columns='country')

secondary_df_rows, secondary_df_cols = secondary_df.shape

print(f"{secondary_df_rows:,} rows × {secondary_df_cols:,} columns")

552 rows × 15 columns


In [40]:
secondary_df.head()

Unnamed: 0,icao_code,iata_code,name,city,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
3380,KABI,ABI,ABILENE RGNL,ABILENE,32.0,24.0,40.0,N,99.0,40.0,54.0,W,546.0,32.411,-99.682
3381,KABQ,ABQ,,ALBUQUERQUE,,,,,,,,,,,
3382,KACK,ACK,NANTUCKET MEM,NANTUCKET,41.0,15.0,10.0,N,70.0,3.0,36.0,W,15.0,41.253,-70.06
3383,KACT,ACT,WACO RGNL,WACO,31.0,36.0,40.0,N,97.0,13.0,49.0,W,158.0,31.611,-97.23
3384,KACY,ACY,ATLANTIC CITY INTERNATIONAL,ATLANTIC CITY,39.0,27.0,27.0,N,74.0,34.0,37.0,W,23.0,39.458,-74.577


In [41]:
# For each column, get data types, number of null values, and whether all (non-NULL) values in the column are unique
pd.concat(
    [
        secondary_df.dtypes,
        secondary_df.isna().sum(),
        pd.Series(
            data=[secondary_df[col].dropna().is_unique for col in secondary_df.columns],
            index=secondary_df.columns
        )
    ],
    axis=1,
    keys=['data_type','null_count','unique']
)

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,73,False
name,object,66,False
city,object,0,False
lat_deg,float64,66,False
lat_min,float64,73,False
lat_sec,float64,77,False
lat_dir,object,66,False
lon_deg,float64,66,False
lon_min,float64,73,False


`icao_code` is `UNIQUE` and `NON NULL`, and so can serve as the `PRIMARY KEY` of the `airports` database.

Expected `iata_code` to be unique, but it isn't.

In [42]:
# Examine the rows with (non-NULL) duplicate values
secondary_df.loc[~secondary_df.iata_code.isna() & secondary_df.iata_code.duplicated(keep=False)]

Unnamed: 0,icao_code,iata_code,name,city,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
3381,KABQ,ABQ,,ALBUQUERQUE,,,,,,,,,,,
3580,KIKR,ABQ,ALBUQUERQUE INTERNATIONAL SUNPORT,KIRTLAND A.F.B.,35.0,2.0,24.0,N,106.0,36.0,33.0,W,1633.0,35.04,-106.609


In [43]:
# Drop the airport at index 3381
secondary_df.drop(index=3381,errors='ignore',inplace=True)

secondary_df_rows, secondary_df_cols = secondary_df.shape

print(f"{secondary_df_rows:,} rows × {secondary_df_cols:,} columns")

551 rows × 15 columns


In [44]:
# For each column, get data types, number of null values, and whether all (non-NULL) values in the column are unique
pd.concat(
    [
        secondary_df.dtypes,
        secondary_df.isna().sum(),
        pd.Series(
            data=[secondary_df[col].dropna().is_unique for col in secondary_df.columns],
            index=secondary_df.columns
        )
    ],
    axis=1,
    keys=['data_type','null_count','unique']
)

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,73,True
name,object,65,False
city,object,0,False
lat_deg,float64,65,False
lat_min,float64,72,False
lat_sec,float64,76,False
lat_dir,object,65,False
lon_deg,float64,65,False
lon_min,float64,72,False


## Create Database Tables

In order to connect to the database, first, make sure you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.

### Install [Psycopg2](https://pypi.org/project/psycopg2/)

If you do not already have Psycopg2 (and its binary extension) installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use Psycopg2 can be found in its [documentation](https://www.psycopg.org/docs/).

### Install [SQLAlchemy](https://www.sqlalchemy.org/)

If you do not already have SQLAlchemy installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use SQLAlchemy can be found in its [documentation](https://docs.sqlalchemy.org/en/14/).

In [45]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is in `cfg` (above). However, you will have to enter your password below.

In [46]:
password = getpass('Enter database password')

Enter database password········


In [47]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [48]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

**Note:** because the `flights_and_weather` table (made from the primary dataset) has columns that reference columns from the `airports` table (made from the secondary dataset), the `airports` table must be created *before* the `flights_and_weather` table.

In [49]:
# Create the `airports` and `flights_and_weather` database tables
for table_create in (cfg.ap_create, faw_create):
    with engine.begin() as conn:    
        conn.execute(db.text(table_create))
print('Done.')

Done.


#### Recommendation

Check the database (via pgAdmin or some other means) to make sure that the tables exist as expected.

## Upload Data

In [50]:
db_meta = db.MetaData()

In [51]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
table_ap = db.Table('airports', db_meta, autoload_with=engine)
table_faw = db.Table('flights_and_weather', db_meta, autoload_with=engine)

### Upload Airport Data

In [54]:
# Check to see if `table_ap` has the same number of columns as the `secondary_df`
len(table_ap.columns.keys()) == secondary_df.columns.size

True

In [55]:
# Verify `table_ap` column definitions
table_ap.columns.values()

[Column('icao_code', VARCHAR(length=4), table=<airports>, primary_key=True, nullable=False),
 Column('iata_code', CHAR(length=3), table=<airports>),
 Column('name', TEXT(), table=<airports>),
 Column('city', TEXT(), table=<airports>),
 Column('lat_deg', INTEGER(), table=<airports>),
 Column('lat_min', INTEGER(), table=<airports>),
 Column('lat_sec', INTEGER(), table=<airports>),
 Column('lat_dir', CHAR(length=1), table=<airports>),
 Column('lon_deg', INTEGER(), table=<airports>),
 Column('lon_min', INTEGER(), table=<airports>),
 Column('lon_sec', INTEGER(), table=<airports>),
 Column('lon_dir', CHAR(length=1), table=<airports>),
 Column('altitude', INTEGER(), table=<airports>),
 Column('lat_decimal', NUMERIC(), table=<airports>),
 Column('lon_decimal', NUMERIC(), table=<airports>)]

In [56]:
print(db.insert(table_ap))

INSERT INTO airports (icao_code, iata_code, name, city, lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir, altitude, lat_decimal, lon_decimal) VALUES (:icao_code, :iata_code, :name, :city, :lat_deg, :lat_min, :lat_sec, :lat_dir, :lon_deg, :lon_min, :lon_sec, :lon_dir, :altitude, :lat_decimal, :lon_decimal)


In [58]:
# Check query to test whether `airports` table has any rows
print(db.exists().select_from(table_ap).select())

SELECT EXISTS (SELECT * 
FROM airports) AS anon_1


In [59]:
# Upload
with engine.begin() as conn:
    ap_is_empty = not conn.execute(db.exists().select_from(table_ap).select()).scalar()

if (ap_is_empty):
    secondary_df.to_sql(
        name='airports',
        con=engine,
        if_exists='append',
        index=False,
        method='multi'
    )
    print('Done.')
else:
    print('`airports` table already populated.')

Done.


### Upload Flights and Weather Data

In [60]:
# Check to see if `table_faw` has the same number of columns as `primary_df`
len(table_faw.columns.keys()) == primary_df.columns.size

True

In [61]:
# Verify `table_ap` column definitions
table_faw.columns.values()

[Column('carrier_code', CHAR(length=2), table=<flights_and_weather>, nullable=False),
 Column('flight_number', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('origin_airport', CHAR(length=3), ForeignKey('airports.iata_code'), table=<flights_and_weather>, nullable=False),
 Column('destination_airport', CHAR(length=3), ForeignKey('airports.iata_code'), table=<flights_and_weather>, nullable=False),
 Column('flight_date', DATE(), table=<flights_and_weather>, nullable=False),
 Column('scheduled_elapsed_time', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('tail_number', VARCHAR(length=6), table=<flights_and_weather>),
 Column('departure_delay', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('arrival_delay', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('delay_carrier', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('delay_weather', INTEGER(), table=<flights_and_weather>, nullable=False),
 Column('dela

In [62]:
print(db.insert(table_faw))

INSERT INTO flights_and_weather (carrier_code, flight_number, origin_airport, destination_airport, flight_date, scheduled_elapsed_time, tail_number, departure_delay, arrival_delay, delay_carrier, delay_weather, delay_national_aviation_system, delay_security, delay_late_aircarft_arrival, cancelled, scheduled_departure_dt, scheduled_arrival_dt, actual_departure_dt, actual_arrival_dt, station_x, hourlydrybulbtemperature_x, hourlyprecipitation_x, hourlystationpressure_x, hourlyvisibility_x, hourlywindspeed_x, station_y, hourlydrybulbtemperature_y, hourlyprecipitation_y, hourlystationpressure_y, hourlyvisibility_y, hourlywindspeed_y) VALUES (:carrier_code, :flight_number, :origin_airport, :destination_airport, :flight_date, :scheduled_elapsed_time, :tail_number, :departure_delay, :arrival_delay, :delay_carrier, :delay_weather, :delay_national_aviation_system, :delay_security, :delay_late_aircarft_arrival, :cancelled, :scheduled_departure_dt, :scheduled_arrival_dt, :actual_departure_dt, :act

In [63]:
# Check query to test whether `airports` table has any rows
print(db.exists().select_from(table_faw).select())

SELECT EXISTS (SELECT * 
FROM flights_and_weather) AS anon_1


In [None]:
# Upload
with engine.begin() as conn:
    faw_is_empty = not conn.execute(db.exists().select_from(table_faw).select()).scalar()

if (faw_is_empty):
    primary_df.to_sql(
        name='flights_and_weather',
        con=engine,
        if_exists='append',
        index=False,
        method='multi'
    )
    print('Done.')
else:
    print('`flights_and_weather` table already populated.')