# Process data

### Important

In order to process the airport, flight, and weather data, you must already have it available. If you have not already acquired the data, open notebook `01_get_data` and run all its cells.

In [1]:
import os
import pandas as pd

In [2]:
# Make `resources` the working directory, and set `data_dir`
os.chdir(os.path.join('..','resources'))
data_dir = os.path.join('.','data')

In [3]:
os.listdir(data_dir)

['05-2019.csv',
 '06-2019.csv',
 '07-2019.csv',
 '08-2019.csv',
 '09-2019.csv',
 '10-2019.csv',
 '11-2019.csv',
 '12-2019.csv',
 'GlobalAirportDatabase.txt',
 'GlobalAirportDatabase.zip',
 'historical-flight-and-weather-data.zip',
 'readme.txt']

In [4]:
airports_source_file = 'GlobalAirportDatabase.txt'

## Airports Data Set

In [5]:
# Remove leading and trailing whitespace from the Global Airport Database text
# (with `.strip()`), and assign it to a variable.
with open(os.path.join(data_dir,airports_source_file)) as gadb:
    gadb_text = gadb.read().strip()

In [6]:
# Examine some of the data to see what it looks like
gadb_text[:1000]

'AYGA:GKA:GOROKA:GOROKA:PAPUA NEW GUINEA:006:004:054:S:145:023:030:E:01610:-6.082:145.392\nAYLA:LAE:N/A:LAE:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYMD:MAG:MADANG:MADANG:PAPUA NEW GUINEA:005:012:025:S:145:047:019:E:00007:-5.207:145.789\nAYMH:HGU:MOUNT HAGEN:MOUNT HAGEN:PAPUA NEW GUINEA:005:049:034:S:144:017:046:E:01643:-5.826:144.296\nAYNZ:LAE:NADZAB:NADZAB:PAPUA NEW GUINEA:006:034:011:S:146:043:034:E:00073:-6.570:146.726\nAYPY:POM:PORT MORESBY JACKSONS INTERNATIONAL:PORT MORESBY:PAPUA NEW GUINEA:009:026:036:S:147:013:012:E:00045:-9.443:147.220\nAYRB:RAB:N/A:RABAUL:PAPUA NEW GUINEA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nAYWK:WWK:WEWAK INTERNATIONAL:WEWAK:PAPUA NEW GUINEA:003:035:001:S:143:040:009:E:00006:-3.584:143.669\nBGAM:N/A:N/A:ANGMAGSSALIK:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAS:N/A:N/A:ANGISSOQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.000\nBGAT:N/A:N/A:APUTITEQ:GREENLAND:000:000:000:U:000:000:000:U:00000:0.000:0.0

In [7]:
# And again at the end of the data
gadb_text[-1000:]

'E:00139:45.623:126.250\nZYHE:N/A:N/A:HEIHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJD:N/A:N/A:JAGDAQI:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYJM:N/A:JIAMUSI:JIAMUSI:CHINA:046:050:036:N:130:027:055:E:00080:46.843:130.465\nZYMD:N/A:HAILANG:MUDANJIANG:CHINA:044:031:026:N:129:034:008:E:00270:44.524:129.569\nZYNJ:N/A:N/A:NENJIANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYQQ:N/A:N/A:QIQIHAR:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYRD:CGQ:N/A:CHANGCHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTH:N/A:N/A:TAHE:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTK:N/A:N/A:SHENYANG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYTL:DLC:ZHOUSHUIZI:DALIAN:CHINA:038:057:056:N:121:032:018:E:00033:38.966:121.538\nZYXC:N/A:N/A:XIANCHENG:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYC:N/A:N/A:YICHUN:CHINA:000:000:000:U:000:000:000:U:00000:0.000:0.000\nZYYJ:N/A:YANJI:YANJI:CHINA:042:052:054:N:129:026:054:E:00191:

Rows are separated by newline (`\n`) characters; columns are separated by colons.  
Missing values are indicated in a number of ways, depending on the data type.

According to [the source](https://www.partow.net/miscellaneous/airportdatabase/index.html):

> **Note:** Some tuples may have missing or otherwise unaviable pieces of data. In the event the values are not present, given the data type a default value will be used as follows:
> 
> - String : `N/A`
> - Integer: `0`
> - Char : `U`
> - Floating Point: `0.0`

We will split the data—first by newlines, then by colons—and convert values as appropriate.

In [8]:
# Create a function to convert data types
def process_airport(ap):
    # Convert integers
    for i in (5,6,7,9,10,11,13):
        ap[i] = int(ap[i])
    # Convert floats
    for i in (14,15):
        ap[i] = float(ap[i])
    # Convert missing values (0.0 will be coerced to 0, in this case)
    ap = tuple(map(lambda elem: None if elem in [0,'U','N/A'] else elem, ap))
    return ap

In [9]:
airports_tuples = [process_airport(ap.split(':')) for ap in gadb_text.split('\n')]

In [10]:
len(airports_tuples)

9300

In [11]:
for i in range (5):
    print(airports_tuples[i])

('AYGA', 'GKA', 'GOROKA', 'GOROKA', 'PAPUA NEW GUINEA', 6, 4, 54, 'S', 145, 23, 30, 'E', 1610, -6.082, 145.392)
('AYLA', 'LAE', None, 'LAE', 'PAPUA NEW GUINEA', None, None, None, None, None, None, None, None, None, None, None)
('AYMD', 'MAG', 'MADANG', 'MADANG', 'PAPUA NEW GUINEA', 5, 12, 25, 'S', 145, 47, 19, 'E', 7, -5.207, 145.789)
('AYMH', 'HGU', 'MOUNT HAGEN', 'MOUNT HAGEN', 'PAPUA NEW GUINEA', 5, 49, 34, 'S', 144, 17, 46, 'E', 1643, -5.826, 144.296)
('AYNZ', 'LAE', 'NADZAB', 'NADZAB', 'PAPUA NEW GUINEA', 6, 34, 11, 'S', 146, 43, 34, 'E', 73, -6.57, 146.726)


In [12]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [13]:
# Create dataframe
# The project is concerned only with US flights. So we'll also include only US
# airports in the data (and skip the country column as a result).

airports_columns = [
    'icao_code',
    'iata_code',
    'name',
    'city',
    'country',
    'lat_deg',
    'lat_min',
    'lat_sec',
    'lat_dir',
    'lon_deg',
    'lon_min',
    'lon_sec',
    'lon_dir',
    'altitude',
    'lat_decimal',
    'lon_decimal'
]

airports_df = pd.DataFrame(
    data=airports_tuples,
    columns=airports_columns
).query('country == "USA"').drop(columns='country')

print_shape(airports_df)

552 rows × 15 columns


In [14]:
airports_df.head()

Unnamed: 0,icao_code,iata_code,name,city,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
3380,KABI,ABI,ABILENE RGNL,ABILENE,32.0,24.0,40.0,N,99.0,40.0,54.0,W,546.0,32.411,-99.682
3381,KABQ,ABQ,,ALBUQUERQUE,,,,,,,,,,,
3382,KACK,ACK,NANTUCKET MEM,NANTUCKET,41.0,15.0,10.0,N,70.0,3.0,36.0,W,15.0,41.253,-70.06
3383,KACT,ACT,WACO RGNL,WACO,31.0,36.0,40.0,N,97.0,13.0,49.0,W,158.0,31.611,-97.23
3384,KACY,ACY,ATLANTIC CITY INTERNATIONAL,ATLANTIC CITY,39.0,27.0,27.0,N,74.0,34.0,37.0,W,23.0,39.458,-74.577


In [15]:
# For each column, get data types, number of null values, and whether all (non-NULL) values in the column are unique
pd.concat(
    [
        airports_df.dtypes,
        airports_df.isna().sum(),
        pd.Series(
            data=[airports_df[col].dropna().is_unique for col in airports_df.columns],
            index=airports_df.columns
        )
    ],
    axis=1,
    keys=['data_type','null_count','unique']
)

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,73,False
name,object,66,False
city,object,0,False
lat_deg,float64,66,False
lat_min,float64,73,False
lat_sec,float64,77,False
lat_dir,object,66,False
lon_deg,float64,66,False
lon_min,float64,73,False


`icao_code` is `UNIQUE` and `NON NULL`, and so can serve as the `PRIMARY KEY` of the `airports` database.

Expected `iata_code` to be unique, but it isn't.

In [16]:
# Examine the rows with (non-NULL) duplicate values
airports_df.loc[~airports_df.iata_code.isna() & airports_df.iata_code.duplicated(keep=False)]

Unnamed: 0,icao_code,iata_code,name,city,lat_deg,lat_min,lat_sec,lat_dir,lon_deg,lon_min,lon_sec,lon_dir,altitude,lat_decimal,lon_decimal
3381,KABQ,ABQ,,ALBUQUERQUE,,,,,,,,,,,
3580,KIKR,ABQ,ALBUQUERQUE INTERNATIONAL SUNPORT,KIRTLAND A.F.B.,35.0,2.0,24.0,N,106.0,36.0,33.0,W,1633.0,35.04,-106.609


In [17]:
# Drop the airport at index 3381
airports_df.drop(index=3381,errors='ignore',inplace=True)

print_shape(airports_df)

551 rows × 15 columns


In [18]:
# For each column, get data types, number of null values, and whether all (non-NULL) values in the column are unique
pd.concat(
    [
        airports_df.dtypes,
        airports_df.isna().sum(),
        pd.Series(
            data=[airports_df[col].dropna().is_unique for col in airports_df.columns],
            index=airports_df.columns
        )
    ],
    axis=1,
    keys=['data_type','null_count','unique']
)

Unnamed: 0,data_type,null_count,unique
icao_code,object,0,True
iata_code,object,73,True
name,object,65,False
city,object,0,False
lat_deg,float64,65,False
lat_min,float64,72,False
lat_sec,float64,76,False
lat_dir,object,65,False
lon_deg,float64,65,False
lon_min,float64,72,False


## Create Database Table

In order to connect to the database, first, make sure you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.

### Install [Psycopg2](https://pypi.org/project/psycopg2/)

If you do not already have Psycopg2 (and its binary extension) installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use Psycopg2 can be found in its [documentation](https://www.psycopg.org/docs/).

### Install [SQLAlchemy](https://www.sqlalchemy.org/)

If you do not already have SQLAlchemy installed, **enable the cell below** by converting it to Cell Type `Code`. (In the Jupyter Notebook menus, select `Cell` > `Cell Type` > `Code`.)

Additional details about how to use SQLAlchemy can be found in its [documentation](https://docs.sqlalchemy.org/en/14/).

In [19]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is in `cfg` (above). However, you will have to enter your password below.

In [20]:
password = getpass('Enter database password')

Enter database password········


In [21]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [22]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

In [23]:
# Create the `airports` table
with engine.begin() as conn:    
    conn.execute(db.text(cfg.ap_create))
print('Done.')

Done.


#### Recommendation

Check the database (via pgAdmin or some other means) to make sure that the tables exist as expected.

## Upload Data

In [24]:
db_meta = db.MetaData()

In [25]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)

### Upload Airport Data

In [26]:
# Check to see if `airports_table` has the same number of columns as the `airports_df`
len(airports_table.columns.keys()) == airports_df.columns.size

True

In [27]:
# Verify `airports_table` column definitions
airports_table.columns.values()

[Column('icao_code', VARCHAR(length=4), table=<airports>, primary_key=True, nullable=False),
 Column('iata_code', CHAR(length=3), table=<airports>),
 Column('name', TEXT(), table=<airports>),
 Column('city', TEXT(), table=<airports>),
 Column('lat_deg', INTEGER(), table=<airports>),
 Column('lat_min', INTEGER(), table=<airports>),
 Column('lat_sec', INTEGER(), table=<airports>),
 Column('lat_dir', CHAR(length=1), table=<airports>),
 Column('lon_deg', INTEGER(), table=<airports>),
 Column('lon_min', INTEGER(), table=<airports>),
 Column('lon_sec', INTEGER(), table=<airports>),
 Column('lon_dir', CHAR(length=1), table=<airports>),
 Column('altitude', INTEGER(), table=<airports>),
 Column('lat_decimal', NUMERIC(), table=<airports>),
 Column('lon_decimal', NUMERIC(), table=<airports>)]

In [28]:
print(db.insert(airports_table))

INSERT INTO airports (icao_code, iata_code, name, city, lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir, altitude, lat_decimal, lon_decimal) VALUES (:icao_code, :iata_code, :name, :city, :lat_deg, :lat_min, :lat_sec, :lat_dir, :lon_deg, :lon_min, :lon_sec, :lon_dir, :altitude, :lat_decimal, :lon_decimal)


In [29]:
# Check query to test whether `airports` table has any rows
print(db.exists().select_from(airports_table).select())

SELECT EXISTS (SELECT * 
FROM airports) AS anon_1


In [30]:
# Upload
with engine.begin() as conn:
    ap_is_empty = not conn.execute(db.exists().select_from(airports_table).select()).scalar()

if (ap_is_empty):
    airports_df.to_sql(
        name='airports',
        con=engine,
        if_exists='append',
        index=False,
        method='multi'
    )
    print('Done.')
else:
    print('`airports` table already populated.')

Done.
