# Process data

In [1]:
import os

In [2]:
data_dir = os.path.join('..','resources','data')

os.listdir(data_dir)

['05-2019.csv',
 '06-2019.csv',
 '07-2019.csv',
 '08-2019.csv',
 '09-2019.csv',
 '10-2019.csv',
 '11-2019.csv',
 '12-2019.csv',
 'airports.csv']

In [66]:
import wget

ModuleNotFoundError: No module named 'wget'

In [65]:
url = "https://www.python.org/static/img/python-logo@2x.png"
filename = os.path.basename(url)
filename
wget.download(url, 'c:/users/LikeGeeks/downloads/pythonLogo.png')

del url
del filename

ModuleNotFoundError: No module named 'wget'

In [17]:
# Import regular expressions library
import re

In [63]:
list(filter(lambda item: re.fullmatch('\d{2}\-2019\.csv', item, flags=re.I) is not None, os.listdir(data_dir)))

['05-2019.csv',
 '06-2019.csv',
 '07-2019.csv',
 '08-2019.csv',
 '09-2019.csv',
 '10-2019.csv',
 '11-2019.csv',
 '12-2019.csv']

In [64]:
list(filter(lambda item: re.fullmatch('\d{2}\-2019\.csv', item, flags=re.I) is None, os.listdir(data_dir)))

['airports.csv']

In [55]:
[re.fullmatch('^\d{2}\-2019\.csv$', item, flags=re.I) for item in os.listdir(data_dir)][0].string

'05-2019.csv'

In [3]:
primary_source_files = list(filter(
                            lambda item: re.fullmatch(
                                '\d{2}\-2019\.csv',
                                item,
                                flags=re.I
                            ) is not None,
                            os.listdir(data_dir)
                        ))
secondary_source_file = os.listdir(data_dir)[-1]

In [4]:
import pandas as pd

## Primary Data Set

In [5]:
primary_df = pd.concat([
    pd.read_csv(os.path.join(data_dir,filename))
    for filename in primary_source_files
])

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,512,903 rows × 35 columns


In [6]:
primary_df.head()

Unnamed: 0,carrier_code,flight_number,origin_airport,destination_airport,date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,...,HourlyPrecipitation_x,HourlyStationPressure_x,HourlyVisibility_x,HourlyWindSpeed_x,STATION_y,HourlyDryBulbTemperature_y,HourlyPrecipitation_y,HourlyStationPressure_y,HourlyVisibility_y,HourlyWindSpeed_y
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,...,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,...,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,...,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,...,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,...,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0


In [7]:
# Get data types and number of null values for each column
pd.concat(
    [
        primary_df.dtypes,
        primary_df.isna().sum()
    ],
    axis=1,
    keys=['data_type','null_count']
)

Unnamed: 0,data_type,null_count
carrier_code,object,0
flight_number,int64,0
origin_airport,object,0
destination_airport,object,0
date,object,0
scheduled_elapsed_time,int64,0
tail_number,object,13556
departure_delay,int64,0
arrival_delay,int64,0
delay_carrier,int64,0


**Note:** Eventually, `cancelled_code` will be our target column for a machine-learning algorithm.  
Because the column in the source data is `cancelled_code` and not `canceled_code`, the double-l spelling will be used in this work.

In [8]:
# Combine `year`, `month`, and `day` into a single `string` in the same format as the `date` column
# and check for equality against the actual date column.

# Check only a few rows
(
    primary_df[['year','month','day']][:3]
    .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
    .equals(
        primary_df.date[:3]
    )
)

# Check all the rows
# (
#     primary_df[['year','month','day']]
#     .apply(lambda row: '-'.join([val.zfill(2) for val in row.values.astype(str)]), axis=1)
#     .equals(
#         primary_df.date
#     )
# )

True

**Note:** The all-rows check, above, is commented out because it takes a long time, but when run, it does show equality between the entire `date` series and the combined `year`-`month`-`day` series.

Because `year`, `month`, and `day` were originally stored as `int64` values, this also tells us that all the values in `date` are properly formatted (no leading or trailing spaces, *etc*.).

The data is therefore redundant, and we don't need both.

`weekday` is likewise redundant, since it can be calculated from `date`.

In [9]:
# Drop redundant date columns
primary_df.drop(
    columns=['year','month','day','weekday'],
    errors='ignore',
    inplace=True
)

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,512,903 rows × 31 columns


In [10]:
# Examine `cancelled_code` column
primary_df.cancelled_code.value_counts()

N    5426150
B      41919
A      23451
C      21370
D         13
Name: cancelled_code, dtype: int64

### What do the codes mean?

According to the United States Department of Transportation Bureau of Transportation Statistics Airlines and Airports data, [Number 14 - On-Time Reporting](https://www.bts.gov/topics/airlines-and-airports/number-14-time-reporting):

**CANCELLATION CODES**
- `A`-Carrier Caused
- `B`-Weather
- `C`-National Aviation System
- `D`-Security

\[`N` is not on the list and represents "None" or "Not cancelled".\]

We are only interested in flights that were cancelled due to weather, so we will keep only rows with `cancelled_code` `B` or `N`.

In [11]:
primary_df = primary_df.loc[primary_df.cancelled_code.isin(['B','N'])]

primary_df_rows, primary_df_cols = primary_df.shape

print(f"{primary_df_rows:,} rows × {primary_df_cols:,} columns")

5,468,069 rows × 31 columns


In [12]:
# Check that there are now only `B` and `N` values
primary_df.cancelled_code.value_counts()

N    5426150
B      41919
Name: cancelled_code, dtype: int64

In [13]:
# Convert `cancelled_code` column into boolean `cancelled` column, where
# `B` = True (*was* cancelled) and `N` = False (*was not* cancelled)

try:
    print("Converting cancelled_code column to boolean… ", end="")
    primary_df.cancelled_code = (primary_df.cancelled_code == 'B')
    primary_df.rename(columns={'cancelled_code':'cancelled'},inplace=True)
    print()
except AttributeError:
    print("Column has already been processed.")

primary_df.cancelled.value_counts()

Converting cancelled_code column to boolean… 


False    5426150
True       41919
Name: cancelled, dtype: int64

In [21]:
# How many flights were cancelled|not cancelled vs. how many departed|arrived

departed = ~primary_df.actual_departure_dt.isna()
arrived = ~primary_df.actual_arrival_dt.isna()

mult_ix = pd.MultiIndex.from_tuples([
    ('departed',True),
    ('departed',False),
    ('arrived',True),
    ('arrived',False),
])

mult_cols = pd.MultiIndex.from_tuples([
    ('cancelled',False),
    ('cancelled',True)
])

pd.DataFrame(
    data=[
        [
            primary_df.loc[(~primary_df.cancelled) & (departed)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (departed)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (~departed)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (~departed)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (arrived)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (arrived)].shape[0]
        ],
        [
            primary_df.loc[(~primary_df.cancelled) & (~arrived)].shape[0],
            primary_df.loc[(primary_df.cancelled) & (~arrived)].shape[0]
        ]
    ],
    index=mult_ix,
    columns=mult_cols
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cancelled,cancelled
Unnamed: 0_level_1,Unnamed: 1_level_1,False,True
departed,True,5426150,1854
departed,False,0,40065
arrived,True,5424261,0
arrived,False,1889,41919


In [15]:
# Does anything stand out for cancelled flights that still departed?
primary_df.loc[primary_df.cancelled & departed].head().transpose()

Unnamed: 0,16715,17002,17815,18640,18750
carrier_code,AA,AA,AA,AA,AA
flight_number,1393,346,2761,1271,5821
origin_airport,OKC,DFW,DFW,IAH,DFW
destination_airport,DFW,MSY,STL,DFW,ELP
date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,69,85,105,75,104
tail_number,N751UW,N357PV,N971TW,N898NN,N243LR
departure_delay,176,83,111,113,28
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


In [16]:
# What about non-cancelled flights that didn't arrive?
primary_df.loc[~primary_df.cancelled & ~arrived].head().transpose()

Unnamed: 0,5154,12535,13657,16277,17368
carrier_code,AS,AA,WN,WN,UA
flight_number,55,2028,2272,2212,6296
origin_airport,SCC,MEM,PDX,ABQ,IAD
destination_airport,BRW,DFW,DAL,DAL,DFW
date,2019-05-01,2019-05-01,2019-05-01,2019-05-01,2019-05-01
scheduled_elapsed_time,45,99,230,105,209
tail_number,N609AS,N749US,N931WN,N788SA,N87353
departure_delay,29,398,-2,-5,212
arrival_delay,0,0,0,0,0
delay_carrier,0,0,0,0,0


In [17]:
primary_obj_cols = primary_df.select_dtypes('object').columns

primary_obj_cols.tolist()

['carrier_code',
 'origin_airport',
 'destination_airport',
 'date',
 'tail_number',
 'scheduled_departure_dt',
 'scheduled_arrival_dt',
 'actual_departure_dt',
 'actual_arrival_dt']

`carrier_code`, `origin_airport`, `destination_airport`, and `tail_number` are legitimate `string`/`text` columns.

`carrier_code` and `tail_number` are for identification purposes, only, though, and so will not be features for the machine-learning model.

`origin_airport` and `destination_airport` will serve as foreign keys to join to the airport data from the secondary dataset.

`date`, `scheduled_departure_dt`, `scheduled_arrival_dt`, `actual_departure_dt`, `actual_arrival_dt` are currently `string`s, but they can be converted to `date`, `datetime` or `timestamp` formats, if necessary prior to uploading to the SQL database.

`actual_departure_dt` and `actual_arrival_dt` can be stored in the database, but they absolutely should ***not*** be used as features for machine learning, as their presence or absence *defines* what it means for a flight to be cancelled, which is exactly what we want the model to predict.

## Secondary Data Set

In [22]:
secondary_df = pd.read_csv(os.path.join(data_dir,secondary_source_file))

secondary_df_rows, secondary_df_cols = secondary_df.shape

print(f"{secondary_df_rows:,} rows × {secondary_df_cols:,} columns")

29,408 rows × 23 columns


In [102]:
# Display dataframe head including all columns
with pd.option_context('display.max_columns', None):
    display(secondary_df.head())

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,country_name,iso_country,region_name,iso_region,local_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords,score,last_updated
0,3632,KLAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125.0,,United States,US,California,US-CA,CA,Los Angeles,1,KLAX,LAX,LAX,https://www.flylax.com/,https://en.wikipedia.org/wiki/Los_Angeles_Inte...,,1335475,2020-04-26T22:37:22+00:00
1,3754,KORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,672.0,,United States,US,Illinois,US-IL,IL,Chicago,1,KORD,ORD,ORD,https://www.flychicago.com/ohare/home/pages/de...,https://en.wikipedia.org/wiki/O'Hare_Internati...,"CHI, Orchard Place",1503175,2018-09-16T02:35:35+00:00
2,3622,KJFK,large_airport,John F Kennedy International Airport,40.639801,-73.7789,13.0,,United States,US,New York,US-NY,NY,New York,1,KJFK,JFK,JFK,https://www.jfkairport.com/,https://en.wikipedia.org/wiki/John_F._Kennedy_...,"Manhattan, New York City, NYC, Idlewild, IDL, ...",1052075,2021-11-21T19:43:08+00:00
3,3384,KATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026.0,,United States,US,Georgia,US-GA,GA,Atlanta,1,KATL,ATL,ATL,http://www.atlanta-airport.com/,https://en.wikipedia.org/wiki/Hartsfield–Jacks...,,2002475,2018-09-19T14:50:01+00:00
4,3878,KSFO,large_airport,San Francisco International Airport,37.618999,-122.375,13.0,,United States,US,California,US-CA,CA,San Francisco,1,KSFO,SFO,SFO,http://www.flysfo.com/,https://en.wikipedia.org/wiki/San_Francisco_In...,"QSF, QBA",1112475,2008-06-13T14:30:04+00:00


In [87]:
# Get data types and number of null values for each column
pd.concat(
    [
        secondary_df.dtypes,
        pd.Series([secondary_df[col].is_unique for col in secondary_df.columns],index=secondary_df.columns),
        secondary_df.isna().sum()
    ],
    axis=1,
    keys=[
        'data_type',
        'unique',
        'null_count'
    ]
)

Unnamed: 0,data_type,unique,null_count
id,int64,True,0
ident,object,True,0
type,object,False,0
name,object,False,0
latitude_deg,float64,False,0
longitude_deg,float64,False,0
elevation_ft,float64,False,2005
continent,object,False,29291
country_name,object,False,0
iso_country,object,False,0


In [45]:
secondary_df.iata_code.unique().size

2028

Hmm… only 2028 of the `iata_code`s are unique (out of 29,408 total rows).

In [91]:
airport_codes = secondary_df[['iata_code','local_code']].dropna(how='all')

# Get number of null values for each column
pd.DataFrame(
    [
        airport_codes.iata_code.isna().sum(),
        airport_codes.local_code.isna().sum()
    ],
    index=['iata','local'],
    columns=['null_values']
)

Unnamed: 0,iata_code,local_code
0,LAX,LAX
1,ORD,ORD
2,JFK,JFK
3,ATL,ATL
4,SFO,SFO
...,...,...
29397,,3LA5
29400,,MI51
29404,,FD75
29405,,0OH2


In [94]:
airport_codes.loc[airport_codes.local_code.isna()]

Unnamed: 0,iata_code,local_code
819,CGX,
1982,QWG,
2087,MXG,
2778,NUN,
3175,HLM,
3296,MSD,
3325,DWN,
3340,PMX,
3531,NGZ,
3532,CLG,


In [75]:
# See if the two columns match
airport_codes.iata_code.equals(airport_codes.local_code)

False

In [95]:
# What's different?
airport_codes.loc[~airport_codes.iata_code.eq(airport_codes.local_code)]

Unnamed: 0,iata_code,local_code
137,SDX,SEZ
140,SCF,SDL
146,,CMA
153,CLD,CRQ
158,HSH,HND
...,...,...
29397,,3LA5
29400,,MI51
29404,,FD75
29405,,0OH2


In [83]:
airport_codes.eq("CLD").any(1)

0        False
1        False
2        False
3        False
4        False
         ...  
25127    False
27475    False
27518    False
28534    False
29396    False
Length: 1979, dtype: bool

In [86]:
airport_codes[airport_codes.eq("LTH").any(axis=1)]

Unnamed: 0,iata_code,local_code
24070,LTH,U75


In [58]:
(
    secondary_df
    .loc[~secondary_df.iata_code.isin(secondary_df.iata_code.unique())]
    .sort_values(by='iata_code', ascending=True)
    .head()
    .transpose()
)

id
ident
type
name
latitude_deg
longitude_deg
elevation_ft
continent
country_name
iso_country
region_name
