# Data acquisition

In [1]:
from pathlib import Path

import pandas as pd
import functions.data_acquisition as data_funcs
import configuration as config

# Make sure the raw, interim and processed data directories exist. Missing
# parents are created and already-existing directories are left alone.
for data_directory in (
    config.RAW_DATA_DIRECTORY,
    config.INTERIM_DATA_DIRECTORY,
    config.PROCESSED_DATA_DIRECTORY,
):
    Path(data_directory).mkdir(parents=True, exist_ok=True)

## 1. NTSB incident data

Data source: [NTSB Downloadable Aviation Datasets](https://data.ntsb.gov/avdata)

### 1.1. Download

In [2]:
# Hand the incident data URL, the raw data directory and the raw .mdb file
# path to the project's download helper.
data_funcs.download_data(
    config.INCIDENT_DATA_URL,
    config.RAW_DATA_DIRECTORY,
    config.RAW_INCIDENTS_MDB_FILE,
)

### 1.2. Parse and save as CSV

Uses [access_parser](https://github.com/claroty/access_parser) to convert the Microsoft Access database file to CSV, which is then read into a pandas DataFrame.

In [3]:
# Parse the raw .mdb file; the helper is given the CSV path as its second
# argument and its return value is kept in `table`.
table = data_funcs.parse_mdb(
    config.RAW_INCIDENTS_MDB_FILE,
    config.RAW_INCIDENTS_CSV_FILE,
)

## 2. On-time performance data

Data source: [Airline Service Quality Performance 234 (On-Time performance data)](https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time)

### 2.1. Get download links

In [4]:
# Collect the on-time data download links from ONTIME_DATA_URL; `links` is
# sliced and passed to the downloader in section 2.2.
links=data_funcs.get_ontime_links(config.ONTIME_DATA_URL)

### 2.2. Download on-time data

In [5]:
# Only the first config.ONTIME_FILES links are passed on for download.
data_funcs.download_ontime_data(
    links[:config.ONTIME_FILES],
    config.ONTIME_DATA_LINK_PREFIX,
    config.RAW_DATA_DIRECTORY,
)

### 2.3. Parse and combine on-time datafiles

In [None]:
# Parse the downloaded on-time files into one frame. `ontime_df` is inspected
# and reduced to model features in section 3.2.
ontime_df = data_funcs.parse_asc_datafiles(
    config.ONTIME_FILES,
    config.RAW_DATA_DIRECTORY,
    config.RAW_ONTIME_CSV_FILE,
)

../data/raw/ontime.td.202412.asc


../data/raw/ontime.td.202410.asc
../data/raw/ontime.td.202411.asc


## 3. Feature extraction

### 3.1. Incident data

In [7]:
# Load the incident CSV in a single pass (low_memory=False) and summarise
# its columns, dtypes and non-null counts.
incidents_df = pd.read_csv(
    config.RAW_INCIDENTS_CSV_FILE,
    low_memory=False,
)
incidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29157 entries, 0 to 29156
Data columns (total 93 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Aircraft_Key             29157 non-null  int64  
 1   cert_max_gr_wt           19951 non-null  float64
 2   fc_seats                 11273 non-null  float64
 3   cc_seats                 1387 non-null   float64
 4   pax_seats                10435 non-null  float64
 5   total_seats              22995 non-null  float64
 6   num_eng                  24963 non-null  float64
 7   date_last_insp           19274 non-null  object 
 8   afm_hrs_last_insp        7949 non-null   float64
 9   afm_hrs                  18910 non-null  float64
 10  dprt_time                18521 non-null  float64
 11  phase_flt_spec           0 non-null      float64
 12  lchg_date                29157 non-null  object 
 13  rwy_len                  12906 non-null  float64
 14  rwy_width             

In [8]:
# Raise the row display limit so the transposed preview (one row per source
# column) is not truncated, then show the first five records side by side.
pd.set_option('display.max_rows', 200)
incidents_df.head().T

Unnamed: 0,0,1,2,3,4
Aircraft_Key,1,1,2,1,1
cert_max_gr_wt,,2400.0,1960.0,2850.0,9920.0
fc_seats,,,,,
cc_seats,,,,,
pax_seats,,,,,
total_seats,,4.0,2.0,2.0,9.0
num_eng,,1.0,1.0,1.0,1.0
date_last_insp,,2007-04-01 00:00:00,2007-01-01 00:00:00,2007-09-01 00:00:00,
afm_hrs_last_insp,,75.0,,,
afm_hrs,,2865.0,340.0,516.0,1871.0


In [9]:
# Source column -> feature name. The feature names match those used for the
# on-time extract so the two frames can be stacked later.
incident_features = {
    'dprt_time': 'departure_time',
    'dprt_apt_id': 'origin',
    'dest_apt_id': 'destination',
    'regis_no': 'tail_number',
}

# Select and rename the feature columns, label every row as an incident (1),
# and drop any row with a missing value.
extracted_incident_df = (
    incidents_df[list(incident_features)]
    .rename(columns=incident_features)
    .assign(incident=1)
    .dropna()
)

extracted_incident_df.to_csv(config.EXTRACTED_INCIDENTS_FILE, index=False)
extracted_incident_df.head()

Unnamed: 0,departure_time,origin,destination,tail_number,incident
1,2200.0,O69,OQ3,N8037W,1
2,2230.0,OQ3,O70,N15EX,1
3,2100.0,T20,AXH,N579RM,1
4,400.0,PHX,PWA,N398J,1
5,2215.0,KFNL,KFNL,N24965,1


In [10]:
# Summarise the extracted incident frame (row count, dtypes, non-null counts)
# after rows with missing values were dropped.
extracted_incident_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14393 entries, 1 to 29139
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   departure_time  14393 non-null  float64
 1   origin          14393 non-null  object 
 2   destination     14393 non-null  object 
 3   tail_number     14393 non-null  object 
 4   incident        14393 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 674.7+ KB


### 3.2. On-time data

In [11]:
# Summarise the parsed on-time frame built in section 2.3.
ontime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1902949 entries, 0 to 1902948
Columns: 121 entries, DL to 70.1
dtypes: float64(72), int64(29), object(20)
memory usage: 1.7+ GB


In [12]:
# Show the first five on-time records transposed, one row per source column.
ontime_df.head().T

Unnamed: 0,0,1,2,3,4
DL,DL,DL,DL,DL,DL
4032,4032.0,3667.0,4066.0,4066.0,3664.0
OO,OO,OO,OO,OO,OO
4032.1,4032.0,3667.0,4066.0,4066.0,3664.0
9E,9E,9E,9E,9E,9E
5539,5538.0,5540.0,5538.0,5539.0,5541.0
ORF,LGA,DTW,MSP,BIS,MSN
LGA,ORF,MSN,BIS,MSP,DTW
20241213,20241213.0,20241202.0,20241202.0,20241202.0,20241202.0
5,5.0,1.0,1.0,1.0,1.0


In [13]:
# Feature name -> column position in ontime_df. Columns are picked by
# position rather than by label.
# NOTE(review): ontime_df's column labels look like values from a data row
# (e.g. 'DL', '4032'); confirm how parse_asc_datafiles handles the header.
# Other positions noted but not currently extracted:
# carrier=0, flight_number=1, date=8.
ontime_features = {
    'origin': 6,
    'destination': 7,
    'departure_time': 12,
    'tail_number': 25,
}

# Take the feature columns, give them their feature names, label every row
# as a non-incident (0), and drop any row with a missing value.
extracted_ontime_df = (
    ontime_df.iloc[:, list(ontime_features.values())]
    .set_axis(list(ontime_features), axis=1)
    .assign(incident=0)
    .dropna()
)

extracted_ontime_df.to_csv(config.EXTRACTED_ONTIME_FILE, index=False)
extracted_ontime_df.head()

Unnamed: 0,origin,destination,departure_time,tail_number,incident
0,LGA,ORF,1338.0,N915XJ,0
1,DTW,MSN,821.0,N907XJ,0
2,MSP,BIS,901.0,N166PQ,0
3,BIS,MSP,1135.0,N166PQ,0
4,MSN,DTW,928.0,N907XJ,0


In [14]:
# Summarise the extracted on-time frame (row count, dtypes, non-null counts)
# after rows with missing values were dropped.
extracted_ontime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 631177 entries, 0 to 632033
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   origin          631177 non-null  object 
 1   destination     631177 non-null  object 
 2   departure_time  631177 non-null  float64
 3   tail_number     631177 non-null  object 
 4   incident        631177 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 28.9+ MB


### 3.3. Combine incident and on-time data

In [15]:
# Stack the on-time rows (incident=0) on top of the incident rows
# (incident=1), renumber the index from zero, and pin each column's dtype.
data_df = pd.concat(
    [extracted_ontime_df, extracted_incident_df],
    axis=0,
    ignore_index=True,
).astype({
    'origin': str,
    'destination': str,
    'departure_time': float,
    'tail_number': str,
    'incident': int,
})
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645570 entries, 0 to 645569
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   origin          645570 non-null  object 
 1   destination     645570 non-null  object 
 2   departure_time  645570 non-null  float64
 3   tail_number     645570 non-null  object 
 4   incident        645570 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 24.6+ MB


### 3.4. Save

In [16]:
# Write the combined frame to COMBINED_DATAFILE without the index column.
data_df.to_csv(config.COMBINED_DATAFILE, index=False)