# Data acquisition

In [1]:
import pandas as pd
import functions.data_acquisition as data_funcs

## 1. NTSB incident data

Data source: [NTSB Downloadable Aviation Datasets](https://data.ntsb.gov/avdata)

### 1.1. Download

In [2]:
# Download link for the zipped NTSB "avall" archive; the literal is split only
# for readability and concatenates to the exact same URL.
url = (
    'https://data.ntsb.gov/avdata/FileDirectory/DownloadFile'
    '?fileID=C%3A%5Cavdata%5Cavall.zip'
)
data_funcs.download_data(url)

### 1.2. Parse and save as CSV

Uses [access_parser](https://github.com/claroty/access_parser) to convert the Microsoft Access database file to CSV, which is then read into a pandas DataFrame.

In [3]:
# Name of the Access database file handed to the project's parse_mdb helper.
# NOTE(review): where parse_mdb looks for this file and what it returns are
# defined in functions.data_acquisition — confirm there.
file = 'avall.mdb'
table = data_funcs.parse_mdb(file)

### 1.3. Check CSV contents

In [4]:
# Load the aircraft CSV from the raw data directory and preview the first rows.
aircraft_csv = '../data/raw/aircraft.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype columns.
aircraft_df = pd.read_csv(aircraft_csv, low_memory=False)
aircraft_df.head()

Unnamed: 0,Aircraft_Key,cert_max_gr_wt,fc_seats,cc_seats,pax_seats,total_seats,num_eng,date_last_insp,afm_hrs_last_insp,afm_hrs,...,lchg_userid,afm_hrs_since,rwy_num,site_seeing,air_medical,med_type_flight,fuel_on_board,elt_manufacturer,elt_model,elt_reason_other
0,1,,,,,,,,,,...,gibs-i,N,,N,N,,,,,
1,1,2400.0,,,,4.0,1.0,2007-04-01 00:00:00,75.0,2865.0,...,,Y,7.0,N,N,,,,,
2,2,1960.0,,,,2.0,1.0,2007-01-01 00:00:00,,340.0,...,,Y,7.0,N,N,,,,,
3,1,2850.0,,,,2.0,1.0,2007-09-01 00:00:00,,516.0,...,,N,90.0,N,N,,,,,
4,1,9920.0,,,,9.0,1.0,,,1871.0,...,,N,,N,N,,,,,


In [5]:
# Print a summary of the aircraft table: column names, non-null counts, and dtypes.
aircraft_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29157 entries, 0 to 29156
Data columns (total 93 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Aircraft_Key             29157 non-null  int64  
 1   cert_max_gr_wt           19951 non-null  float64
 2   fc_seats                 11273 non-null  float64
 3   cc_seats                 1387 non-null   float64
 4   pax_seats                10435 non-null  float64
 5   total_seats              22995 non-null  float64
 6   num_eng                  24963 non-null  float64
 7   date_last_insp           19274 non-null  object 
 8   afm_hrs_last_insp        7949 non-null   float64
 9   afm_hrs                  18910 non-null  float64
 10  dprt_time                18521 non-null  float64
 11  phase_flt_spec           0 non-null      float64
 12  lchg_date                29157 non-null  object 
 13  rwy_len                  12906 non-null  float64
 14  rwy_width             

## 2. On-time performance data

Data source: [Airline Service Quality Performance 234 (On-Time performance data)](https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time)

### 2.1. Get download links

In [6]:
# BTS page for the on-time performance data; the literal is split only for
# readability and concatenates to the exact same URL.
url = (
    'https://www.bts.gov/browse-statistical-products-and-data/bts-publications/'
    'airline-service-quality-performance-234-time'
)
links = data_funcs.get_ontime_links(url)

### 2.2. Download on-time data

In [7]:
# Number of on-time data files to work with; only the first n_files links
# are passed to the downloader.
n_files = 3
data_funcs.download_ontime_data(links[:n_files])

### 2.3. Parse and combine on-time datafiles

In [8]:
# Read the downloaded on-time data files and combine them into one DataFrame.
# NOTE(review): how n_files selects the files is defined in
# functions.data_acquisition — confirm there.
data_df = data_funcs.read_asc_datafiles(n_files)

../data/raw/ontime.td.202411.asc
../data/raw/ontime.td.202410.asc
../data/raw/ontime.td.202412.asc


In [20]:
# Lift the row/column display limits for this one preview only. Setting the
# options globally would leave max_rows unlimited for the rest of the session,
# so any accidental display of the full frame would try to render every row.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    # Transpose so each source column becomes a row of the preview. display()
    # is required because a `with` block has no auto-displayed last expression.
    display(data_df.head(20).transpose())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL,DL
3673,3673.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0
OO,OO,,,,,,,,,,,,,,,,,,,
3673.1,3673.0,,,,,,,,,,,,,,,,,,,
9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E,9E
5538,5539.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0,4800.0
DTW,MSN,CLT,CLT,CLT,CLT,CLT,CLT,CLT,CLT,CLT,TYS,TYS,TYS,TYS,TYS,TYS,TYS,TYS,TYS,TYS
MSN,DTW,LGA,LGA,LGA,LGA,LGA,LGA,LGA,LGA,LGA,ATL,ATL,ATL,ATL,ATL,ATL,ATL,ATL,ATL,ATL
20241121,20241121.0,20241101.0,20241102.0,20241103.0,20241104.0,20241105.0,20241106.0,20241107.0,20241108.0,20241109.0,20241111.0,20241112.0,20241113.0,20241114.0,20241115.0,20241116.0,20241117.0,20241118.0,20241119.0,20241120.0
4,4.0,5.0,6.0,7.0,1.0,2.0,3.0,4.0,5.0,6.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,1.0,2.0,3.0


In [11]:
# Raise max_cols to the frame's column count so info() prints the full
# per-column listing instead of the truncated summary. len(data_df.columns)
# is the column count; len(data_df) would be the row count.
data_df.info(max_cols=len(data_df.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1902949 entries, 0 to 1902948
Data columns (total 121 columns):
 #    Column       Dtype  
---   ------       -----  
 0    DL           object 
 1    3673         float64
 2    OO           object 
 3    3673.1       float64
 4    9E           object 
 5    5538         float64
 6    DTW          object 
 7    MSN          object 
 8    20241121     float64
 9    4            float64
 10   1230         float64
 11   1230.1       float64
 12   1511         float64
 13   1254         float64
 14   1254.1       float64
 15   1545         float64
 16   0            int64  
 17   0.1          int64  
 18   24           float64
 19   94           float64
 20   161          float64
 21   171          float64
 22   70           float64
 23   1552         float64
 24   1541         float64
 25   N909XJ       object 
 26   41           float64
 27   4.1          float64
 28   49           float64
 29   Unnamed: 29  object 
 30   101          flo

### 2.4. Save as CSV

In [None]:
# Write the combined on-time data to the raw data directory; index=False
# leaves the DataFrame index out of the CSV.
data_df.to_csv('../data/raw/on_time.csv', index=False)

## 3. Feature extraction

### 3.1. On-time data

In [None]:
# Map each feature name to the zero-based position of its column in data_df.
# NOTE(review): the positions are taken on trust from the file layout —
# confirm them against the BTS record description.
ontime_features = {
    'carrier': 0,
    'flight_number': 1,
    'origin': 6,
    'destination': 7,
    'date': 8,
    'departure_time': 12,
}

# Split the mapping into parallel lists so selection and renaming stay aligned.
feature_names = list(ontime_features)
feature_positions = [ontime_features[name] for name in feature_names]

# Select the columns by position, then label them with the feature names.
extracted_data_df = data_df.iloc[:, feature_positions]
extracted_data_df.columns = feature_names
extracted_data_df.head()

Unnamed: 0,carrier,flight_number,origin,destination,date,departure_time
0,DL,3673.0,MSN,DTW,20241121.0,1623.0
1,DL,4800.0,CLT,LGA,20241101.0,1828.0
2,DL,4800.0,CLT,LGA,20241102.0,1731.0
3,DL,4800.0,CLT,LGA,20241103.0,1731.0
4,DL,4800.0,CLT,LGA,20241104.0,1732.0


In [41]:
extracted_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1902949 entries, 0 to 1902948
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   carrier         object 
 1   flight_number   float64
 2   origin          object 
 3   destination     object 
 4   date            float64
 5   departure_time  float64
dtypes: float64(3), object(3)
memory usage: 87.1+ MB
