# Data acquisition

In [1]:
import pandas as pd
import functions.data_acquisition as data_funcs

## 1. NTSB incident data

Data source: [NTSB Downloadable Aviation Datasets](https://data.ntsb.gov/avdata)

### 1.1. Download

In [2]:
# Download address for the NTSB archive; the percent-encoded `fileID`
# query parameter decodes to C:\avdata\avall.zip.
url = 'https://data.ntsb.gov/avdata/FileDirectory/DownloadFile?fileID=C%3A%5Cavdata%5Cavall.zip'

# Hand the address to the project's download helper. Where the file is
# stored is decided inside functions/data_acquisition, not in this cell.
data_funcs.download_data(url)

### 1.2. Parse and save as CSV

Uses [access_parser](https://github.com/claroty/access_parser) to convert the Microsoft Access database file to CSV, which is then read into a pandas DataFrame.

In [3]:
# Name of the Access database handed to the parser.
# NOTE(review): presumably extracted from the downloaded archive, and the
# helper resolves its directory — confirm in functions/data_acquisition.
file = 'avall.mdb'

# Keep whatever the parser returns; `table` is not referenced again in the
# cells visible in this notebook.
table = data_funcs.parse_mdb(file)

### 1.3. Check CSV contents

In [4]:
# Load the aircraft table from the raw-data CSV.
# low_memory=False makes pandas parse the file in a single pass, so each
# column's dtype is inferred from all of its values instead of per chunk.
aircraft_df = pd.read_csv(
    '../data/raw/aircraft.csv',
    low_memory=False,
)

# Preview the first five rows as a quick sanity check.
aircraft_df.head()

Unnamed: 0,Aircraft_Key,cert_max_gr_wt,fc_seats,cc_seats,pax_seats,total_seats,num_eng,date_last_insp,afm_hrs_last_insp,afm_hrs,...,lchg_userid,afm_hrs_since,rwy_num,site_seeing,air_medical,med_type_flight,fuel_on_board,elt_manufacturer,elt_model,elt_reason_other
0,1,,,,,,,,,,...,gibs-i,N,,N,N,,,,,
1,1,2400.0,,,,4.0,1.0,2007-04-01 00:00:00,75.0,2865.0,...,,Y,7.0,N,N,,,,,
2,2,1960.0,,,,2.0,1.0,2007-01-01 00:00:00,,340.0,...,,Y,7.0,N,N,,,,,
3,1,2850.0,,,,2.0,1.0,2007-09-01 00:00:00,,516.0,...,,N,90.0,N,N,,,,,
4,1,9920.0,,,,9.0,1.0,,,1871.0,...,,N,,N,N,,,,,


In [11]:
# Summarise the aircraft table: row count, per-column non-null counts and
# dtypes. Useful for spotting sparsely populated columns before cleaning.
aircraft_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29157 entries, 0 to 29156
Data columns (total 93 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Aircraft_Key             29157 non-null  int64  
 1   cert_max_gr_wt           19951 non-null  float64
 2   fc_seats                 11273 non-null  float64
 3   cc_seats                 1387 non-null   float64
 4   pax_seats                10435 non-null  float64
 5   total_seats              22995 non-null  float64
 6   num_eng                  24963 non-null  float64
 7   date_last_insp           19274 non-null  object 
 8   afm_hrs_last_insp        7949 non-null   float64
 9   afm_hrs                  18910 non-null  float64
 10  dprt_time                18521 non-null  float64
 11  phase_flt_spec           0 non-null      float64
 12  lchg_date                29157 non-null  object 
 13  rwy_len                  12906 non-null  float64
 14  rwy_width             

## 2. On-time performance data

Data source: [Airline Service Quality Performance 234 (On-Time performance data)](https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time)

### 2.1. Get download links

In [5]:
# BTS page for the on-time performance data (same address as the
# "Data source" link above).
url = 'https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time'

# Ask the project helper for the download links found via that page;
# `links` is sliced in the download cell that follows.
links = data_funcs.get_ontime_links(url)

### 2.2. Download on-time data

In [6]:
# Number of on-time files to work with; the same value is passed to the
# parsing step later in the notebook.
n_files = 3

# Download only the first `n_files` entries of the link list.
data_funcs.download_ontime_data(links[:n_files])

### 2.3. Parse and combine on-time datafiles

In [7]:
# Parse the downloaded .asc files and combine them into one DataFrame.
# The file paths listed in the cell output appear to be printed by the helper.
# NOTE(review): the head()/info() output further down shows data values
# (e.g. 'DL', '3673', '20241121') as column labels and 121 mostly-NaN columns,
# which suggests the first record of a file is being consumed as a header row.
# TODO confirm in data_funcs.read_asc_datafiles.
data_df=data_funcs.read_asc_datafiles(n_files)

../data/raw/ontime.td.202411.asc
../data/raw/ontime.td.202410.asc
../data/raw/ontime.td.202412.asc


In [8]:
# Preview the first five rows of the combined on-time data.
data_df.head()

Unnamed: 0,DL,3673,OO,3673.1,9E,5538,DTW,MSN,20241121,4,...,95,82,148,1617,1712,N915XJ,21,6,55,148.1
0,DL,3673.0,OO,3673.0,9E,5539.0,MSN,DTW,20241121.0,4.0,...,,,,,,,,,,
1,DL,4800.0,,,9E,4800.0,CLT,LGA,20241101.0,5.0,...,,,,,,,,,,
2,DL,4800.0,,,9E,4800.0,CLT,LGA,20241102.0,6.0,...,,,,,,,,,,
3,DL,4800.0,,,9E,4800.0,CLT,LGA,20241103.0,7.0,...,,,,,,,,,,
4,DL,4800.0,,,9E,4800.0,CLT,LGA,20241104.0,1.0,...,,,,,,,,,,


In [9]:
# Summarise the combined frame (row count, column count, dtypes, memory use).
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1902949 entries, 0 to 1902948
Columns: 121 entries, DL to 148.1
dtypes: float64(72), int64(29), object(20)
memory usage: 1.7+ GB


### 2.4. Save as CSV

In [10]:
# Write the combined on-time data next to the other raw files.
# index=False leaves the RangeIndex out of the CSV.
data_df.to_csv(
    '../data/raw/on_time.csv',
    index=False,
)