# Data acquisition

In [1]:
import pandas as pd
import functions.data_acquisition as data_funcs

## 1. NTSB incident data

Data source: [NTSB Downloadable Aviation Datasets](https://data.ntsb.gov/avdata)

### 1.1. Download

In [2]:
url='https://data.ntsb.gov/avdata/FileDirectory/DownloadFile?fileID=C%3A%5Cavdata%5Cavall.zip'
data_funcs.download_data(url)

### 1.2. Parse

Uses [access_parser](https://github.com/claroty/access_parser) to convert Microsoft Access database file to CSV and then read into Pandas DataFrame.

In [16]:
file='avall.mdb'
table=data_funcs.parse_mdb(file)


### 1.3. Read

In [13]:
aircraft_df=pd.read_csv('../data/raw/aircraft.csv',low_memory=False)
aircraft_df.head(92)

Unnamed: 0,Aircraft_Key,cert_max_gr_wt,fc_seats,cc_seats,pax_seats,total_seats,num_eng,date_last_insp,afm_hrs_last_insp,afm_hrs,...,lchg_userid,afm_hrs_since,rwy_num,site_seeing,air_medical,med_type_flight,fuel_on_board,elt_manufacturer,elt_model,elt_reason_other
0,1,,,,,,,,,,...,gibs-i,N,,N,N,,,,,
1,1,2400.0,,,,4.0,1.0,2007-04-01 00:00:00,75.0,2865.000000,...,,Y,7,N,N,,,,,
2,2,1960.0,,,,2.0,1.0,2007-01-01 00:00:00,,340.000000,...,,Y,7,N,N,,,,,
3,1,2850.0,,,,2.0,1.0,2007-09-01 00:00:00,,516.000000,...,,N,090,N,N,,,,,
4,1,9920.0,,,,9.0,1.0,,,1871.000000,...,,N,,N,N,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,1,500.0,,,,1.0,1.0,,,0.000000,...,,N,11,N,N,,,,,
88,1,,,,,,1.0,,,,...,,N,34R,N,N,,,,,
89,1,,,,,,1.0,,,,...,,N,18,N,N,,,,,
90,1,,,,,,1.0,2008-01-01 00:00:00,,6472.100098,...,,N,,N,N,,,,,


In [None]:
events_df=pd.read_csv('../data/raw/events.csv',low_memory=False)
events_df.head()

## 2. On-time performance data

### 2.1. Get download links

In [None]:
url='https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time'
links=data_funcs.get_ontime_links(url)

### 2.2. Download on-time data

In [None]:
n_files=3

data_funcs.download_ontime_data(links[:n_files])

### 2.3. Parse and combine on-time datafiles

In [None]:
data_df=data_funcs.read_asc_datafiles(n_files)

In [None]:
data_df.head()

In [None]:
data_df.info()

### 2.3. Save as CSV

In [None]:
data_df.to_csv('../data/raw/on_time.csv', index=False)