# Data acquisition

In [1]:
import pandas as pd
import functions.data_acquisition as data_funcs

## 1. NTSB incident data

Data source: [NTSB Downloadable Aviation Datasets](https://data.ntsb.gov/avdata)

### 1.1. Download

In [2]:
url='https://data.ntsb.gov/avdata/FileDirectory/DownloadFile?fileID=C%3A%5Cavdata%5Cavall.zip'
data_funcs.download_data(url)

### 1.2. Parse and save as CSV

Uses [access_parser](https://github.com/claroty/access_parser) to convert Microsoft Access database file to CSV and then read into Pandas DataFrame.

In [3]:
file='avall.mdb'
table=data_funcs.parse_mdb(file)

### 1.3. Check CSV contents

In [4]:
incidents_df=pd.read_csv('../data/raw/aircraft.csv',low_memory=False)
incidents_df.head()

Unnamed: 0,Aircraft_Key,cert_max_gr_wt,fc_seats,cc_seats,pax_seats,total_seats,num_eng,date_last_insp,afm_hrs_last_insp,afm_hrs,...,lchg_userid,afm_hrs_since,rwy_num,site_seeing,air_medical,med_type_flight,fuel_on_board,elt_manufacturer,elt_model,elt_reason_other
0,1,,,,,,,,,,...,gibs-i,N,,N,N,,,,,
1,1,2400.0,,,,4.0,1.0,2007-04-01 00:00:00,75.0,2865.0,...,,Y,7.0,N,N,,,,,
2,2,1960.0,,,,2.0,1.0,2007-01-01 00:00:00,,340.0,...,,Y,7.0,N,N,,,,,
3,1,2850.0,,,,2.0,1.0,2007-09-01 00:00:00,,516.0,...,,N,90.0,N,N,,,,,
4,1,9920.0,,,,9.0,1.0,,,1871.0,...,,N,,N,N,,,,,


In [5]:
incidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29157 entries, 0 to 29156
Data columns (total 93 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Aircraft_Key             29157 non-null  int64  
 1   cert_max_gr_wt           19951 non-null  float64
 2   fc_seats                 11273 non-null  float64
 3   cc_seats                 1387 non-null   float64
 4   pax_seats                10435 non-null  float64
 5   total_seats              22995 non-null  float64
 6   num_eng                  24963 non-null  float64
 7   date_last_insp           19274 non-null  object 
 8   afm_hrs_last_insp        7949 non-null   float64
 9   afm_hrs                  18910 non-null  float64
 10  dprt_time                18521 non-null  float64
 11  phase_flt_spec           0 non-null      float64
 12  lchg_date                29157 non-null  object 
 13  rwy_len                  12906 non-null  float64
 14  rwy_width             

## 2. On-time performance data

Data source: [Airline Service Quality Performance 234 (On-Time performance data)](https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time)

### 2.1. Get download links

In [6]:
url='https://www.bts.gov/browse-statistical-products-and-data/bts-publications/airline-service-quality-performance-234-time'
links=data_funcs.get_ontime_links(url)

### 2.2. Download on-time data

In [7]:
n_files=3

data_funcs.download_ontime_data(links[:n_files])

### 2.3. Parse and combine on-time datafiles

In [8]:
ontime_df=data_funcs.read_asc_datafiles(n_files)

../data/raw/ontime.td.202411.asc
../data/raw/ontime.td.202410.asc
../data/raw/ontime.td.202412.asc


### 2.3. Save as CSV

In [9]:
ontime_df.to_csv('../data/raw/on_time.csv', index=False)

## 3. Feature extraction

### 3.1. Incident data

In [10]:
incidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29157 entries, 0 to 29156
Data columns (total 93 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Aircraft_Key             29157 non-null  int64  
 1   cert_max_gr_wt           19951 non-null  float64
 2   fc_seats                 11273 non-null  float64
 3   cc_seats                 1387 non-null   float64
 4   pax_seats                10435 non-null  float64
 5   total_seats              22995 non-null  float64
 6   num_eng                  24963 non-null  float64
 7   date_last_insp           19274 non-null  object 
 8   afm_hrs_last_insp        7949 non-null   float64
 9   afm_hrs                  18910 non-null  float64
 10  dprt_time                18521 non-null  float64
 11  phase_flt_spec           0 non-null      float64
 12  lchg_date                29157 non-null  object 
 13  rwy_len                  12906 non-null  float64
 14  rwy_width             

In [11]:
pd.set_option('display.max_columns', 200)
incidents_df.head()

Unnamed: 0,Aircraft_Key,cert_max_gr_wt,fc_seats,cc_seats,pax_seats,total_seats,num_eng,date_last_insp,afm_hrs_last_insp,afm_hrs,dprt_time,phase_flt_spec,lchg_date,rwy_len,rwy_width,acft_year,commercial_space_flight,unmanned,ifr_equipped_cert,elt_mounted_aircraft,elt_connected_antenna,ev_id,regis_no,ntsb_no,acft_missing,far_part,flt_plan_filed,flight_plan_activated,damage,acft_fire,acft_expl,acft_make,acft_model,acft_series,acft_serial_no,acft_category,acft_reg_cls,homebuilt,fixed_retractable,type_last_insp,elt_install,elt_oper,elt_aided_loc_ev,elt_type,owner_acft,owner_street,owner_city,owner_state,owner_country,owner_zip,oper_individual_name,oper_name,oper_same,oper_dba,oper_addr_same,oper_street,oper_city,oper_state,oper_country,oper_zip,oper_code,certs_held,oprtng_cert,oper_cert,oper_cert_num,oper_sched,oper_dom_int,oper_pax_cargo,type_fly,second_pilot,dprt_pt_same_ev,dprt_apt_id,dprt_city,dprt_state,dprt_country,dprt_timezn,dest_same_local,dest_apt_id,dest_city,dest_state,dest_country,report_to_icao,evacuation,lchg_userid,afm_hrs_since,rwy_num,site_seeing,air_medical,med_type_flight,fuel_on_board,elt_manufacturer,elt_model,elt_reason_other
0,1,,,,,,,,,,,,2023-08-03 13:29:08,,,,False,False,False,False,False,20080211X00175,N530NA,DFW08RA039,N,NUSN,,,SUBS,,,Eurocopter France,AS350B3,,,HELI,,N,FIXD,,,,,,,,,,,,N,Prism Helicopters,,,,,,,,,,,,,,,,,EXLD,,,,,,,,,,,,,,,gibs-i,N,,N,N,,,,,
1,1,2400.0,,,,4.0,1.0,2007-04-01 00:00:00,75.0,2865.0,2200.0,,2020-09-25 18:05:31,2700.0,45.0,,False,False,False,False,False,20080107X00026,N8037W,SEA08LA057A,N,091,NONE,,MINR,NONE,NONE,PIPER,PA 28-180,,28-2104,AIR,,N,FIXD,ANNL,Y,N,N,,Marshall White,,Sebastopol,CA,USA,95472,N,Timothy Bennett,,,,,Novato,CA,USA,94949.0,,Y,,,,,,,PERS,N,,O69,Petaluma,CA,USA,,,OQ3,Sonoma,CA,USA,,,,Y,7.0,N,N,,,,,
2,2,1960.0,,,,2.0,1.0,2007-01-01 00:00:00,,340.0,2230.0,,2020-09-25 18:05:31,2700.0,45.0,,False,False,False,False,False,20080107X00026,N15EX,SEA08LA057B,N,091,NONE,,SUBS,NONE,NONE,Barnard/Stancil,Glastar,,5496,AIR,,Y,FIXD,COAW,Y,N,N,,Kenneth Couey,,Pioneer,CA,USA,95666,N,Kenneth Couey,,,,,Pioneer,CA,USA,95666.0,,Y,,,,,,,PERS,N,,OQ3,Sonoma,CA,USA,,,O70,Jackson,CA,USA,,,,Y,7.0,N,N,,,,,
3,1,2850.0,,,,2.0,1.0,2007-09-01 00:00:00,,516.0,2100.0,,2020-09-25 18:05:31,5003.0,100.0,,False,False,False,False,False,20080109X00036,N579RM,DFW08CA054,N,091,NONE,N,SUBS,NONE,NONE,Micco Aircraft Company,MAC-145B,,260004,AIR,,N,RETR,ANNL,Y,N,N,,,,Gonzales,TX,USA,78629,Y,,,,,,,,,,,Y,,,,,,,PERS,N,,T20,GONZALES,TX,USA,,,AXH,HOUSTON,TX,USA,,,,N,90.0,N,N,,,,,
4,1,9920.0,,,,9.0,1.0,,,1871.0,400.0,,2020-09-25 18:04:02,,,,False,False,False,False,False,20080107X00027,N398J,DFW08LA055,N,091,IFR,Y,,NONE,NONE,Pilatus,PC-12/45,,398,AIR,,N,RETR,UNK,Y,N,N,,Oklahoma cardiovascular Associates PC,,Oklahoma City,OK,USA,73`120,N,,,,,,,,,,,Y,,,,,,,PERS,N,,PHX,Phoenix,CA,USA,,,PWA,Oklahoma City,OK,USA,,,,N,,N,N,,,,,


In [21]:
incident_features={
    'dprt_time': 'departure_time',
    'dprt_apt_id': 'origin',
    'dest_apt_id': 'destination'
}

extracted_incident_df=incidents_df[incident_features.keys()].copy()
extracted_incident_df=extracted_incident_df.rename(columns=incident_features)
extracted_incident_df['incident']=[1]*len(extracted_incident_df)

extracted_incident_df.to_csv('../data/interim/extracted_incident_data.csv', index=False)

extracted_incident_df.head()

Unnamed: 0,departure_time,origin,destination,incident
0,,,,1
1,2200.0,O69,OQ3,1
2,2230.0,OQ3,O70,1
3,2100.0,T20,AXH,1
4,400.0,PHX,PWA,1


### 3.2. On-time data

In [24]:
ontime_features={
    # 'carrier': 0,
    # 'flight_number': 1,
    'origin': 6,
    'destination': 7,
    # 'date': 8,
    'departure_time': 12    
}

extracted_ontime_df=ontime_df.iloc[:,list(ontime_features.values())].copy()
extracted_ontime_df.columns=ontime_features.keys()
extracted_ontime_df['incident']=[0]*len(extracted_ontime_df)

extracted_ontime_df.head()

extracted_ontime_df.to_csv('../data/interim/extracted_on_time_data.csv', index=False)

### 3.3. Combine incident and on-time data

In [25]:
data_df=pd.concat([extracted_ontime_df, extracted_incident_df], axis=0)
data_df.to_csv('../data/processed/combined_data.csv', index=False)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1932106 entries, 0 to 29156
Data columns (total 4 columns):
 #   Column          Dtype  
---  ------          -----  
 0   origin          object 
 1   destination     object 
 2   departure_time  float64
 3   incident        int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 73.7+ MB
