# Data preparation

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import configuration as config

## 1. Load combined data

In [2]:
data_df=pd.read_csv(config.COMBINED_DATAFILE)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668357 entries, 0 to 668356
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   origin          668357 non-null  object 
 1   destination     668357 non-null  object 
 2   departure_time  668357 non-null  float64
 3   tail_number     668357 non-null  object 
 4   incident        668357 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 25.5+ MB


## 2. Combine origin/destination to 'route'

In [3]:
data_df['route'] = data_df['origin'] + ' to ' + data_df['destination']

## 3. Encode departure time

In [None]:
import numpy as np

# departure_times = data_df['departure_time'].copy() 

# def time_to_seconds(time_str):
#     parts = time_str.split(':')
#     hours = int(parts[0])
#     minutes = int(parts[1])
#     seconds = int(parts[2]) if len(parts) > 2 else 0
#     return hours * 3600 + minutes * 60 + seconds

# if departure_times.dtype == 'object':
#     total_seconds = departure_times.apply(time_to_seconds)
# else:
#     total_seconds = departure_times

# max_seconds_in_day = 24 * 3600
# normalized_time = total_seconds / max_seconds_in_day

# Departure time is already a float, storing the 24 hr clock time, don't need 
# to try and parse it.'''

data_df['departure_time_sin'] = np.sin(2 * np.pi * data_df['departure_time']/2400)
data_df['departure_time_cos'] = np.cos(2 * np.pi * data_df['departure_time']/2400)

print(data_df[['departure_time', 'departure_time_sin', 'departure_time_cos']].head())

   departure_time  departure_time_sin  departure_time_cos
0          1732.0           -0.984196           -0.177085
1          1730.0           -0.983255           -0.182236
2          1726.0           -0.981293           -0.192522
3          1734.0           -0.985109           -0.171929
4          1730.0           -0.983255           -0.182236


In [5]:
# Shouldn't combine the sin & cos columns here - this results in a two member
# list in each cell - models can't take that, they need a number.

# data_df['departure_time'] = list(zip(data_df['departure_time_sin'], data_df['departure_time_cos']))
# data_df = data_df.drop(['departure_time_sin', 'departure_time_cos'], axis=1)

## 4. Clean up features

In [6]:
# Added drop of un-concatenated origin and destination and original departure time.
data_df = data_df.drop(['tail_number','origin', 'destination', 'departure_time'], axis=1)

In [7]:
data_df.head()

Unnamed: 0,incident,route,departure_time_sin,departure_time_cos
0,0,CLT to LGA,-0.984196,-0.177085
1,0,CLT to LGA,-0.983255,-0.182236
2,0,CLT to LGA,-0.981293,-0.192522
3,0,CLT to LGA,-0.985109,-0.171929
4,0,CLT to LGA,-0.983255,-0.182236


## 5. Train-test split

In [8]:
from sklearn.model_selection import train_test_split

# Cleaner to keep the features and labels together here - less files to
# save and load later

# X = data_df.drop('incident', axis=1) 
# y = data_df['incident']            

train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

## 6. Encode route

In [9]:
from sklearn.preprocessing import OrdinalEncoder

encoder=OrdinalEncoder()

data_df['route']=encoder.fit(data_df['route'].to_frame())
train_df['route']=encoder.transform(train_df['route'].to_frame())
test_df['route']=encoder.transform(test_df['route'].to_frame())

## 7. Save

In [10]:
Path(config.PROCESSED_DATA_DIRECTORY).mkdir(exist_ok=True)

data_df.to_csv(config.ENCODED_DATAFILE)

train_df.to_csv(config.TRAINING_DATAFILE)
test_df.to_csv(config.TESTING_DATAFILE)