# Data preparation

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import configuration as config

## 1. Load combined data

In [2]:
data_df=pd.read_csv(config.COMBINED_DATAFILE)

## 2. Combine origin/destination to 'route'

In [3]:
data_df['route'] = data_df['origin'] + ' to ' + data_df['destination']

In [4]:
print(data_df.columns)

Index(['origin', 'destination', 'departure_time', 'tail_number', 'incident',
       'route'],
      dtype='object')


## 3. Encode departure time

In [5]:
import numpy as np

departure_times = data_df['departure_time'].copy() 

def time_to_seconds(time_str):
    parts = time_str.split(':')
    hours = int(parts[0])
    minutes = int(parts[1])
    seconds = int(parts[2]) if len(parts) > 2 else 0
    return hours * 3600 + minutes * 60 + seconds

if departure_times.dtype == 'object':
    total_seconds = departure_times.apply(time_to_seconds)
else:
    total_seconds = departure_times

max_seconds_in_day = 24 * 3600
normalized_time = total_seconds / max_seconds_in_day

data_df['departure_time_sin'] = np.sin(2 * np.pi * normalized_time)
data_df['departure_time_cos'] = np.cos(2 * np.pi * normalized_time)

print(data_df[['departure_time', 'departure_time_sin', 'departure_time_cos']].head())

   departure_time  departure_time_sin  departure_time_cos
0          1338.0            0.097149            0.995270
1           821.0            0.059669            0.998218
2           901.0            0.065476            0.997854
3          1135.0            0.082446            0.996596
4           928.0            0.067435            0.997724


In [6]:
# 1. Combine 'departure_time_sin' and 'departure_time_cos' into a list of tuples
data_df['departure_time'] = list(zip(data_df['departure_time_sin'], data_df['departure_time_cos']))

# 2. Remove the original sine and cosine columns
data_df = data_df.drop(['departure_time_sin', 'departure_time_cos'], axis=1)

# Now your DataFrame 'data_df' will have a single 'departure_time' column
# containing tuples of (sin_value, cos_value).

## 4. Clean up features

In [7]:
data_df = data_df.drop('tail_number', axis=1)

In [8]:
data_df.head()

Unnamed: 0,origin,destination,departure_time,incident,route
0,LGA,ORF,"(0.09714864061033167, 0.9952698838142168)",0,LGA to ORF
1,DTW,MSN,"(0.05966933989164823, 0.9982181975284237)",0,DTW to MSN
2,MSP,BIS,"(0.06547569540580747, 0.9978541643502451)",0,MSP to BIS
3,BIS,MSP,"(0.08244584060375283, 0.9965955465318619)",0,BIS to MSP
4,MSN,DTW,"(0.06743485000226511, 0.9977236796854989)",0,MSN to DTW


## 5. Train-test split

In [14]:
from sklearn.model_selection import train_test_split


X = data_df.drop('incident', axis=1) 
y = data_df['incident']            

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 6. Encode route

In [10]:
# Your code here... Hint: try OrdinalEncoder(), OneHotEncoder() seems like it's the right
# thing to do - but think about why it might be problematic with this data!

## 7. Save

In [11]:
Path(config.PROCESSED_DATA_DIRECTORY).mkdir(exist_ok=True)

data_df.to_csv(config.ENCODED_DATAFILE)
train_df.to_csv(config.TRAINING_DATAFILE)
test_df.to_csv(config.TESTING_DATAFILE)

NameError: name 'train_df' is not defined