In [1]:
from enum import Enum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Available locations
class Location(Enum):
    OSLO = "OSL"
    TRONDHEIM = "TRD"

LOCATION = Location.OSLO

# Label Encoding

In [2]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("../data/clean/%s.csv" % LOCATION.value)
print(df.head())


y_original = df["TARGET"].values

X_original = df[[
    'MONTH',
    'DAY_OF_MONTH',
    'WEEKDAY',
    'HOUR',
    'WIND_SPEED', 
    'WIND_DIRECTION', 
    'AIR_TEMPERATURE', 
    'PRECIPITATION', 
    'VISIBILITY'
]]

# Parse string features to numeric values
le = LabelEncoder()
le.fit(df['AIRLINE_IATA'].values)
X_original.insert(2, "AIRLINE_IATA", le.transform(df['AIRLINE_IATA'].values), True) 

le = LabelEncoder()
le.fit(df['FLIGHT_ID'].values)
X_original.insert(2, "FLIGHT_ID", le.transform(df['FLIGHT_ID'].values), True)

le = LabelEncoder()
le.fit(df['GATE_STAND'].astype(str).values)
X_original.insert(2, "GATE_STAND", le.transform(df['GATE_STAND'].astype(str).values), True)

le = LabelEncoder()
le.fit(df['INT_DOM_SCHENGEN'].values)
X_original.insert(2, "INT_DOM_SCHENGEN", le.transform(df['INT_DOM_SCHENGEN'].values), True)


le = LabelEncoder()
le.fit(df['DEP_ARR'].values)
X_original.insert(2, "DEP_ARR", le.transform(df['DEP_ARR'].values), True) 

le = LabelEncoder()
le.fit(df['TO_FROM'].values)
X_original.insert(2, "TO_FROM", le.transform(df['TO_FROM'].values), True)

X_columns = list(X_original.columns)
X_original = X_original.values

X, y = X_original, y_original

   YEAR  MONTH  DAY_OF_MONTH  WEEKDAY  HOUR    DEP_ARR AIRLINE_IATA FLIGHT_ID  \
0  2016      1             1        4     6  DEPARTURE           AF    AF1275   
1  2016      1             1        4     7    ARRIVAL           SK     SK843   
2  2016      1             1        4     7  DEPARTURE           WF     WF124   
3  2016      1             1        4     8    ARRIVAL           AY     AY651   
4  2016      1             1        4     8  DEPARTURE           SK     SK864   

  TO_FROM INT_DOM_SCHENGEN GATE_STAND  PRECIPITATION  WIND_DIRECTION  \
0     CDG                S         39            0.0               0   
1     ARN                S         39            0.2             100   
2     FRO                D          2            0.2             100   
3     HEL                S        185            0.2             100   
4     ARN                S         39            0.2             100   

   VISIBILITY  AIR_TEMPERATURE  WIND_SPEED  TARGET  
0        7000              

# Selecting features (F-scores & Mutual Information)
Adapted from: https://towardsdatascience.com/dont-overfit-how-to-prevent-overfitting-in-your-deep-learning-models-63274e552323


In [8]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import pprint

# For F-scores
selector_f = SelectKBest(f_classif, k=10)
selected_features_f = selector_f.fit_transform(X, y)
f_score_sorted_indexes = (-selector_f.scores_).argsort()

sorted_features_f = {}
for index in f_score_sorted_indexes:
    sorted_features_f[X_columns[index]] = float('{:.2f}'.format(selector_f.scores_[index]))

print('Sorted features based on their F-scores:')
pprint.pprint(sorted_features_f, sort_dicts=False)

# For Mutual Information (MI)
selector_mi = SelectKBest(mutual_info_classif, k=10)
selected_features_mi = selector_mi.fit_transform(X, y)
mi_sorted_indexes = (-selector_mi.scores_).argsort()

sorted_features_mi = {}
for index in mi_sorted_indexes:
    sorted_features_mi[X_columns[index]] = float('{:.4f}'.format(selector_mi.scores_[index]))

print('\nSorted features based on their Mutual Information-score:')
pprint.pprint(sorted_features_mi, sort_dicts=False)



Sorted features based on their F-scores:
{'HOUR': 13980.27,
 'INT_DOM_SCHENGEN': 3796.57,
 'VISIBILITY': 3407.96,
 'FLIGHT_ID': 1852.89,
 'AIRLINE_IATA': 1713.2,
 'WEEKDAY': 1426.94,
 'GATE_STAND': 716.21,
 'MONTH': 328.1,
 'AIR_TEMPERATURE': 235.34,
 'PRECIPITATION': 166.83,
 'WIND_SPEED': 20.93,
 'TO_FROM': 6.08,
 'DAY_OF_MONTH': 2.21,
 'WIND_DIRECTION': 2.01,
 'DEP_ARR': 0.12}

Sorted features based on their Mutual Information-score:
{'INT_DOM_SCHENGEN': 0.0541,
 'AIRLINE_IATA': 0.0415,
 'VISIBILITY': 0.0406,
 'DEP_ARR': 0.0383,
 'FLIGHT_ID': 0.0321,
 'WEEKDAY': 0.021,
 'HOUR': 0.0203,
 'MONTH': 0.0198,
 'TO_FROM': 0.0149,
 'AIR_TEMPERATURE': 0.0101,
 'WIND_SPEED': 0.0098,
 'GATE_STAND': 0.0051,
 'WIND_DIRECTION': 0.0042,
 'DAY_OF_MONTH': 0.0041,
 'PRECIPITATION': 0.0009}
