In [1]:
import pandas as pd
import sqlite3
import numpy as np

# Let wide frames render up to 500 columns instead of being elided.
pd.options.display.max_columns = 500

## Get most complete data sources and remove duplicates

In [2]:
# Open the two batch databases and the test database, then load the fuel prices.
conn_1, conn_2, conn_test = map(
    sqlite3.connect,
    ('../data/raw/batch_1.db', '../data/raw/batch_2.db', '../data/raw/test.db'),
)
fuel = pd.read_parquet('../data/raw/fuel.parquet')

In [3]:
# Read the full `vols` (flights) table from each batch database.
vols_1, vols_2 = (
    pd.read_sql_query("SELECT * FROM vols", connection)
    for connection in (conn_1, conn_2)
)

In [4]:
# Stack both batches into a single flights table. Each batch is read with its
# own default 0..n-1 index, so without ignore_index=True the combined frame
# would carry repeated row labels; a fresh index keeps labels unique.
vols = pd.concat([vols_1, vols_2], ignore_index=True)

In [5]:
# Preview the first rows of the combined flights table.
vols.head()

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
0,1259209,4661,a02782cd75,CEB,AAL,1707,1658.0,-9.0,20.0,1718.0,67.0,71.0,45.0,232,1803.0,6.0,1814,1809.0,-5.0,0,0,,,,,,,MAF,379,15/8/2018,10
1,4886177,5026,707f6ea54f,GOI,LTK,600,553.0,-7.0,11.0,604.0,130.0,119.0,91.0,738,835.0,17.0,910,852.0,-18.0,0,0,,,,,,,I6F,9,2/11/2016,10
2,183332,2021,b116987956,DSS,JNB,1749,1747.0,-2.0,9.0,1756.0,248.0,228.0,215.0,1671,1831.0,4.0,1857,1835.0,-22.0,0,0,,,,,,,NVPPA,2491,9/6/2017,10
3,937517,1320,a4b8db63f5,AGP,GOA,2301,2322.0,21.0,19.0,2341.0,65.0,89.0,59.0,214,40.0,11.0,6,51.0,45.0,0,0,,24.0,0.0,0.0,6.0,15.0,NVPPA,1241,26/5/2018,10
4,2157498,508,34604053c0,BRU,BOD,612,603.0,-9.0,13.0,616.0,302.0,259.0,238.0,2288,1314.0,8.0,1414,1322.0,-52.0,0,0,,,,,,,THA,78,10/11/2018,10


In [6]:
# Airports are taken from the second batch database only.
aeroports_2 = pd.read_sql_query(sql="SELECT * FROM aeroports", con=conn_2)

In [7]:
# Keep a single row per IATA code; when a code repeats, the last row wins.
aeroports = aeroports_2.drop_duplicates(
    subset='CODE IATA',  # a single label is equivalent to a one-element list
    keep='last',
)

In [8]:
# Preview the de-duplicated airports table.
aeroports.head()

Unnamed: 0,CODE IATA,NOM,LIEU,PAYS,LONGITUDE,LATITUDE,HAUTEUR,PRIX RETARD PREMIERE 20 MINUTES,PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES
0,MCT,Muscat International Airport,Muscat,OM,58.284400939941406,23.593299865722656,48.0,53,3
1,SOU,Southampton Airport,Southampton,GB,-1.3567999601364136,50.95029830932617,44.0,24,5
2,PNH,Phnom Penh International Airport,Phnom Penh,KH,104.84400177001952,11.546600341796877,40.0,33,3
3,BLR,Kempegowda International Airport,Bangalore,IN,77.706299,13.1979,3000.0,70,9
4,FFD,RAF Fairford,Fairford,GB,-1.7900300025900002,51.6822013855,286.0,65,3


In [9]:
# Airlines are taken from the second batch database only.
compagnies = pd.read_sql_query(sql="SELECT * FROM compagnies", con=conn_2)

In [10]:
# Preview the airlines table.
compagnies.head()

Unnamed: 0,COMPAGNIE,CODE,NOMBRE D EMPLOYES,CHIFFRE D AFFAIRE
0,Try Hard Airlines,THA,12909,2310000000
1,Corporate Overlord Airways,COA,130595,40579000000
2,Neverland Airlines,,18567,7651000000
3,Always A Problem Flights,AAPF,92000,51000000000
4,Overpriced Air,OA,12200,6649000000


In [11]:
# NOTE(review): `fuel` was already read from this same parquet file in the cell
# that opens the database connections; this second read looks redundant.
fuel = pd.read_parquet('../data/raw/fuel.parquet')

In [12]:
# Preview the fuel price table (a date column and a barrel price column).
fuel.head()

Unnamed: 0,DATE,PRIX DU BARIL
0,04/01/16,57.78
1,05/01/16,56.365
2,06/01/16,55.07
3,07/01/16,53.69
4,08/01/16,53.23


## Prepare data for aggregation

In [18]:
import sys

# Make the project's `src` directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
if '../src' not in sys.path:
    sys.path.append('../src')

In [22]:
from data_preparation import *
from feature_engineering import *

In [23]:
# Rename the columns of every table via the project helper. The optional second
# argument looks like a per-table column prefix (TODO confirm in data_preparation).
vols_new_col_names = rename_dataframe_columns(vols)
compagnies_new_col_names, fuel_new_col_names = (
    rename_dataframe_columns(frame, prefix)
    for frame, prefix in ((compagnies, 'compagnies_'), (fuel, 'fuel_'))
)
# The airports table is renamed twice: once for the departure side and once for
# the arrival side of a flight.
depart_aeroports, arrivee_aeroports = (
    rename_dataframe_columns(aeroports, prefix)
    for prefix in ('depart_', 'arrivee_')
)

In [25]:
# Build the prepared fuel-price table from the renamed fuel frame. The actual
# preparation logic lives in the project helper `prepare_fuel_time_series`
# and is not visible in this notebook.
fuel_prepared = prepare_fuel_time_series(fuel_new_col_names)

In [26]:
# Parse the day/month/year strings in `date` (e.g. 15/8/2018) into datetimes,
# overwriting the column in place on `vols_new_col_names`.
vols_new_col_names['date'] = pd.to_datetime(vols_new_col_names['date'], format='%d/%m/%Y')

## Aggregate data

In [None]:
# Step 1 of the aggregation: combine flights with the airlines table.
# (Join keys are chosen inside the project helper — not visible here.)
flights_with_airlines = merge_flights_with_airlines(vols_new_col_names, compagnies_new_col_names)

In [None]:
# Step 2: add the departure-side airport columns (the `depart_`-prefixed frame).
flights_with_departures = merge_flights_with_departures_airports(flights_with_airlines, depart_aeroports)

In [None]:
# Step 3: add the arrival-side airport columns (the `arrivee_`-prefixed frame).
flights_with_arrivals = merge_flights_with_arrivals_airports(flights_with_departures, arrivee_aeroports)

In [None]:
# Step 4: add the prepared fuel prices, producing the fully aggregated table.
# Presumably matched on the flight date — TODO confirm in merge_flights_with_fuel.
flights_agg = merge_flights_with_fuel(flights_with_arrivals, fuel_prepared)

In [None]:
# Preview the aggregated flights table after all four merges.
flights_agg.head()

## Feature engineering

In [None]:
# Columns that are only known once the flight has happened (actual times, delay
# breakdowns, cancellation/diversion flags). They are removed so the model cannot
# see information that reveals the target.
# NOTE: spellings such as 'retart_de_depart' mirror the raw column headers
# (e.g. "RETART DE DEPART") and are intentional.
# The original list contained 'retard_avion' twice; the redundant entry is removed.
POSSIBLE_LEAK_COLUMNS = [
    'heure_de_depart',
    'retart_de_depart',
    'temps_de_deplacement_a_terre_au_decollage',
    'decollage',
    'temps_de_vol',
    'temps_passe',
    'atterrissage',
    "temps_de_deplacement_a_terre_a_l'atterrissage",
    "heure_d'arrivee",
    'detournement',
    'annulation',
    "raison_d'annulation",
    'retard_system',
    'retard_securite',
    'retard_compagnie',
    'retard_avion',
    'retard_meteo',
]

# Human-readable names that duplicate information already carried by the codes.
DUPLICATED_DATA = ['compagnies_compagnie', 'depart_nom', 'arrivee_nom']

# Delay-pricing columns from the airports table, plus the flight identifier.
# ('pris_retard...' mirrors the raw header "PRIS RETARD ...".)
OTHER_COLUMNS_TO_DROP = [
    'depart_prix_retard_premiere_20_minutes',
    'depart_pris_retard_pour_chaque_minute_apres_10_minutes',
    'arrivee_prix_retard_premiere_20_minutes',
    'arrivee_pris_retard_pour_chaque_minute_apres_10_minutes',
    'identifiant',
]

# Everything that is removed from the aggregated frame before feature creation.
COLUMNS_TO_DROP = POSSIBLE_LEAK_COLUMNS + DUPLICATED_DATA + OTHER_COLUMNS_TO_DROP

In [None]:
# Remove leak-prone, duplicated and otherwise unwanted columns in one pass.
flights_removed_col = flights_agg.drop(COLUMNS_TO_DROP, axis='columns')

In [None]:
# Explicitly import the feature-engineering helpers used in the cells below.
# A multi-line `from ... import` list has to be wrapped in parentheses; the
# previous unparenthesised form ending in a trailing comma was a SyntaxError.
# The module is imported as `feature_engineering` (not `src.feature_engineering`)
# because the notebook puts `../src` itself on sys.path, which is also how the
# earlier wildcard import of the same module resolves it.
from feature_engineering import (
    convert_latitude_longitude_to_float,
    create_columns_from_date,
    create_hour_column_from_departure_time,
)

In [None]:
# Derive new columns from the flight date (exact columns are defined by the
# project helper `create_columns_from_date` — not visible here).
flights_new_col = create_columns_from_date(flights_removed_col)

In [None]:
# Convert the airport latitude/longitude columns to floats via the project helper.
flights_converted_lat = convert_latitude_longitude_to_float(flights_new_col)

In [None]:
# Add an hour column derived from the departure time via the project helper.
# Presumably based on the scheduled departure, since the actual departure time
# is among the dropped columns — TODO confirm.
flights_dep_hour = create_hour_column_from_departure_time(flights_converted_lat)

In [None]:
# Preview the frame after all feature-engineering steps.
flights_dep_hour.head()

### Deal with NaN data

In [None]:
# Row/column count before dropping missing values (compare with the shape below).
flights_dep_hour.shape

In [None]:
# Discard every row that has a missing value in any column.
flights_no_na = flights_dep_hour.dropna(axis='index', how='any')

In [None]:
# Row/column count after dropping rows with missing values.
flights_no_na.shape

## Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [None]:
# Non-numeric columns that get integer-encoded before training.
CATEGORICAL_COLUMNS = [
    # aircraft and airline
    'code_avion',
    'compagnies_code',
    # departure airport
    'depart_code_iata',
    'depart_lieu',
    'depart_pays',
    # arrival airport
    'arrivee_code_iata',
    'arrivee_lieu',
    'arrivee_pays',
]

# Arrival delay column; it is binarised into the classification label later on.
TARGET_COLUMN = "retard_a_l'arrivee"

In [None]:
# Model on every remaining column: the categorical ones are kept (not dropped)
# and are label-encoded in a later cell.
data = flights_no_na

In [None]:
# Features: everything except the arrival-delay column.
X = data.drop(columns=TARGET_COLUMN)
# Label: 1 when the flight arrived late (delay strictly positive), otherwise 0.
y = data[TARGET_COLUMN].gt(0).astype('int64')

In [None]:
# Replace each categorical column with integer codes, using a fresh encoder per
# column. LabelEncoder expects a 1-D array-like, so the column is passed as-is;
# the previous `.values.reshape(-1, 1)` handed it a column vector, which
# scikit-learn only accepts after emitting a DataConversionWarning.
# NOTE(review): the fitted encoders are not kept (only the last one survives in
# `le`), so the same mapping cannot be re-applied to new data later.
for column in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

In [None]:
# Inspect the feature matrix after label encoding.
X

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Alternative estimator, currently disabled:
#model = RandomForestClassifier(n_estimators=10, max_depth=10, n_jobs=-1, verbose=2, random_state=42)
#model.fit(X_train, y_train)

# Small gradient-boosted classifier: 10 shallow trees with a low learning rate.
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=10,
    learning_rate=0.05,
    verbosity=3,
)
gbm.fit(X_train, y_train)

In [None]:
# Share of held-out flights whose late/on-time label is predicted correctly.
accuracy_score(y_true=y_test, y_pred=gbm.predict(X_test).round())

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted classifier's feature importances.
# The importances are read from `gbm`, the estimator that is actually trained in
# this notebook; the previous reference to `model` pointed at a name that only
# exists in commented-out code and raised a NameError on a fresh kernel.
features = X.columns
importances = gbm.feature_importances_
# Ascending order: barh draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart.
indices = np.argsort(importances)

plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')