# Imports

In [77]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import holidays

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector as selector
from sklearn.feature_extraction import DictVectorizer

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV


from xgboost import XGBRegressor
import xgboost as xgb


# CSV Reads

In [78]:
# read csv
# Single place to change when the data lives somewhere else; every file below
# is resolved relative to this directory.
DATA_DIR = Path("/home/antoine/projects/forecasting/data")

# low_memory=False makes pandas read the whole file before inferring column
# types, instead of inferring them chunk by chunk.
data_origin = pd.read_csv(DATA_DIR / "KIX_AODB_data.csv", low_memory=False)

# Airport master: its "ICAO" and "Country" columns are used during cleaning.
data_airports = pd.read_csv(DATA_DIR / "AODB_airport_master.csv")

# Country master: its "Name" and "ISO-3166-1 alpha-2" columns are used during cleaning.
data_countries = pd.read_csv(DATA_DIR / "AODB_country_master.csv")


In [None]:
# Every CSV in the "planned flights" folder. Path.glob() yields entries in an
# unspecified, filesystem-dependent order, so sort them to keep the list (and
# anything built from it) identical from one run to the next.
all_files = sorted(
    Path("/home/antoine/projects/forecasting/data/planned flights").glob("*.csv")
)
all_files

In [None]:
# Read each listed CSV and stack the frames into one, renumbering rows from 0.
df = pd.concat(map(pd.read_csv, all_files), ignore_index=True)

# Data cleaning

In [79]:
# select only useful columns
data = data_origin[
    [
        "Service Type",  # string
        "Traffic Type",  # string
        "Capacity",  # to convert to int
        "L Board Pax",  # int already
        "Direction",  # string
        "Date",  # date to convert to int for year/month/date
        "time",  # time to convert to int for hour
        "Routing-FirstLeg",  # string, should be country
    ]
].copy()

# change capacity to numerical
# This has to happen BEFORE the filter below: while the column still holds
# text, `"0" != 0` is always True, so zero-capacity rows would slip through
# and later produce an infinite Load Factor. Unparseable values become NaN
# (errors="coerce").
data["Capacity"] = pd.to_numeric(data["Capacity"], errors="coerce")

# filter out rows with irrelevant values:
# keep service types C, G and J only, and drop rows whose capacity or boarded
# passenger count is exactly 0
mask = (
    data["Service Type"].isin(["C", "G", "J"])
    & (data["Capacity"] != 0)
    & (data["L Board Pax"] != 0)
)
data = data[mask].copy()

# split date into year month day
# The vectorised .dt accessors replace a Python lambda called once per row.
data["Date"] = pd.to_datetime(data["Date"])
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# change time to number of hour (int)
data["Hour"] = pd.to_datetime(data["time"]).dt.hour

# Remove every row that has a missing value in any column, then store both
# counts as integers.
data = data.dropna().astype({"Capacity": "int", "L Board Pax": "int"})

# Load Factor = L Board Pax / Capacity
data["Load Factor"] = data["L Board Pax"].div(data["Capacity"])

# change routing to Country name then to Country code
# Plain lookup dicts built column-to-column. If a key is listed more than once
# the last row wins, which is what the transpose + to_dict(orient="records")
# route produced as well, but without its "columns are not unique" warning.
icao_to_country_name = dict(zip(data_airports["ICAO"], data_airports["Country"]))
country_name_to_code = dict(
    zip(data_countries["Name"], data_countries["ISO-3166-1 alpha-2"])
)

# Two-step lookup: ICAO code -> country name -> ISO alpha-2 code.
# Values missing from either dict end up as NaN.
data["Country"] = (
    data["Routing-FirstLeg"].map(icao_to_country_name).map(country_name_to_code)
)

# holidays
# One holiday calendar per country code that the `holidays` package exposes as
# an attribute. Missing codes (NaN) are dropped first, because hasattr() only
# accepts string attribute names and would raise on NaN.
dct_holiday = {
    country_code: holidays.country_holidays(country_code)
    for country_code in data["Country"].dropna().unique()
    if hasattr(holidays, country_code)
}
# The domestic flag always needs the Japanese calendar, even when no row maps
# to "JP" (indexing dct_holiday["JP"] would otherwise raise KeyError).
if "JP" not in dct_holiday:
    dct_holiday["JP"] = holidays.country_holidays("JP")

# domestic holiday: 1 when the flight date is a holiday in Japan, else 0
data["HolidayJP"] = (
    data["Date"].map(lambda day: day in dct_holiday["JP"]).astype("int64")
)

# overseas holiday: 1 when the flight date is a holiday in the row's own
# country; rows whose country has no calendar keep 0.
# Filled one country at a time instead of one row at a time (iterrows + .loc
# writes a single cell per iteration and is very slow on large frames).
data["HolidayOrigin"] = 0
for country_code, calendar in dct_holiday.items():
    from_country = data["Country"] == country_code
    if from_country.any():
        data.loc[from_country, "HolidayOrigin"] = (
            data.loc[from_country, "Date"]
            .map(lambda day: day in calendar)
            .astype("int64")
        )

# Remove the source columns that are no longer needed as features.
data = data.drop(
    columns=["L Board Pax", "Capacity", "Routing-FirstLeg", "Date", "time"]
)

# Store the remaining string columns with the pandas "category" dtype.
data = data.astype(
    {
        column: "category"
        for column in ("Service Type", "Traffic Type", "Direction", "Country")
    }
)


  .T.to_dict(orient="records")


In [None]:
# Print a summary of `data`: column names, non-null counts, dtypes and memory use.
data.info()

# Pipeline creation and first fit

In [81]:
# split dataset: "Load Factor" is the target, every other column is a feature
X = data.drop("Load Factor", axis=1)
y = data["Load Factor"]
# fixed random_state so the train/test split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# pipeline creation
# non-categorical columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
# categorical columns: one-hot encode; handle_unknown="ignore" lets transform()
# accept categories that were not present when the encoder was fitted
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
# route each column to a transformer by dtype ("category" vs everything else)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)
# Use this one estimator as the pipeline's final step, so that `model` and
# regressor["regressor"] are the same object rather than two separate
# XGBRegressor instances of which only one is ever fitted.
model = XGBRegressor()
regressor = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", model)])


In [83]:
# Fit the whole pipeline on the training split, then report its score on the
# training split and on the held-out split.
regressor.fit(X_train, y_train)
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
print("training score: %.3f" % train_score)
print("model score: %.3f" % test_score)


training score: 0.979
model score: 0.534


In [None]:
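# Disabled feature-importance plot, kept for reference: it labels the booster's
# features with the preprocessor's output names before plotting.
# regressor["regressor"].get_booster().feature_names = list(
#     regressor["preprocessor"].get_feature_names_out()
# )
# xgb.plot_importance(regressor["regressor"], ax=plt.gca())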


# Hyper-parameters tuning

In [93]:
# Candidate values for the pipeline's "regressor" step; the "regressor__"
# prefix routes each parameter to that step. 3 x 3 x 3 x 3 = 81 combinations.
hyperparameter_grid = {
    "regressor__n_estimators": [100, 400, 800],
    "regressor__max_depth": [3, 6, 9],
    "regressor__learning_rate": [0.05, 0.1, 0.20],
    "regressor__min_child_weight": [1, 10, 100],
}

# Exhaustive search with 4-fold cross-validation on the training split.
# fit() returns the search object itself, so gridCV is the fitted search.
gridCV = GridSearchCV(
    estimator=regressor,
    param_grid=hyperparameter_grid,
    cv=4,
).fit(X_train, y_train)

print("Best parameters set found on development set:", gridCV.best_params_, sep="\n")
print("Best score found on development set:", gridCV.best_score_, sep="\n")


# Test model on future schedule

In [None]:
# TODO: make predictions on the future (planned) flights.
# TODO: plot total passengers (pax), split by departure/arrival and by
#       international/domestic, for NOV=>JAN.