In [1]:
import pickle
import pandas as pd
import utils
import os
import numpy as np
import xgboost
from joblib import dump, load
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from scipy.special import softmax

In [9]:
# Locations of the raw competition files and of the preprocessed per-airport CSVs.
data_dir = './data'
preprocessed_data_dir = './preprocessed_data/'

# NOTE: despite the `_dir` suffix, these two names hold file paths, not directories.
label_dir = os.path.join(data_dir, "prescreened_train_labels.csv.bz2")
predict_dir = os.path.join(data_dir, "partial_submission_format.csv")

# The submission format file is used further down to enumerate the airports
# and the configurations listed for each of them.
predictions = pd.read_csv(predict_dir, parse_dates=["timestamp"])

# Working copy with a fresh 0..n-1 index and the `active` column blanked out.
submission = (
    predictions
    .copy()
    .reset_index(drop=True)
    .assign(active=np.nan)
)

In [10]:
# Map each airport code to the array of configurations listed for it in the
# submission template, and record the airport codes in order of first appearance
# (sort=False keeps file order).
#
# Group by the scalar column name rather than the one-element list ["airport"]:
# with a list grouper, pandas >= 2.0 yields 1-tuples such as ('katl',) as group
# keys, which would break the string concatenation and dict lookups that use
# `airport_codes` in later cells.
grouped = submission.groupby("airport", sort=False)
unique_config_maps = {
    airport: group["config"].unique()
    for airport, group in grouped
}
airport_codes = list(unique_config_maps)

In [35]:
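# Code to prepare the datasets from the raw per-airport directories (builds `airport_train_map`).
# Kept commented out: a later cell loads `airport_train_map` from the CSVs in `preprocessed_data_dir` instead.
# Re-enabling it requires `from pathlib import Path`, which the import cell does not provide.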
# feature_directory = Path("../../data")
# airport_directories = sorted(path for path in feature_directory.glob("k*"))
#
# airport_train_map = {}
# airport_codes = []
# for airport_directory in sorted(airport_directories):
#     airport_code, airport_config_df = utils.read_airport_configs(airport_directory)
#     #airport_config_df_map[airport_code] = airport_config_df
#     airport_config_df = utils.clean_airport_configs(airport_config_df, unique_config_maps[airport_code], airport_code)
#
#     print(len(airport_config_df.airport_config.unique()))
#     airport_codes.append(airport_code)
#
#     airport_code, airport_weather_df = utils.read_airport_weather_data(airport_directory)
#     #airport_weather_df_map[airport_code] = airport_weather_df
#
#     airport_label_df = utils.split_labels_into_airports(label_dir, airport_code)
#     airport_train_map[airport_code] = utils.create_train_dataset(airport_label_df, airport_weather_df, airport_config_df)
#     print(airport_code)

['katl',
 'kclt',
 'kden',
 'kdfw',
 'kjfk',
 'kmem',
 'kmia',
 'kord',
 'kphx',
 'ksea',
 'open_train_labels.csv',
 'open_train_labels.csv.bz2',
 'partial_submission_format.csv',
 'prescreened_train_labels.csv.bz2']

In [93]:
def split_data(dataframe, unique_labels, test_size=0.2, random_state=1234):
    """Split a frame into train/test sets while forcing every label into train.

    The first row seen for each distinct ``true_labels`` value is withheld from
    the random split and appended to the training set afterwards. For every
    label in ``unique_labels`` that never occurs in ``dataframe``, one randomly
    sampled NaN-free row is relabelled and appended to the training set too.

    Args:
        dataframe: Frame with a ``true_labels`` column (and optionally a
            ``previous_config`` column).
        unique_labels: Iterable of every label the training set must contain.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed used both for sampling the fill rows and for
            ``train_test_split``, so the whole split is reproducible.

    Returns:
        ``(train_set, test_set)``. ``train_set`` is the random training part
        followed by the withheld per-label rows and any relabelled fill rows.

    Raises:
        ValueError: If more labels are missing than there are NaN-free rows
            available to sample from.
    """
    # Positional mask of the first occurrence of each label. Unlike masking with
    # `~dataframe.isin(...)` followed by `dropna(how='all')`, this keeps the
    # original column dtypes and also works when index labels repeat.
    is_first_of_label = ~dataframe.duplicated(subset=['true_labels'])
    unique_config_rows = dataframe[is_first_of_label].copy()
    # Rows that are entirely NaN are discarded, as before.
    dataframe = dataframe[~is_first_of_label].dropna(how='all')

    missing_configs = set(unique_labels) - set(unique_config_rows['true_labels'])
    print(missing_configs)

    # Fill any missing configs with randomly chosen, relabelled rows.
    # Not the most elegant, but works.
    if missing_configs:
        # Seeded so that repeated runs pick the same rows.
        fill_rows = dataframe.dropna(axis=0).sample(n=len(missing_configs),
                                                    random_state=random_state)
        # Set iteration order is not stable across interpreter runs; sort so the
        # row-to-label pairing is deterministic.
        ordered_configs = sorted(missing_configs, key=str)
        # `assign` builds a new frame, avoiding writes into a slice of `dataframe`.
        fill_rows = fill_rows.assign(true_labels=ordered_configs)
        if 'previous_config' in fill_rows.columns:
            fill_rows = fill_rows.assign(previous_config=ordered_configs)
        unique_config_rows = pd.concat([unique_config_rows, fill_rows])
        # NOTE(review): the sampled rows stay in `dataframe` with their original
        # labels, so an original may land in the test set while its relabelled
        # copy is in the training set -- confirm this is acceptable.

    train_set, test_set = train_test_split(dataframe, test_size=test_size, random_state=random_state)
    train_set = pd.concat([train_set, unique_config_rows])
    return train_set, test_set


def pre_pipeline_data(dataframe):
    """Separate features from labels, discarding timestamps and incomplete rows.

    The four timestamp columns are removed first, then every row that still
    contains a missing value is dropped, and only then is ``true_labels``
    split off so that features and labels stay aligned row for row.

    Returns:
        ``(features, labels)`` -- the remaining columns without ``true_labels``
        and the ``true_labels`` Series for the same rows.
    """
    timestamp_columns = [
        'time_pred_made',
        'target_pred_timestamp',
        'time_forecast_made',
        'forecast_timestamp',
    ]
    complete_rows = dataframe.drop(columns=timestamp_columns).dropna(axis=0)

    # Labels are taken after the NaN filter so they match the kept rows.
    labels = complete_rows['true_labels']
    features = complete_rows.drop(columns=['true_labels'])
    return features, labels

def make_uniform(num_vals):
    """Return a 1-D array of ``num_vals`` equal weights produced by a softmax.

    The softmax of a constant vector gives every entry the same share, so the
    entries sum to one.
    """
    return softmax(np.ones((num_vals,)))


def divide(arr):
    """Scale ``arr`` by the sum of its elements (for a 1-D array: entries then sum to one)."""
    total = sum(arr)
    return arr / total


def merge_preds(pred_matrix):
    """Blend each row of ``pred_matrix`` with a uniform vector and renormalise.

    Every row is mixed as 95% prediction plus 5% of the equal-weight vector
    from ``make_uniform`` (one entry per column), then passed through
    ``divide`` so the row is rescaled by its own sum.

    ``pred_matrix`` must be 2-D: the column count is read from ``shape[1]``.
    """
    n_columns = pred_matrix.shape[1]
    uniform_part = .05*make_uniform(n_columns)
    blended = uniform_part + .95*pred_matrix
    # Renormalise row by row (axis 1).
    return np.apply_along_axis(divide, 1, blended)

In [82]:
# Load the preprocessed training table of every airport from
# `<preprocessed_data_dir><code>train.csv`. The first CSV column becomes the
# index and the four timestamp columns are parsed as datetimes.
airport_train_map = {
    code: pd.read_csv(
        f"{preprocessed_data_dir}{code}train.csv",
        parse_dates=['time_pred_made', 'target_pred_timestamp', 'time_forecast_made', 'forecast_timestamp'],
        index_col=0,
    )
    for code in airport_codes
}

  exec(code_obj, self.user_global_ns, self.user_ns)


In [30]:
# Per-airport artefacts, all keyed by airport code.
model_map = {}          # fitted VotingClassifier
data_pipeline_map = {}  # fitted feature pipeline
label_enc_map = {}      # label encoder returned by utils.get_pipeline
train_data_map = {}     # (x_train, y_train) before the pipeline is applied
test_data_map = {}      # (x_test, y_test) before the pipeline is applied

# Train and evaluate one soft-voting ensemble (XGBoost + random forest) per airport.
for airport in airport_codes:
    current_data = airport_train_map[airport]
    current_label_configs = unique_config_maps[airport]
    train_set, test_set = split_data(current_data, current_label_configs, test_size=0.2, random_state=1234)

    x_train, y_train = pre_pipeline_data(train_set)
    x_test, y_test = pre_pipeline_data(test_set)

    # NOTE(review): `label_enc` is only ever `.transform`-ed here, so
    # utils.get_pipeline presumably returns it already fitted on
    # `current_label_configs` -- confirm in utils.
    data_pipeline, label_enc = utils.get_pipeline(current_label_configs)

    # The feature pipeline is fitted on the training split only and then
    # re-applied to the test split.
    x_train_piped = data_pipeline.fit_transform(x_train)
    y_train_piped = label_enc.transform(y_train)

    x_test_piped = data_pipeline.transform(x_test)
    y_test_piped = label_enc.transform(y_test)

    xgb = xgboost.XGBClassifier(objective='multi:softprob', random_state=52, n_estimators=725, eval_metric='mlogloss', n_jobs=-1, use_label_encoder=False)
    rf = RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=123)

    # Soft voting averages the class probabilities, weighting XGBoost 3:1 over the forest.
    voting_classifier = VotingClassifier(estimators=[('xgboost', xgb), ('rf', rf)],
                                         voting='soft', n_jobs=-1, verbose=True, weights=[.75, .25])
    voting_classifier = voting_classifier.fit(x_train_piped, y_train_piped)

    print(airport)
    preds = voting_classifier.predict_proba(x_test_piped)
    cat_preds = voting_classifier.predict(x_test_piped)
    # Pass the integer labels together with the classifier's own class list
    # instead of one-hot encoding them with `to_categorical`: the latter infers
    # the number of classes from the test labels alone, so its width differs
    # from `preds` whenever the highest-indexed class is absent from the test
    # split. `classes_` is the column order of `predict_proba`.
    print(log_loss(y_test_piped, preds, labels=voting_classifier.classes_, eps=1e-16))
    print(accuracy_score(y_test_piped, cat_preds))

    data_pipeline_map[airport] = data_pipeline
    label_enc_map[airport] = label_enc
    train_data_map[airport] = (x_train, y_train)
    test_data_map[airport] = (x_test, y_test)
    model_map[airport] = voting_classifier


{'katl:D_9L_A_9R'}
katl
0.3160810833221516
0.9033171521035599
set()
kclt
0.35111346185780706
0.8860855496005663
set()
kden
1.1099512518157766
0.6701557443365695
set()
kdfw
0.48675390537808866
0.8492769744160178
set()
kjfk
0.2099466961531511
0.9332490518331227
set()
kmem
0.7135362830470742
0.7660768452982811
set()
kmia
0.4135162366307961
0.8736916620316529
set()
kord
0.885584990241789
0.7204186893203883
set()
kphx
0.4131870075396187
0.8736852750809061
set()
ksea
0.2271242917852956
0.9265888063097224


In [None]:
# Persist the trained models, feature pipelines and label encoders.
# Create the output directory first: without it, the first `open` raises
# FileNotFoundError and the (expensive) training results are not saved.
model_assets_dir = "model_assets"
os.makedirs(model_assets_dir, exist_ok=True)

# One joblib file per airport model.
for airport in airport_codes:
    with open(os.path.join(model_assets_dir, f"{airport}_model.joblib"), 'wb') as file:
        dump(model_map[airport], file)

# The pipeline and label-encoder dictionaries are each pickled as a whole.
with open(os.path.join(model_assets_dir, "saved_data_pipeline_map.pkl"), 'wb') as file:
    pickle.dump(data_pipeline_map, file, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(model_assets_dir, "saved_label_enc_map.pkl"), 'wb') as file:
    pickle.dump(label_enc_map, file, pickle.HIGHEST_PROTOCOL)