In [1]:
import os
import time

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

from app.Encoder import Encoder

In [2]:
# Build the aircraft-group Encoder from the distinct `aircraft_grp` values found in
# the training features.
# NOTE: `generate_dataset` reads `aircraft_grp_encoder` as a module-level global, so
# this cell has to run before any training or evaluation cell.
# NOTE(review): what `Encoder` maps each category to is defined in app.Encoder and is
# not visible in this notebook — confirm there.
data = pd.read_csv('./X_train.csv')
aircraft_grp_encoder = Encoder(data['aircraft_grp'].unique())

In [3]:
# Collect the usable target metrics for every ('engine_type', 'flight_phase') pair

def get_engine_targets():
    """Map every "<engine_type> <flight_phase>" key to its list of usable targets.

    Reads ``./X_train.csv`` and ``./y_train.csv`` from the working directory.
    The ``VSVNOM`` column is discarded up front. A target column is kept for a
    group when fewer than 30% of its values are NaN and its max-min spread in
    that group exceeds 0.0001. ``/`` in the key is replaced by ``_`` so the key
    can be used as a directory name.

    Returns:
        dict[str, list[str]]: key -> target column names, in column order.
        Groups without any usable column do not appear in the dict.
    """
    features = pd.read_csv('./X_train.csv')
    labels = pd.read_csv('./y_train.csv')
    labels = labels.drop(columns=['VSVNOM'])
    # Row-by-row comparison of the two files: a label row receives an engine type
    # only if the feature row at the same position carries the same engine_id.
    same_engine = features['engine_id'] == labels['engine_id']
    labels['engine_type'] = features[same_engine]['engine_type']

    def is_usable(series, n_rows):
        # Reject columns with many NaNs or with (nearly) identical values.
        return series.isna().sum() / n_rows < 0.3 and (series.max() - series.min()) > 0.0001

    non_target_columns = ['flight_datetime', 'engine_id', 'flight_phase', 'engine_type']
    targets = {}
    for group_key, group in labels.groupby(['engine_type', 'flight_phase']):
        candidates = group.drop(columns=non_target_columns)
        n_rows = candidates.shape[0]
        name = ' '.join([str(part) for part in group_key]).replace('/', '_')
        for column in candidates.columns:
            if is_usable(candidates[column], n_rows):
                targets.setdefault(name, []).append(column)

    return targets

In [4]:
def generate_dataset(key: str, metric:str, x: pd.DataFrame, y: pd.DataFrame,
                     empty:list = None, encoder: "Encoder" = None):
    """Build the feature matrix and target vector for one (key, metric) model.

    Args:
        key: "<engine_type> <flight_phase>" as produced by get_engine_targets.
        metric: name of the column in ``y`` to use as the regression target.
        x: feature frame; it is NOT modified (a filtered copy is used).
        y: label frame, compared row-by-row with ``x`` on ``engine_id`` (both
            frames must share the same index).
        empty: feature columns to drop. When None, the columns whose NaN share
            within the selected group exceeds 0.7 are detected and used.
        encoder: object with an ``encode_single`` method applied to
            ``aircraft_grp``. Defaults to the notebook-level
            ``aircraft_grp_encoder``.

    Returns:
        (features, target, empty): model inputs, target Series and the list of
        dropped columns (pass it back in to build a matching test set).
    """
    if encoder is None:
        encoder = aircraft_grp_encoder
    # rsplit: the flight phase is the last token, so an engine type that itself
    # contains a space still parses.
    engine_type, flight_phase = key.rsplit(' ', 1)

    # Target values for rows whose engine_id agrees between x and y.
    target = y.loc[y['engine_id'] == x['engine_id'], metric]

    # Work on an explicit copy so the caller's frame never gains a 'target'
    # column and later assignments do not hit a view of it.
    in_group = (x['engine_type'] == engine_type) & (x['flight_phase'] == flight_phase)
    subset = x.loc[in_group].copy()
    subset['target'] = target  # aligned on the index

    # Remove columns (features) with a lot of NaNs
    if empty is None:
        nan_share = (
            subset.drop(columns=['target', 'flight_phase']).isna().sum()
            / subset.shape[0]
        )
        empty = nan_share[nan_share > 0.7].index.tolist()
    subset = subset.drop(columns=empty)
    subset = subset[subset['target'].notna()].copy()
    subset['aircraft_grp'] = subset['aircraft_grp'].apply(encoder.encode_single)
    subset = subset.fillna(-100)
    # errors='ignore': an identifier column may already be gone if it was
    # listed in `empty`.
    features = subset.drop(
        columns=['target', 'engine_id', 'aircraft_id', 'flight_datetime',
                 'engine_family', 'engine_type', 'manufacturer',
                 'ac_manufacturer', 'aircraft_type', 'aircraft_family',
                 'flight_phase'],
        errors='ignore',
    )
    return features, subset['target'], empty

In [5]:
def generate_val_part(key: str, metric:str, x: pd.DataFrame,
                      aircraft_grp_encoder: "Encoder", empty:list = None):
    """Build the inference-time feature matrix for one engine/phase key.

    Args:
        key: "<engine_type> <flight_phase>" as produced by get_engine_targets.
        metric: unused; kept so the signature mirrors generate_dataset.
        x: feature frame without targets; it is not modified.
        aircraft_grp_encoder: object with an ``encode_single`` method applied
            to the ``aircraft_grp`` column.
        empty: feature columns to drop. When None, the columns whose NaN share
            within the selected group exceeds 0.7 are detected and used.

    Returns:
        (ids, features, empty): ``ids`` is an independent frame with
        engine_id / flight_datetime / flight_phase that the caller may extend
        with prediction columns; ``features`` are the model inputs; ``empty``
        is the list of dropped columns.
    """
    # rsplit: the flight phase is the last token, so an engine type that itself
    # contains a space still parses.
    engine_type, flight_phase = key.rsplit(' ', 1)
    in_group = (x['engine_type'] == engine_type) & (x['flight_phase'] == flight_phase)
    # Explicit copy: the assignments below must not target a view of `x`.
    part = x.loc[in_group].copy()
    if empty is None:
        nan_share = (
            part.drop(columns=['engine_type', 'flight_phase']).isna().sum()
            / part.shape[0]
        )
        empty = nan_share[nan_share > 0.7].index.tolist()
    part = part.drop(columns=empty)
    part['aircraft_grp'] = part['aircraft_grp'].apply(aircraft_grp_encoder.encode_single)
    part = part.fillna(-100)
    # errors='ignore': a descriptive column may already be gone if it was
    # listed in `empty`.
    features = part.drop(
        columns=['engine_id', 'aircraft_id', 'flight_datetime',
                 'engine_family', 'engine_type', 'manufacturer',
                 'ac_manufacturer', 'aircraft_type', 'aircraft_family',
                 'flight_phase'],
        errors='ignore',
    )
    # Copy so callers can add columns without a SettingWithCopyWarning.
    ids = part[['engine_id', 'flight_datetime', 'flight_phase']].copy()
    return ids, features, empty

In [6]:
def generate_full_dataset(key: str, metric:str, x: pd.DataFrame, y: pd.DataFrame, 
                          x_te: pd.DataFrame, y_te: pd.DataFrame):
    """Build matching train and test sets for one (key, metric) pair.

    The list of dropped ("empty") columns is determined on the training data
    and then reused for the test data, so both feature matrices are built with
    the same column drop list.

    Returns:
        (x_train, y_train, x_test, y_test, empty)
    """
    train_part = generate_dataset(key, metric, x, y)
    empty = train_part[2]
    test_part = generate_dataset(key, metric, x_te, y_te, empty)
    return train_part[0], train_part[1], test_part[0], test_part[1], empty

In [7]:
def train_model(key: str, metric:str, x: pd.DataFrame, y: pd.DataFrame,
                x_te: pd.DataFrame, y_te: pd.DataFrame):
    """Train one CatBoost regressor for a (key, metric) pair and report its RMSE.

    Args:
        key: "<engine_type> <flight_phase>" group identifier.
        metric: target column name.
        x, y: training features / labels.
        x_te, y_te: test features / labels.

    Returns:
        The fitted CatBoostRegressor.

    Prints the test RMSE, the RMSE as a percentage of the training target
    range, and the time spent on data preparation and training.
    """
    t_start = time.time()
    x_train, y_train, x_test, y_test, empty = generate_full_dataset(
        key,
        metric,
        x,
        y,
        x_te,
        y_te
    )
    train_pool = Pool(x_train, y_train)
    val_pool = Pool(x_test, y_test)
    test_pool = Pool(x_test)
    t_data = time.time() - t_start
    t_train = time.time()
    model = CatBoostRegressor(iterations=800, loss_function='RMSE',
                              logging_level='Silent', use_best_model=True,
                              early_stopping_rounds=150, max_depth=4)
    # NOTE(review): the test split doubles as eval_set, so early stopping and
    # use_best_model select the iteration on the same data the RMSE below is
    # computed on; the reported score is therefore optimistic.
    model.fit(train_pool, eval_set=val_pool)
    t_train = time.time() - t_train
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, model.predict(test_pool))))
    # The floor avoids dividing by zero for a (near-)constant target.
    percentage = rmse / max(y_train.max() - y_train.min(), 0.0001) * 100
    print(f'Key: {key}, metric: {metric} \n\t rmse: {rmse}, percentage: {percentage}\n\tData time: {t_data}, train_time: {t_train}, overall_time: {time.time() - t_start}')
    return model

In [8]:
def train_all_models(root_path: str = "models"):
    """Train and save one model per (engine/phase key, metric).

    A fresh ``exp<N>`` directory is created under ``root_path`` and each model
    is written to ``<root_path>/exp<N>/<key>/<metric>.cbm``. Reads
    ``X_train.csv``, ``y_train.csv``, ``X_test.csv`` and ``y_test.csv`` from the
    working directory.

    Args:
        root_path: directory holding all experiments; created (including
            parents) when missing.
    """
    os.makedirs(root_path, exist_ok=True)
    # Start from the historical numbering (entry count + 1) but skip forward if
    # that directory already exists, e.g. after an older experiment was deleted.
    index = len(os.listdir(root_path)) + 1
    while os.path.exists(os.path.join(root_path, f'exp{index}')):
        index += 1
    exp_dir = os.path.join(root_path, f'exp{index}')
    os.mkdir(exp_dir)

    def _read_features(path):
        frame = pd.read_csv(path)
        # Replace / to avoid system conflicts with paths
        frame['engine_type'] = frame['engine_type'].str.replace('/', '_', regex=False)
        return frame.astype({'n1_modifier': 'int32'})

    targets = get_engine_targets()
    x = _read_features('X_train.csv')
    y = pd.read_csv('y_train.csv')
    x_te = _read_features('X_test.csv')
    y_te = pd.read_csv('y_test.csv')
    for key, metrics in targets.items():
        key_dir = os.path.join(exp_dir, key)
        os.makedirs(key_dir, exist_ok=True)
        for metric in metrics:
            model = train_model(key, metric, x, y, x_te, y_te)
            model.save_model(os.path.join(key_dir, f'{metric}.cbm'))


In [9]:
def evaluate_all_models(exp_index: int, root_path: str = "models"):
    """Score every saved model of one experiment on the test split.

    Args:
        exp_index: experiment number; models are loaded from
            ``<root_path>/exp<exp_index>/<key>/<metric>.cbm``.
        root_path: directory holding the experiments.

    Returns:
        dict: ``{key: {metric: {'rmse': float, 'percent': float}}}`` where
        ``percent`` is the RMSE relative to the training target range.
    """
    def _read_features(path):
        frame = pd.read_csv(path)
        # Same '/' -> '_' normalisation as the keys from get_engine_targets.
        frame['engine_type'] = frame['engine_type'].str.replace('/', '_', regex=False)
        return frame.astype({'n1_modifier': 'int32'})

    targets = get_engine_targets()
    x = _read_features('X_train.csv')
    y = pd.read_csv('y_train.csv')
    x_te = _read_features('X_test.csv')
    y_te = pd.read_csv('y_test.csv')
    scores = {}
    for key, metrics in targets.items():
        for metric in metrics:
            model = CatBoostRegressor().load_model(
                os.path.join(root_path, f'exp{exp_index}', key, f'{metric}.cbm')
            )

            _, y_train, x_test, y_test, _ = generate_full_dataset(
                key,
                metric,
                x,
                y,
                x_te,
                y_te
            )
            test_pool = Pool(x_test)
            # `squared=False` was deprecated in scikit-learn 1.4 and removed in
            # 1.6; taking the square root of the MSE works on every version.
            rmse = float(np.sqrt(mean_squared_error(y_test, model.predict(test_pool))))
            # Range floor of 0.0001 matches train_model, get_overall_score and
            # the target filter in get_engine_targets (was 0.01 here only).
            value_range = max(y_train.max() - y_train.min(), 0.0001)
            scores.setdefault(key, {})[metric] = {
                'rmse': rmse,
                'percent': rmse / value_range * 100
            }
    return scores


In [10]:
def get_overall_score(exp_index: int, root_path: str = "models"):
    """Average the test RMSE of each metric over all engine/phase keys.

    Args:
        exp_index: experiment number passed to evaluate_all_models.
        root_path: directory holding the experiments (forwarded to
            evaluate_all_models).

    Returns:
        pd.DataFrame indexed by metric name (sorted) with two string columns:
        ``rmse`` (mean RMSE, 3 decimals) and ``percent error`` (mean RMSE
        relative to the metric's full range in ``y_train.csv``, 2 decimals,
        with a trailing ``%``).
    """
    scores = evaluate_all_models(exp_index, root_path)

    # Gather the per-key RMSE values of every metric.
    rmse_by_metric = {}
    for metrics in scores.values():
        for metric, val in metrics.items():
            rmse_by_metric.setdefault(metric, []).append(val['rmse'])

    y = pd.read_csv('y_train.csv')
    result = {}
    for metric, values in rmse_by_metric.items():
        mean_rmse = sum(values) / len(values)
        # The floor avoids dividing by zero for a (near-)constant metric.
        value_range = max(y[metric].max() - y[metric].min(), 0.0001)
        result[metric] = [mean_rmse, mean_rmse / value_range * 100]

    result_scores = pd.DataFrame.from_dict(result, orient='index',
                                           columns=['rmse', 'percent error'])
    result_scores['rmse'] = result_scores['rmse'].apply(lambda x: '%.3f' % x)
    result_scores['percent error'] = \
        result_scores['percent error'].apply(lambda x: f'{round(x, 2)}%')
    result_scores = result_scores.sort_index()
    return result_scores

In [17]:
def predict(filepath: str, models_dir: str) -> pd.DataFrame:
    """Predict every available metric for the flights in ``filepath``.

    Args:
        filepath: CSV with the same feature columns as ``X_train.csv``.
        models_dir: experiment directory containing ``<key>/<metric>.cbm``.

    Returns:
        pd.DataFrame with engine_id, flight_datetime, flight_phase, one column
        per predicted metric and a constant ``VSVNOM`` column. Metrics that
        have no model for a row's engine/phase group are filled with the mean
        of that column over the rows where it was predicted.

    Raises:
        ValueError: if no engine/phase group has any target to predict.
    """
    x_val = pd.read_csv(filepath)
    x_val['engine_type'] = x_val['engine_type'].map(lambda x: x.replace('/', '_'))
    x = pd.read_csv('X_train.csv')
    x['engine_type'] = x['engine_type'].map(lambda x: x.replace('/', '_'))
    x = x.astype({'n1_modifier': 'int32'})
    aircraft_grp_encoder = Encoder(x['aircraft_grp'].unique())
    targets = get_engine_targets()
    group_frames = []
    for key, metrics in targets.items():
        if not metrics:
            continue
        # The dropped-column list depends only on the key (the metric argument
        # is not used by generate_val_part), so derive it from the training
        # features once per key instead of once per metric.
        _, _, empty = generate_val_part(key, metrics[0], x, aircraft_grp_encoder)
        df = None
        for metric in metrics:
            ids, x_val_part, _ = generate_val_part(
                key, metric, x_val, aircraft_grp_encoder, empty
            )
            model = CatBoostRegressor().load_model(
                os.path.join(models_dir, key, f'{metric}.cbm')
            )
            prediction = model.predict(x_val_part)
            # assign() returns a new frame, so no slice of x_val is written to.
            ids = ids.assign(**{metric: prediction})
            if df is None:
                df = ids
            else:
                df = pd.merge(
                    df,
                    ids,
                    on=['engine_id', 'flight_datetime', 'flight_phase'],
                    how='outer'
                )
        group_frames.append(df)

    if not group_frames:
        raise ValueError('No targets available: nothing to predict')
    result = pd.concat(group_frames)

    # VSVNOM always contains either NaN or 0.0, it is not predicted
    result["VSVNOM"] = 0.
    # numeric_only=True: the id columns are strings; averaging them is what
    # triggered the warning on older pandas and raises on pandas 2.x.
    result = result.fillna(result.mean(numeric_only=True))
    return result

            

### Train models

In [12]:
# Trains one model per (engine/phase key, metric) and stores them in a new
# models/exp<N>/ directory; progress is printed per model.
train_all_models()

Key: CF34-8E5 CRUISE, metric: BRAT 
	 rmse: 0.004878300745304771, percentage: 0.4878300745304771
	Data time: 0.19699525833129883, train_time: 4.618396282196045, overall_time: 4.822413206100464
Key: CF34-8E5 CRUISE, metric: DEGT 
	 rmse: 2.22838319449058, percentage: 1.9661054040343418
	Data time: 0.1549839973449707, train_time: 3.286564588546753, overall_time: 3.4445247650146484
Key: CF34-8E5 CRUISE, metric: EGTC 
	 rmse: 1.6164443630145424, percentage: 0.4750323744352746
	Data time: 0.13700318336486816, train_time: 2.377014398574829, overall_time: 2.51900315284729
Key: CF34-8E5 CRUISE, metric: GPCN25 
	 rmse: 0.0844384539467183, percentage: 3.1264342167431054
	Data time: 0.13900136947631836, train_time: 2.4801418781280518, overall_time: 2.6231424808502197
Key: CF34-8E5 CRUISE, metric: GWFM 
	 rmse: 0.807489249894825, percentage: 4.381719097659683
	Data time: 0.13901877403259277, train_time: 2.9363949298858643, overall_time: 3.079414129257202
Key: CF34-8E5 CRUISE, metric: PCN12 
	 rmse

### Evaluate models on test dataset

In [14]:
# Scores the models saved under models/exp2 — the experiment index is hard-coded,
# update it to the run produced by train_all_models().
get_overall_score(2)

Unnamed: 0,rmse,percent error
BRAT,0.021,1.12%
DEGT,1.857,0.7%
DELFN,0.328,0.78%
DELN1,0.15,0.81%
DELVSV,0.02,0.11%
DPOIL,0.063,0.19%
EGTC,1.016,0.26%
EGTHDM,3.198,0.93%
EGTHDM_D,3.633,5.32%
GEGTMC,1.842,0.64%


### Create submission

In [18]:
# Predict the validation set with the models of experiment 2 (hard-coded path;
# keep it in sync with the experiment evaluated above).
y_valid = predict("X_valid.csv", "models/exp2/")
y_valid

  result = result.fillna(result.mean())


Unnamed: 0,engine_id,flight_datetime,flight_phase,BRAT,DEGT,EGTC,GPCN25,GWFM,PCN12,PCN12I,...,DPOIL,GEGTMC,GN2MC,WBE,ZTNAC_D,DELFN,DELN1,PCN1AR,PCN1BR,VSVNOM
0,bf23db5ac85aa359c82cb8a18b0d0ce406fc40272a9b00...,2022-06-07 02:48:28,CRUISE,1.000000,11.733448,766.179141,0.487207,4.423521,91.437901,91.440539,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,17.323733,6.888370,90.153133,90.085177,0.0
1,0a88b8ae00eb0b817075fb953711648d5d3dd8687e5794...,2022-04-11 14:05:20,CRUISE,0.999999,14.583947,676.838089,0.071364,2.561371,87.289403,87.301037,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,17.323733,6.888370,90.153133,90.085177,0.0
2,2d1eecee1d07af92b614c41744cf604226df549b1ca451...,2022-04-21 09:34:31,CRUISE,1.000001,18.166474,712.984819,0.311030,1.000648,88.407462,88.411817,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,17.323733,6.888370,90.153133,90.085177,0.0
3,b89b5e1fea12a0ab78c29ca87649bf2687f1558dd0e67c...,2022-06-28 17:14:01,CRUISE,1.000005,18.482742,761.128886,-0.368831,3.319905,90.988659,91.002532,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,17.323733,6.888370,90.153133,90.085177,0.0
4,ed163ff56745c53c98f72a1f2d2b989f7472e9231378c7...,2022-07-02 00:10:12,CRUISE,1.000000,13.654651,855.203761,-0.566104,0.511675,98.552016,98.518963,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,17.323733,6.888370,90.153133,90.085177,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,364a94e57c5e7705c650bdadda955186d3a8f9d96a3d9e...,2022-04-18 07:42:30,TAKEOFF,1.000019,2.747740,660.400365,0.013040,-1.615732,87.899997,88.389134,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,26.037438,12.055206,89.546539,89.965146,0.0
601,8795a3b0ee5404d35e8c3d2ad94a55b22f7b2e568efba0...,2022-06-27 06:55:12,TAKEOFF,1.000009,2.747740,660.400365,0.013040,-1.615732,91.821588,92.436771,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,23.302760,10.130065,89.610982,90.226392,0.0
602,75f7a52561c81524430f1f2b83d40b61d4d1bdf83ab837...,2022-04-27 01:09:59,TAKEOFF,0.999979,2.747740,660.400365,0.013040,-1.615732,92.371090,92.826112,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,21.469482,9.775971,90.887515,91.363158,0.0
603,e2f2eecb0976a1b21fbc21d709416039de60230cc5ba33...,2022-04-24 17:28:49,TAKEOFF,1.000007,2.747740,660.400365,0.013040,-1.615732,87.658277,88.087893,...,-19.15243,94.461648,2.64092,1.567022,-0.072951,31.405499,14.563255,87.375261,87.894542,0.0


In [20]:
# Write the submission file; the DataFrame index is not written.
y_valid.to_csv("y_valid_RealityX.csv", index=False)

### Create archive with weights (models)

In [19]:
# NOTE(review): late import — consider moving it to the import cell at the top.
import shutil

# Zip the saved models of experiment 2 (hard-coded path, relative to the current
# working directory) into weights_RealityX.zip.
shutil.make_archive("weights_RealityX", "zip", base_dir="models/exp2/")

'weights_RealityX.zip'