In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from Encoder import Encoder
import os
import time
from catboost import CatBoostRegressor, Pool
import pickle
from sklearn.metrics import mean_squared_error
import pprint

In [2]:
class CombinedMetric(tf.keras.callbacks.Callback):
    """Early-stopping callback driven by the validation RMSE.

    After every epoch the callback reads ``val_root_mean_squared_error`` from
    the Keras ``logs`` dict. Whenever the value is less than or equal to the
    best one seen so far, the model weights are snapshotted. Once ``patience``
    consecutive epochs pass without such an improvement, training is stopped
    and the snapshotted weights are restored.

    Args:
        patience: number of consecutive non-improving epochs tolerated before
            training is stopped.
    """

    def __init__(self, patience):
        super().__init__()
        self.patience = patience
        self.wait = 0  # epochs elapsed since the last improvement
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
        self.best = np.inf
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Update the best score / weights and stop training when patience runs out.

        Raises:
            KeyError: if ``logs`` has no ``val_root_mean_squared_error`` entry
                (i.e. no validation data or RMSE metric was configured).
        """
        # Avoid a shared mutable default; keep the caller's dict object when one
        # is passed so that the 'best' entry written below is visible to Keras.
        if logs is None:
            logs = {}
        val_rmse = logs['val_root_mean_squared_error']
        if val_rmse <= self.best:
            self.best = val_rmse
            self.best_weights = self.model.get_weights()
            self.wait = 0
        else:
            self.wait += 1
            # Never stop on the very first epoch (epoch index 0).
            if self.wait >= self.patience and epoch > 0:
                self.model.stop_training = True
                if self.best_weights is not None:
                    self.model.set_weights(self.best_weights)
        logs['best'] = self.best

    def on_train_end(self, logs=None):
        """Restore the best weights when training ended without early stopping."""
        # When patience was exhausted the weights were already restored in
        # on_epoch_end, so only handle the "ran out of epochs" case here.
        if self.wait < self.patience and self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [4]:
# Build one Encoder per categorical column from the distinct values found in X_train.csv.
data = pd.read_csv('./X_train.csv')
# Not referenced again in the cells visible here (predict() builds its own local copy).
flight_phase_encoder = Encoder(data['flight_phase'].unique())
# Read as a notebook-level global by generate_dataset(), so this cell must run first.
aircraft_grp_encoder = Encoder(data['aircraft_grp'].unique())

In [5]:
def get_engine_targets(x_path='./X_train.csv', y_path='./y_train.csv', max_nan_ratio=0.3, min_range=0.0001):
    """List the target columns worth modelling for every engine type / flight phase.

    The target table is grouped by ``(engine_type, flight_phase)``. Within a
    group a target column is kept when its share of missing values is below
    ``max_nan_ratio`` and its value range (max - min) exceeds ``min_range``.

    Args:
        x_path: CSV with the features; must contain ``engine_id`` and ``engine_type``.
        y_path: CSV with the targets; must contain ``engine_id``,
            ``flight_datetime``, ``flight_phase`` and ``VSVNOM`` (always dropped).
        max_nan_ratio: exclusive upper bound on the NaN share of a kept column.
        min_range: exclusive lower bound on max - min of a kept column.

    Returns:
        dict mapping ``"<engine_type> <flight_phase>"`` (with ``/`` replaced by
        ``_``) to the list of kept target column names. Groups without any kept
        column are absent from the dict.

    Raises:
        ValueError: from pandas when the two files are not identically indexed
            (the engine ids are compared row by row).
    """
    features = pd.read_csv(x_path)
    y = pd.read_csv(y_path).drop(columns=['VSVNOM'])
    # Row-wise comparison: both files are expected to list the same flights in
    # the same order. Rows whose ids disagree get a NaN engine_type and are
    # therefore left out of the groupby below.
    same_engine = features['engine_id'] == y['engine_id']
    y['engine_type'] = features.loc[same_engine, 'engine_type']

    targets = {}
    for group_key, group in y.groupby(['engine_type', 'flight_phase']):
        candidates = group.drop(columns=['flight_datetime', 'engine_id', 'flight_phase', 'engine_type'])
        size = candidates.shape[0]
        # '/' is replaced because the key is later used as a directory name.
        key = ' '.join(str(part) for part in group_key).replace('/', '_')
        for column in candidates.columns:
            values = candidates[column]
            if values.isna().sum() / size < max_nan_ratio and (values.max() - values.min()) > min_range:
                targets.setdefault(key, []).append(column)

    return targets

In [6]:
def generate_dataset(key: str, metric: str, x: pd.DataFrame, y: pd.DataFrame, empty: list = None, encoder=None):
    """Build the feature matrix and target vector for one engine/phase group and one metric.

    Unlike the earlier version, the caller's ``x`` is left untouched: the
    target column is attached to a copy instead of being written into ``x``.

    Args:
        key: ``"<engine_type> <flight_phase>"`` as produced by ``get_engine_targets``.
        metric: name of the target column in ``y``.
        x: feature table; compared row by row with ``y`` on ``engine_id``, so
            both frames must be identically indexed.
        y: target table.
        empty: feature columns to drop. When ``None`` the list is computed from
            this group: every column (other than ``target`` / ``flight_phase``)
            whose NaN share exceeds 0.7. Pass the list returned for the
            training split when preparing the test split so both share columns.
        encoder: object with an ``encode_single`` method applied to every
            ``aircraft_grp`` value. Defaults to the notebook-level
            ``aircraft_grp_encoder``.

    Returns:
        ``(features, target, empty)``: the numeric feature frame (identifier
        and descriptive columns removed, NaN replaced by -100), the target
        series for the same rows, and the list of dropped columns.
    """
    if encoder is None:
        # Fall back to the encoder defined in an earlier notebook cell.
        encoder = aircraft_grp_encoder
    engine_type, flight_phase = key.split(' ')

    # Keep the target only where the engine ids of the two tables agree.
    target = y.loc[y['engine_id'] == x['engine_id'], metric]
    frame = x.assign(target=target)  # assign() works on a copy of x

    in_group = (frame['engine_type'] == engine_type) & (frame['flight_phase'] == flight_phase)
    frame = frame[in_group]

    if empty is None:
        size = frame.shape[0]
        candidates = frame.drop(columns=['target', 'flight_phase'])
        empty = [
            column for column in candidates.columns
            if (candidates[column].isna().sum() / size) > 0.7
        ]

    frame = frame.drop(columns=empty)
    frame = frame[frame['target'].notna()]
    frame = frame.assign(aircraft_grp=frame['aircraft_grp'].apply(encoder.encode_single))
    # -100 is the sentinel used for the remaining missing feature values.
    frame = frame.fillna(-100)

    features = frame.drop(columns=[
        'target', 'engine_id', 'aircraft_id', 'flight_datetime', 'engine_family',
        'engine_type', 'manufacturer', 'ac_manufacturer', 'aircraft_type',
        'aircraft_family', 'flight_phase',
    ])
    return features, frame['target'], empty

In [7]:
def generate_full_dataset(key: str, metric:str, x: pd.DataFrame, y: pd.DataFrame, x_te, y_te):
    """Prepare matching train and test splits for one group / metric.

    The list of sparse columns is derived from the training split and then
    reused for the test split, so both feature frames drop the same columns.

    Returns:
        ``(x_train, y_train, x_test, y_test, empty)`` where ``empty`` is the
        list of columns dropped from both splits.
    """
    train_features, train_target, dropped_columns = generate_dataset(key, metric, x, y)
    test_split = generate_dataset(key, metric, x_te, y_te, dropped_columns)
    test_features, test_target = test_split[0], test_split[1]
    return train_features, train_target, test_features, test_target, dropped_columns

In [8]:
def create_model(shape: int):
    """Build and compile the small dense Keras regressor.

    Architecture: input -> BatchNorm -> Dense(32, relu) -> Dropout(0.1)
    -> BatchNorm -> Dense(8, relu) -> BatchNorm -> Dense(1).
    Compiled with Adam on an MSE loss, reporting RMSE as a metric.

    Args:
        shape: passed straight to ``InputLayer`` (the callers in this notebook
            use the number of feature columns).

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = tf.keras.layers
    stack = [
        layers.InputLayer(shape),
        layers.BatchNormalization(),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.BatchNormalization(),
        layers.Dense(8, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1),
    ]
    network = tf.keras.models.Sequential(stack)

    # Learning rate decays polynomially from 0.1 towards 0.001 over 3000 steps.
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(0.1, 3000, 0.001)
    optimizer = tf.keras.optimizers.Adam(lr_schedule)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError()
    network.compile(optimizer=optimizer, loss='mse', metrics=[rmse_metric])
    return network

In [9]:
def train_model(key: str, metric:str, x: pd.DataFrame, y: pd.DataFrame, x_te, y_te):
    """Train one CatBoost regressor for a single group / metric and report its RMSE.

    Args:
        key: ``"<engine_type> <flight_phase>"`` group key.
        metric: target column to predict.
        x, y: training features / targets (full tables; filtered internally).
        x_te, y_te: test features / targets (full tables; filtered internally).

    Returns:
        The fitted ``CatBoostRegressor``.

    Note:
        The test split serves both as the early-stopping / best-model
        ``eval_set`` and as the data for the printed RMSE, so the printed score
        is optimistic compared with a truly held-out set.
    """
    t_start = time.time()
    x_train, y_train, x_test, y_test, _ = generate_full_dataset(
        key,
        metric,
        x,
        y,
        x_te,
        y_te
    )
    train_pool = Pool(x_train, y_train)
    val_pool = Pool(x_test, y_test)
    test_pool = Pool(x_test)
    t_data = time.time() - t_start

    t_train = time.time()
    model = CatBoostRegressor(iterations=3200, loss_function='RMSE', logging_level='Silent', use_best_model=True, early_stopping_rounds=150, max_depth=4)
    model.fit(train_pool, eval_set=val_pool)
    t_train = time.time() - t_train

    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and gone from recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(test_pool)))
    # "percentage" relates the RMSE to the training target range (floored at 0.0001).
    print(f'Key: {key}, metric: {metric} \n\t rmse: {rmse}, percentage: {rmse / max(y_train.max() - y_train.min(), 0.0001) * 100}\n\tData time: {t_data}, train_time: {t_train}, overall_time: {time.time() - t_start}')
    return model

In [10]:
def train_all_models():
    """Train and save one CatBoost model per group / metric in a new experiment folder.

    Models are written to ``models/exp<N>/<group key>/<metric>.cbm``. ``N``
    starts at (number of entries in ``models``) + 1 and is bumped until the
    folder name is free, so an existing experiment is never reused. The
    ``models`` directory is created when missing.

    Reads ``X_train.csv``, ``y_train.csv``, ``X_test.csv`` and ``y_test.csv``
    from the working directory.
    """
    def _load_features(path):
        # Same normalisation for every feature file: '/' -> '_' in engine_type
        # (keys double as directory names) and an integer n1_modifier.
        frame = pd.read_csv(path)
        frame['engine_type'] = frame['engine_type'].map(lambda name: name.replace('/', '_'))
        return frame.astype({'n1_modifier': 'int32'})

    os.makedirs('models', exist_ok=True)
    index = len(os.listdir('models')) + 1
    # Stray files or gaps in the numbering can make the counted index collide
    # with an existing experiment; move on to the next free one.
    while os.path.exists(os.path.join('models', f'exp{index}')):
        index += 1
    experiment_dir = os.path.join('models', f'exp{index}')
    os.mkdir(experiment_dir)

    targets = get_engine_targets()
    x = _load_features('X_train.csv')
    y = pd.read_csv('y_train.csv')
    x_te = _load_features('X_test.csv')
    y_te = pd.read_csv('y_test.csv')

    for key, metrics in targets.items():
        os.mkdir(os.path.join(experiment_dir, key))
        for metric in metrics:
            model = train_model(key, metric, x, y, x_te, y_te)
            model.save_model(os.path.join(experiment_dir, key, f'{metric}.cbm'))


In [11]:
def evaluate_all_models(exp_index: int):
    """Score every saved model of one experiment on the test split.

    Args:
        exp_index: experiment number; models are loaded from
            ``models/exp<exp_index>/<group key>/<metric>.cbm``.

    Returns:
        Nested dict ``{group key: {metric: {'rmse': ..., 'percent': ...}}}``
        where ``percent`` is the RMSE relative to the training target range.
    """
    def _load_features(path):
        # Same normalisation as used for training: '/' -> '_' in engine_type
        # and an integer n1_modifier.
        frame = pd.read_csv(path)
        frame['engine_type'] = frame['engine_type'].map(lambda name: name.replace('/', '_'))
        return frame.astype({'n1_modifier': 'int32'})

    targets = get_engine_targets()
    x = _load_features('X_train.csv')
    y = pd.read_csv('y_train.csv')
    x_te = _load_features('X_test.csv')
    y_te = pd.read_csv('y_test.csv')

    scores = {}
    for key, metrics in targets.items():
        for metric in metrics:
            model = CatBoostRegressor().load_model(os.path.join('models', f'exp{exp_index}', key, f'{metric}.cbm'))

            # The dataset depends on the metric (rows without that target are
            # dropped), so it has to be rebuilt for every model.
            x_train, y_train, x_test, y_test, _ = generate_full_dataset(
                key,
                metric,
                x,
                y,
                x_te,
                y_te
            )
            test_pool = Pool(x_test)
            # Explicit square root: the `squared=False` shortcut of
            # mean_squared_error is deprecated and gone from recent scikit-learn.
            rmse = np.sqrt(mean_squared_error(y_test, model.predict(test_pool)))
            # NOTE(review): the range floor here is 0.01 while train_model uses
            # 0.0001 for the same ratio - confirm which one is intended.
            scores.setdefault(key, {})[metric] = {
                'rmse': rmse,
                'percent': rmse / (max(y_train.max() - y_train.min(), 0.01)) * 100,
            }
    return scores


In [97]:
# Train and save a model for every group / metric into a fresh models/exp<N> folder;
# one summary line per trained model is printed below.
train_all_models()

Key: CF34-8E5 CRUISE, metric: BRAT 
	 rmse: 0.0043982504347780185, percentage: 0.4398250434778018
	Data time: 0.06699800491333008, train_time: 4.723588228225708, overall_time: 4.794586181640625
Key: CF34-8E5 CRUISE, metric: DEGT 
	 rmse: 1.8937983512889813, percentage: 1.670900759719552
	Data time: 0.05399775505065918, train_time: 4.340362548828125, overall_time: 4.397361516952515
Key: CF34-8E5 CRUISE, metric: EGTC 
	 rmse: 1.2967567167720633, percentage: 0.3810842094709215
	Data time: 0.0540010929107666, train_time: 4.37437891960144, overall_time: 4.432382106781006
Key: CF34-8E5 CRUISE, metric: GPCN25 
	 rmse: 0.07727147858254628, percentage: 2.861068427084742
	Data time: 0.05299949645996094, train_time: 4.434455633163452, overall_time: 4.490455865859985
Key: CF34-8E5 CRUISE, metric: GWFM 
	 rmse: 0.707303827798308, percentage: 3.8380779564747303
	Data time: 0.05500006675720215, train_time: 4.40537428855896, overall_time: 4.464374303817749
Key: CF34-8E5 CRUISE, metric: PCN12 
	 rmse: 

In [98]:
# Score the models stored under models/exp17 (17 is the experiment index).
scores = evaluate_all_models(17)

In [99]:
def get_overall_score(scores):
    """Average the RMSE of each metric over all engine/phase groups.

    Args:
        scores: nested dict ``{group key: {metric: {'rmse': ...}}}`` as
            returned by ``evaluate_all_models``.

    Returns:
        dict mapping each metric to ``[mean rmse, mean rmse as a percentage of
        the metric's overall value range in y_train.csv]``. The range is
        floored at 0.0001 to avoid dividing by zero.
    """
    rmse_totals = {}
    group_counts = {}
    for per_metric in scores.values():
        for name, entry in per_metric.items():
            rmse_totals[name] = rmse_totals.get(name, 0) + entry['rmse']
            group_counts[name] = group_counts.get(name, 0) + 1

    y = pd.read_csv('y_train.csv')
    result = {}
    for name, total in rmse_totals.items():
        mean_rmse = total / group_counts[name]
        value_range = max(y[name].max() - y[name].min(), 0.0001)
        result[name] = [mean_rmse, mean_rmse / value_range * 100]
    return result

In [100]:
# Tabulate the per-metric averages: one row per metric, columns 'rmse' and 'percent error'.
result_scores = pd.DataFrame.from_dict(get_overall_score(scores), orient='index', columns=['rmse', 'percent error'])
# Format for display only: after these two lines both columns hold strings, not numbers.
result_scores['rmse'] = result_scores['rmse'].apply(lambda x: '%.3f' % x)
result_scores['percent error'] = result_scores['percent error'].apply(lambda x: f'{round(x, 2)}%')
# Last expression of the cell: show the table ordered by metric name.
result_scores.sort_index()

Unnamed: 0,rmse,percent error
BRAT,0.021,1.12%
DEGT,1.857,0.7%
DELFN,0.328,0.78%
DELN1,0.15,0.81%
DELVSV,0.02,0.11%
DPOIL,0.063,0.19%
EGTC,1.016,0.26%
EGTHDM,3.198,0.93%
EGTHDM_D,3.633,5.32%
GEGTMC,1.842,0.64%


In [None]:
# Experiment indices of interest: 11, 16, 17 (presumably models/exp11, exp16, exp17 - TODO confirm)

In [None]:
def get_engine_targets(x_path='./X_train.csv', y_path='./y_train.csv', max_nan_ratio=0.3, min_range=0.0001):
    """List the target columns worth modelling for every engine type / flight phase.

    The target table is grouped by ``(engine_type, flight_phase)``. Within a
    group a target column is kept when its share of missing values is below
    ``max_nan_ratio`` and its value range (max - min) exceeds ``min_range``.

    Args:
        x_path: CSV with the features; must contain ``engine_id`` and ``engine_type``.
        y_path: CSV with the targets; must contain ``engine_id``,
            ``flight_datetime``, ``flight_phase`` and ``VSVNOM`` (always dropped).
        max_nan_ratio: exclusive upper bound on the NaN share of a kept column.
        min_range: exclusive lower bound on max - min of a kept column.

    Returns:
        dict mapping ``"<engine_type> <flight_phase>"`` (with ``/`` replaced by
        ``_``) to the list of kept target column names. Groups without any kept
        column are absent from the dict.

    Raises:
        ValueError: from pandas when the two files are not identically indexed
            (the engine ids are compared row by row).
    """
    features = pd.read_csv(x_path)
    y = pd.read_csv(y_path).drop(columns=['VSVNOM'])
    # Row-wise comparison: both files are expected to list the same flights in
    # the same order. Rows whose ids disagree get a NaN engine_type and are
    # therefore left out of the groupby below.
    same_engine = features['engine_id'] == y['engine_id']
    y['engine_type'] = features.loc[same_engine, 'engine_type']

    targets = {}
    for group_key, group in y.groupby(['engine_type', 'flight_phase']):
        candidates = group.drop(columns=['flight_datetime', 'engine_id', 'flight_phase', 'engine_type'])
        size = candidates.shape[0]
        # '/' is replaced because the key is later used as a directory name.
        key = ' '.join(str(part) for part in group_key).replace('/', '_')
        for column in candidates.columns:
            values = candidates[column]
            if values.isna().sum() / size < max_nan_ratio and (values.max() - values.min()) > min_range:
                targets.setdefault(key, []).append(column)

    return targets

In [107]:
# NOTE(review): this import sits in a late cell rather than with the other imports at the top of the notebook.
import json
# Recompute the per-group target lists and save them to app/targets.json.
targets = get_engine_targets()
with open('app/targets.json', 'w') as f:
    json.dump(targets, f)

In [16]:
def generate_val_part(key: str, metric:str, x: pd.DataFrame, aircraft_grp_encoder, empty:list = None):
    """Prepare the rows of one engine/phase group for prediction (no targets involved).

    Args:
        key: ``"<engine_type> <flight_phase>"`` group key.
        metric: accepted for signature symmetry; not read by this function.
        x: feature table.
        aircraft_grp_encoder: object whose ``encode_single`` is applied to
            every ``aircraft_grp`` value.
        empty: feature columns to drop. When ``None`` it is computed from this
            group as every column (other than ``engine_type`` /
            ``flight_phase``) whose NaN share exceeds 0.7.

    Returns:
        ``(ids, features, empty)``: the identifying columns
        (``engine_id``, ``flight_datetime``, ``flight_phase``), the model-ready
        feature frame with NaN replaced by -100, and the dropped column list.
    """
    engine_type, flight_phase = key.split(' ')
    belongs_to_group = (x['engine_type'] == engine_type) & (x['flight_phase'] == flight_phase)
    part = x[belongs_to_group]

    if empty is None:
        size = part.shape[0]
        scanned = part.drop(columns=['engine_type', 'flight_phase'])
        empty = [
            name for name in scanned.columns
            if (scanned[name].isna().sum() / size) > 0.7
        ]

    part = part.drop(columns=empty)
    part = part.assign(aircraft_grp=part['aircraft_grp'].apply(aircraft_grp_encoder.encode_single))
    part = part.fillna(-100)

    id_columns = ['engine_id', 'flight_datetime', 'flight_phase']
    non_feature_columns = [
        'engine_id', 'aircraft_id', 'flight_datetime', 'engine_family', 'engine_type',
        'manufacturer', 'ac_manufacturer', 'aircraft_type', 'aircraft_family', 'flight_phase',
    ]
    ids = part[id_columns].copy()
    features = part.drop(columns=non_feature_columns)
    return ids, features, empty

In [103]:
def dataframe_to_dict(dataframe: pd.DataFrame):
    """Convert a prediction table into a list of per-row records.

    Each record has the keys ``engine_id``, ``flight_phase``,
    ``flight_datetime`` and ``metrics``; ``metrics`` maps every remaining
    column to its value, with missing values (NaN / None / NaT) mapped to
    ``None``. The DataFrame index is ignored.

    ``pd.isna`` is used for the missing-value check so that non-float metric
    values (strings, objects) no longer raise ``TypeError`` the way
    ``np.isnan`` does.

    Args:
        dataframe: table holding the three identifying columns plus any number
            of metric columns.

    Returns:
        list of dicts, one per row, in row order.

    Raises:
        KeyError: if one of the identifying columns is missing (and the table
            has at least one row).
    """
    id_columns = ['engine_id', 'flight_phase', 'flight_datetime']
    metric_columns = [column for column in dataframe.columns if column not in id_columns]

    result = []
    # to_dict('records') walks the rows once without building a Series per row.
    for record in dataframe.to_dict(orient='records'):
        entry = {name: record[name] for name in id_columns}
        entry['metrics'] = {
            name: (None if pd.isna(record[name]) else record[name])
            for name in metric_columns
        }
        result.append(entry)
    return result

In [61]:
def predict(filepath: str, model_dir: str):
    """Predict every known target metric for the flights in a feature file.

    For each engine/phase group returned by ``get_engine_targets`` the
    matching rows of ``filepath`` are prepared once and passed to every model
    found at ``<model_dir>/<group key>/<metric>.cbm``.

    Args:
        filepath: CSV with the same feature columns as ``X_train.csv``.
        model_dir: experiment directory holding the saved models.

    Returns:
        DataFrame with ``engine_id``, ``flight_datetime``, ``flight_phase`` and
        one column per predicted metric (NaN where a metric does not apply to
        the row's group), or ``None`` when there are no target groups.
    """
    x_val = pd.read_csv(filepath)
    # Group keys carry '_' instead of '/', so the file being predicted needs the
    # same replacement as the training data; without it, engine types that
    # contain '/' never match any key and get no predictions.
    x_val['engine_type'] = x_val['engine_type'].map(lambda name: name.replace('/', '_'))

    x = pd.read_csv('X_train.csv')
    x['engine_type'] = x['engine_type'].map(lambda name: name.replace('/', '_'))
    x = x.astype({'n1_modifier': 'int32'})
    aircraft_grp_encoder = Encoder(x['aircraft_grp'].unique())

    targets = get_engine_targets()
    group_frames = []
    for key, metrics in targets.items():
        # generate_val_part does not read its metric argument, so the prepared
        # rows are identical for every metric of the group: build them once.
        _, _, empty = generate_val_part(key, metrics[0], x, aircraft_grp_encoder)
        ids, x_val_part, _ = generate_val_part(key, metrics[0], x_val, aircraft_grp_encoder, empty)

        df = None
        for metric in metrics:
            model = CatBoostRegressor().load_model(os.path.join(model_dir, key, f'{metric}.cbm'))
            metric_frame = ids.copy()
            metric_frame[metric] = model.predict(x_val_part)
            if df is None:
                df = metric_frame
            else:
                df = pd.merge(df, metric_frame, on=['engine_id', 'flight_datetime', 'flight_phase'], how='outer')
        if df is not None:
            group_frames.append(df)

    # Concatenate once instead of growing the result inside the loop.
    return pd.concat(group_frames) if group_frames else None


            

In [71]:
# Predict every target metric for X_valid.csv with the models saved under models/exp17.
tmp = predict('X_valid.csv', 'models/exp17')

In [73]:
# Save the predictions to y_valid.csv without the DataFrame index column.
tmp.to_csv('y_valid.csv', index=False)

In [104]:
# Show the predictions as a list of per-row records; metrics that are NaN for a row appear as None.
dataframe_to_dict(tmp)

[{'engine_id': 'bf23db5ac85aa359c82cb8a18b0d0ce406fc40272a9b009ee01d62a88d4cfc61',
  'flight_phase': 'CRUISE',
  'flight_datetime': '2022-06-07 02:48:28',
  'metrics': {'BRAT': 1.0000000852432864,
   'DEGT': 11.733447669504635,
   'EGTC': 766.1791414645561,
   'GPCN25': 0.48720686655286943,
   'GWFM': 4.423521478467699,
   'PCN12': 91.43790130170532,
   'PCN12I': 91.44053903293761,
   'PCN1K': 97.48274691637293,
   'PCN2C': 89.3778210724457,
   'WBI': 8.524372461203578e-08,
   'WFMP': 1793.597118716451,
   'ZPCN25_D': 0.14028036939800445,
   'ZT49_D': -2.0620197136883136,
   'ZTLA_D': -0.013400144113976265,
   'ZWF36_D': 4.296574907796749,
   'EGTHDM': None,
   'EGTHDM_D': None,
   'SLOATL': None,
   'SLOATL_D': None,
   'DELVSV': None,
   'DPOIL': None,
   'GEGTMC': None,
   'GN2MC': None,
   'WBE': None,
   'ZTNAC_D': None,
   'DELFN': None,
   'DELN1': None,
   'PCN1AR': None,
   'PCN1BR': None}},
 {'engine_id': '0a88b8ae00eb0b817075fb953711648d5d3dd8687e57946ccff881382b6606ae',
  '