In [None]:
# Install the third-party packages this notebook needs into the runtime.
# The numpy / pandas / tqdm installs below are left commented out.
# NOTE(review): lightgbm is imported by this notebook but is not installed here —
# presumably it is preinstalled in the runtime; confirm.
# !pip install numpy
# !pip install pandas
# !pip install tqdm
!pip install scikit-learn
!pip install catboost

In [30]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

pd.set_option('display.max_columns', 200)

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by path.
# The google.colab module means this cell is tied to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Folder that holds the input CSV files (located on the mounted Google Drive).
DATA_DIR = "/content/drive/MyDrive/S7"

# Coefficients of the linear rescaling applied to the ZTAMB column.
# NOTE(review): the reason for these particular constants is not documented in this notebook — confirm.
ZTAMB_OFFSET = 7.5
ZTAMB_SCALE = 40 / 21

# Load the two input tables.
X = pd.read_csv(f"{DATA_DIR}/X.csv")
y = pd.read_csv(f"{DATA_DIR}/y.csv")

# Rescale ZTAMB: new = 40/21 * (old - 7.5).
X['ZTAMB'] = ZTAMB_SCALE * (X['ZTAMB'] - ZTAMB_OFFSET)

In [5]:
def count_nan_cols(X : pd.DataFrame):
  """Print the number of missing values found in every column of ``X``.

  One line is printed per column, formatted as ``<column> : <count>``.
  Nothing is returned.

  Args:
    X: DataFrame whose columns are inspected.
  """
  # Vectorised count over the whole frame instead of a Python-level loop over every cell.
  nan_counts = X.isna().sum()
  for column, count in nan_counts.items():
    # int() keeps the printed value a plain integer, as before.
    print(column, ':', int(count))

In [78]:
def _select_subset(frame, phase, family):
  """Return an independent copy of the rows of ``frame`` for one flight phase and one engine family."""
  mask = (frame['flight_phase'] == phase) & (frame['engine_family'] == family)
  # .copy() gives every subset its own DataFrame, so later in-place edits
  # do not operate on a slice of the merged frame.
  return frame[mask].copy()


# Join the two input tables; with no keys given, merge() joins on the columns they share.
merged = X.merge(y)

# One subset per (flight phase, engine family) pair.
# Naming: M + t/c (TAKEOFF / CRUISE) + engine-family suffix.
Mt5b = _select_subset(merged, 'TAKEOFF', 'CFM56-5B')
Mc5b = _select_subset(merged, 'CRUISE', 'CFM56-5B')

Mt8e = _select_subset(merged, 'TAKEOFF', 'CF34-8E')
Mc8e = _select_subset(merged, 'CRUISE', 'CF34-8E')

Mt7 = _select_subset(merged, 'TAKEOFF', 'CFM56-7')
Mc7 = _select_subset(merged, 'CRUISE', 'CFM56-7')


In [79]:
# Variable names of the six per-phase / per-engine-family frames,
# used to look the frames up by name.
Mnames = [
    'Mt5b', 'Mc5b',  # CFM56-5B: takeoff, cruise
    'Mt8e', 'Mc8e',  # CF34-8E:  takeoff, cruise
    'Mt7', 'Mc7',    # CFM56-7:  takeoff, cruise
]

In [80]:
# Clean every subset in place:
#   1. drop columns that have fewer than 20 non-missing values,
#   2. then drop every row that still contains a missing value.
# The frames are looked up by name in the notebook namespace instead of
# building code strings for exec().
for name in Mnames:
  subset = globals()[name]
  subset.dropna(axis='columns', thresh=20, inplace=True)
  subset.dropna(inplace=True)

In [81]:
# Positions of the columns that are copied into every target frame.
# NOTE(review): presumably engine_id, flight_datetime and flight_phase (the keys the
# scoring functions merge on) — confirm against the actual column order.
col = [0, 2, 3]

# Every subset is cut purely by position: the leading columns become the feature
# frame (X*), the key columns plus a positional range of trailing columns become
# the target frame (y*).
# NOTE(review): the offsets are hard-coded and depend on the column layout that is
# left after the dropna step — re-check them whenever the data changes.

# CFM56-5B, takeoff
Xt5b = Mt5b.iloc[:, :-15]
yt5b = Mt5b.iloc[:, col + list(range(34, 49))]

# CFM56-5B, cruise
# NOTE(review): 24 trailing columns are excluded from the features but only 23 are
# selected as targets; in every other subset the two counts match — confirm this is intended.
Xc5b = Mc5b.iloc[:, :-24]
yc5b = Mc5b.iloc[:, col + list(range(39, 62))]

# CF34-8E, takeoff
Xt8e = Mt8e.iloc[:, :-10]
yt8e = Mt8e.iloc[:, col + list(range(40, 50))]

# CF34-8E, cruise
Xc8e = Mc8e.iloc[:, :-15]
yc8e = Mc8e.iloc[:, col + list(range(39, 54))]

# CFM56-7, takeoff
Xt7 = Mt7.iloc[:, :-14]
yt7 = Mt7.iloc[:, col + list(range(32, 46))]

# CFM56-7, cruise
Xc7 = Mc7.iloc[:, :-19]
yc7 = Mc7.iloc[:, col + list(range(29, 48))]

In [82]:
# Variable names of the feature frames and of the matching target frames.
# Both lists are in the same order, so position k in one pairs with position k in the other.
Xnames = [
    'Xt5b', 'Xc5b',
    'Xt8e', 'Xc8e',
    'Xt7', 'Xc7',
]
ynames = [
    'yt5b', 'yc5b',
    'yt8e', 'yc8e',
    'yt7', 'yc7',
]

In [83]:
def cb_score(predict_field, XX, yy):
    """Train a CatBoost regressor for one target column and return its hold-out RMSE.

    ``XX`` and ``yy`` are joined on ``engine_id``, ``flight_datetime`` and
    ``flight_phase``; rows where ``predict_field`` is missing are discarded.
    A fixed list of identifier / descriptor columns is removed, and every
    remaining column except ``predict_field`` is used as a feature.

    NOTE(review): the other columns of ``yy`` therefore stay in the feature
    matrix — confirm this is intended and not target leakage.

    Args:
        predict_field: Name of the column to predict.
        XX: DataFrame holding the join keys and the feature columns.
        yy: DataFrame holding the join keys and ``predict_field``.

    Returns:
        Root mean squared error on a 25 % test split (``random_state=42``).

    Raises:
        KeyError: If a join key, ``predict_field`` or one of the dropped
            columns is missing.
    """
    non_feature_columns = [
        "engine_id", "aircraft_id", "flight_datetime",
        "flight_phase", "engine_family", "engine_type", "manufacturer",
        "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]

    df = pd.merge(XX, yy, on=["engine_id", "flight_datetime", "flight_phase"])
    df = df[df[predict_field].notna()]

    features = df.drop(columns=[predict_field]).drop(columns=non_feature_columns)
    target = df[predict_field]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42)

    model = CatBoostRegressor(iterations=500, verbose=False)
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the RMSE on every version.
    return float(np.sqrt(mean_squared_error(y_test, test_predictions)))

In [84]:
# Score every target column of every subset with CatBoost and print one RMSE table per subset.
for x_name, y_name in zip(Xnames, ynames):
    print('Dataset : ', x_name)
    # Frames are looked up by name in the notebook namespace rather than
    # through exec() on generated code strings.
    features_frame = globals()[x_name]
    targets_frame = globals()[y_name]
    output_parameters = targets_frame.columns.drop(['flight_datetime', 'flight_phase', 'engine_id'])

    rows = []
    for param in tqdm(output_parameters):
        try:
            score = cb_score(param, features_frame, targets_frame)
        except Exception as err:
            # Still best effort (one failing target does not stop the run), but the
            # failure is reported instead of the parameter silently disappearing.
            tqdm.write(f"skipped {param}: {type(err).__name__}: {err}")
            continue
        rows.append([param, score])

    # Build the table once instead of growing a DataFrame row by row.
    scores = pd.DataFrame(rows, columns=["parameter", "rmse"])
    print(scores)

Dataset :  Xt5b


100%|██████████| 15/15 [00:50<00:00,  3.38s/it]


   parameter      rmse
0       BRAT  0.010445
1      DELFN  0.215623
2      DELN1  0.094042
3     EGTHDM  2.550564
4   EGTHDM_D  0.209373
5      PCN12  0.167429
6     PCN12I  0.168264
7     PCN1AR  0.166453
8     PCN1BR  0.161637
9      PCN1K  0.189601
10    SLOATL  0.933299
11  SLOATL_D  0.059752
12       WBE  0.013611
13  ZPCN25_D  0.083510
14    ZT49_D  0.765873
Dataset :  Xc5b


100%|██████████| 23/23 [02:26<00:00,  6.37s/it]


   parameter      rmse
0       BRAT  0.001935
1       DEGT  0.565583
2     DELVSV  0.012281
3      DPOIL  0.049766
4       EGTC  0.707202
5     EGTHDM  0.622020
6     GEGTMC  0.530380
7      GN2MC  0.016930
8     GPCN25  0.014673
9       GWFM  0.364877
10     PCN12  0.051987
11    PCN12I  0.051987
12     PCN1K  0.099796
13     PCN2C  0.031353
14    SLOATL  0.214646
15       WBE  0.003874
16       WBI  0.002549
17      WFMP  7.749193
18  ZPCN25_D  0.036888
19    ZT49_D  1.209436
20   ZTNAC_D  1.992801
21   ZWF36_D  6.286841
Dataset :  Xt8e


100%|██████████| 10/10 [01:01<00:00,  6.17s/it]


  parameter       rmse
0      BRAT   0.009148
1    EGTHDM   0.356304
2  EGTHDM_D   0.243982
3     PCN12   0.053027
4     PCN1K   0.043683
5    SLOATL   0.102539
6  SLOATL_D   0.068931
7  ZPCN25_D   0.049735
8    ZT49_D   0.454237
9   ZWF36_D  19.404562
Dataset :  Xc8e


100%|██████████| 15/15 [01:41<00:00,  6.77s/it]


   parameter      rmse
0       BRAT  0.000572
1       DEGT  2.457834
2       EGTC  1.104739
3     GPCN25  0.066851
4       GWFM  0.433354
5      PCN12  0.076237
6     PCN12I  0.066664
7      PCN1K  0.079818
8      PCN2C  0.043441
9        WBI  0.000572
10      WFMP  4.553451
11  ZPCN25_D  0.049547
12    ZT49_D  3.657592
13    ZTLA_D  0.057342
14   ZWF36_D  8.664141
Dataset :  Xt7


100%|██████████| 14/14 [01:11<00:00,  5.09s/it]


   parameter      rmse
0       BRAT  0.009424
1      DELFN  0.124969
2      DELN1  0.099247
3     EGTHDM  0.383315
4   EGTHDM_D  0.164772
5      PCN12  0.064799
6     PCN12I  0.056194
7     PCN1AR  0.090109
8     PCN1BR  0.088188
9      PCN1K  0.077520
10    SLOATL  0.121331
11  SLOATL_D  0.047761
12  ZPCN25_D  0.075100
13    ZT49_D  0.958128
Dataset :  Xc7


100%|██████████| 19/19 [01:35<00:00,  5.02s/it]

   parameter      rmse
0       DEGT  0.300055
1      DPOIL  0.050490
2       EGTC  0.595171
3     EGTHDM  0.464938
4     GEGTMC  0.271239
5      GN2MC  0.006869
6     GPCN25  0.006450
7       GWFM  0.283252
8      PCN12  0.040840
9     PCN12I  0.040840
10     PCN1K  0.044413
11     PCN2C  0.028275
12    SLOATL  0.144230
13      WFMP  3.690094
14  ZPCN25_D  0.041472
15    ZT49_D  2.112228
16   ZWF36_D  5.661522





In [85]:
def lgbm_score(predict_field, XX, yy):
    """Train a LightGBM regressor for one target column and return its hold-out RMSE.

    ``XX`` and ``yy`` are joined on ``engine_id``, ``flight_datetime`` and
    ``flight_phase``; rows where ``predict_field`` is missing are discarded.
    A fixed list of identifier / descriptor columns is removed, and every
    remaining column except ``predict_field`` is used as a feature.

    NOTE(review): the other columns of ``yy`` therefore stay in the feature
    matrix — confirm this is intended and not target leakage.

    Args:
        predict_field: Name of the column to predict.
        XX: DataFrame holding the join keys and the feature columns.
        yy: DataFrame holding the join keys and ``predict_field``.

    Returns:
        Root mean squared error on a 25 % test split (``random_state=42``).

    Raises:
        KeyError: If a join key, ``predict_field`` or one of the dropped
            columns is missing.
    """
    non_feature_columns = [
        "engine_id", "aircraft_id", "flight_datetime",
        "flight_phase", "engine_family", "engine_type", "manufacturer",
        "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]

    df = pd.merge(XX, yy, on=["engine_id", "flight_datetime", "flight_phase"])
    df = df[df[predict_field].notna()]

    features = df.drop(columns=[predict_field]).drop(columns=non_feature_columns)
    target = df[predict_field]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42)

    # Default LightGBM hyper-parameters.
    model = LGBMRegressor()
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the RMSE on every version.
    # The result is held in `rmse` so no local shadows the function's own name.
    rmse = float(np.sqrt(mean_squared_error(y_test, test_predictions)))
    return rmse

In [86]:
# Score every target column of every subset with LightGBM and print one RMSE table per subset.
for x_name, y_name in zip(Xnames, ynames):
    print('Dataset : ', x_name)
    # Frames are looked up by name in the notebook namespace rather than
    # through exec() on generated code strings.
    features_frame = globals()[x_name]
    targets_frame = globals()[y_name]
    output_parameters = targets_frame.columns.drop(['flight_datetime', 'flight_phase', 'engine_id'])

    rows = []
    for param in tqdm(output_parameters):
        try:
            score = lgbm_score(param, features_frame, targets_frame)
        except Exception as err:
            # Still best effort (one failing target does not stop the run), but the
            # failure is reported instead of the parameter silently disappearing.
            tqdm.write(f"skipped {param}: {type(err).__name__}: {err}")
            continue
        rows.append([param, score])

    # Build the table once instead of growing a DataFrame row by row.
    scores = pd.DataFrame(rows, columns=["parameter", "rmse"])
    print(scores)

Dataset :  Xt5b


100%|██████████| 15/15 [00:04<00:00,  3.44it/s]


   parameter      rmse
0       BRAT  0.013761
1      DELFN  0.224656
2      DELN1  0.123318
3     EGTHDM  2.325645
4   EGTHDM_D  0.123267
5      PCN12  0.171598
6     PCN12I  0.155378
7     PCN1AR  0.154302
8     PCN1BR  0.168396
9      PCN1K  0.156837
10    SLOATL  1.419165
11  SLOATL_D  0.045466
12       WBE  0.015269
13  ZPCN25_D  0.084865
14    ZT49_D  0.785672
Dataset :  Xc5b


100%|██████████| 23/23 [00:27<00:00,  1.21s/it]


   parameter       rmse
0       BRAT   0.007127
1       DEGT   0.603038
2     DELVSV   0.032136
3      DPOIL   0.063277
4       EGTC   0.774277
5     EGTHDM   0.705264
6     GEGTMC   0.601644
7      GN2MC   0.047333
8     GPCN25   0.049280
9       GWFM   0.420539
10     PCN12   0.103739
11    PCN12I   0.103739
12     PCN1K   0.190249
13     PCN2C   0.032880
14    SLOATL   0.204362
15    VSVNOM   0.000000
16       WBE   0.006359
17       WBI   0.007126
18      WFMP  10.316546
19  ZPCN25_D   0.038334
20    ZT49_D   1.299324
21   ZTNAC_D   2.225659
22   ZWF36_D   6.667480
Dataset :  Xt8e


100%|██████████| 10/10 [00:10<00:00,  1.07s/it]


  parameter       rmse
0      BRAT   0.000007
1    EGTHDM   0.337281
2  EGTHDM_D   0.182473
3     PCN12   0.050723
4     PCN1K   0.048058
5    SLOATL   0.098698
6  SLOATL_D   0.052702
7  ZPCN25_D   0.055363
8    ZT49_D   0.449380
9   ZWF36_D  22.147783
Dataset :  Xc8e


100%|██████████| 15/15 [00:14<00:00,  1.03it/s]


   parameter      rmse
0       BRAT  0.000002
1       DEGT  3.549591
2       EGTC  1.305048
3     GPCN25  0.074385
4       GWFM  0.541135
5      PCN12  0.104632
6     PCN12I  0.095343
7      PCN1K  0.097771
8      PCN2C  0.050804
9        WBI  0.000002
10      WFMP  4.982666
11  ZPCN25_D  0.053112
12    ZT49_D  3.829023
13    ZTLA_D  0.051606
14   ZWF36_D  8.906177
Dataset :  Xt7


100%|██████████| 14/14 [00:11<00:00,  1.24it/s]


   parameter      rmse
0       BRAT  0.000002
1      DELFN  0.122788
2      DELN1  0.088076
3     EGTHDM  0.326272
4   EGTHDM_D  0.145461
5      PCN12  0.068701
6     PCN12I  0.052395
7     PCN1AR  0.088180
8     PCN1BR  0.079453
9      PCN1K  0.071841
10    SLOATL  0.096270
11  SLOATL_D  0.047381
12  ZPCN25_D  0.077042
13    ZT49_D  0.976120
Dataset :  Xc7


100%|██████████| 19/19 [00:13<00:00,  1.37it/s]

   parameter      rmse
0       BRAT  0.017125
1       DEGT  0.306294
2      DPOIL  0.101948
3       EGTC  0.796295
4     EGTHDM  0.444674
5     GEGTMC  0.285334
6      GN2MC  0.006917
7     GPCN25  0.006730
8       GWFM  0.336543
9      PCN12  0.053743
10    PCN12I  0.053743
11     PCN1K  0.103726
12     PCN2C  0.031026
13    SLOATL  0.140968
14       WBI  0.017125
15      WFMP  4.471431
16  ZPCN25_D  0.042230
17    ZT49_D  2.202510
18   ZWF36_D  5.915052



