In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple, Set, Callable
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive (Colab-only helper) so that the CSV files read further
# down from /content/drive/... are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def grouper(input: Dict[Tuple[str, ...],pd.DataFrame],
            keys: Tuple[str, ...]) -> Dict[Tuple[str, ...], pd.DataFrame]:
    """
    Refine a dict of DataFrames by grouping every frame on further columns.

    The columns named in ``keys`` are consumed from the last one to the first
    one. On each pass every frame is replaced by its ``groupby`` sub-frames
    and the group value is prepended to that frame's key, so the resulting
    key tuples list the group values in the same order as ``keys``, followed
    by whatever the incoming key already contained.

    :param input: frames to split; keys may be tuples or bare values (a bare
                  value is treated as a 1-tuple).
    :param keys: column names to group by; when empty, ``input`` is returned
                 unchanged (the very same object).
    :return: dict mapping the extended key tuples to the matching sub-frames.
    """
    frames = input
    for column in reversed(keys):
        refined: Dict[Tuple[str, ...], pd.DataFrame] = {}
        for label, frame in frames.items():
            # Promote a bare label to a 1-tuple so it can be unpacked below.
            suffix = label if isinstance(label, tuple) else (label, )
            for value, part in frame.groupby(column):
                refined[(value, *suffix)] = part
        frames = refined
    return frames


def group(input: pd.DataFrame,
          keys: Tuple[str, ...] = ("engine_family", "flight_phase")) -> Dict[Tuple[str, ...], pd.DataFrame]:
    """
    Split a DataFrame into one sub-frame per combination of values of ``keys``.

    :param input: DataFrame to split; it must contain every column named in ``keys``.
    :param keys: column names to group by. The resulting dict keys are tuples
                 holding the group values in this same order.
    :return: dict mapping ``(value_of_keys[0], value_of_keys[1], ...)`` to the
             rows of ``input`` that carry exactly those values. The dict keys
             are always tuples: a single grouping column yields 1-tuples, and
             an empty ``keys`` yields ``{(): input}``.
    """
    # Seeding the recursion with the whole frame under the empty key lets
    # `grouper` perform every grouping pass, including the first one. That
    # keeps the key type uniform (always a tuple, as the annotation promises)
    # and avoids indexing into an empty `keys`.
    return grouper({(): input}, keys)

In [None]:
def validate(df, thresh=20):
    """
    Drop sparse columns first, then every row that still has a missing value.

    :param df: DataFrame to clean; it is not modified.
    :param thresh: minimum number of non-null values a column needs in order
                   to be kept (default 20, the value previously hard-coded).
    :return: new DataFrame without the sparse columns and without any NaN.
    """
    # Columns go first: removing rows first would discard every row that has
    # a gap in a column which is going to be thrown away anyway.
    return df.dropna(axis='columns', thresh=thresh).dropna()

In [None]:
# Folder on the mounted Drive that holds both tables; kept in one place so
# the notebook can be pointed at another location with a single edit.
DATA_DIR = '/content/drive/MyDrive/S7'

# X is the table the model inputs are taken from, y the table of targets.
X = pd.read_csv(f'{DATA_DIR}/X.csv')
y = pd.read_csv(f'{DATA_DIR}/y.csv')

In [None]:
# One feature frame per (flight_phase, engine_family) pair.
splitted_X = group(
    X,
    keys=("flight_phase", "engine_family"),
)

In [None]:
# Attach engine_family (taken from X) to the targets so they can be split on
# the same two columns as the features.
# NOTE(review): presumably y has no engine_family column of its own - confirm.
merged_y = X[["engine_id", "flight_datetime", "flight_phase", "engine_family"]].merge(
    y,
    on=["engine_id", "flight_datetime", "flight_phase"],
)
splitted_y = group(
    merged_y,
    keys=("flight_phase", "engine_family"),
)

In [None]:
class DataFrameDataset(Dataset):
    """
    Map-style dataset that serves ``(input, target)`` float32 tensors taken
    from the rows of a DataFrame.

    The selected columns are converted to tensors once, at construction time,
    so the dataset reflects the frame as it was when the dataset was created
    and all selected columns must be convertible to float32 at that point.

    :param df: source DataFrame, one sample per row.
    :param incols: labels of the columns that form the input vector.
    :param outcols: labels of the columns that form the target vector.
    """

    def __init__(self, df, incols, outcols):
        self.df = df
        self.incols = incols
        self.outcols = outcols
        # Converting here instead of in __getitem__ avoids re-slicing the
        # whole frame for every single sample.
        self._inputs = self._as_float_tensor(df, incols)
        self._targets = self._as_float_tensor(df, outcols)

    @staticmethod
    def _as_float_tensor(df, cols):
        """Return the selected columns of ``df`` as a float32 tensor."""
        # Go through NumPy explicitly rather than handing a pandas object to
        # torch.tensor.
        return torch.tensor(df[cols].to_numpy(dtype="float32"), dtype=torch.float32)

    def __len__(self):
        # Number of rows, as a plain int. (Counting the non-null values of
        # the first column would under-report when that column has gaps.)
        return len(self.df)

    def __getitem__(self, index):
        """
        Return the ``(input, target)`` pair selected by ``index``.

        :param index: a row position (int, slice, ...), or a 2-tuple
                      ``(inner, rows)`` where ``rows`` selects the row(s)
                      first and ``inner`` then indexes the tensors obtained
                      for those rows.
        """
        if isinstance(index, tuple):
            inner = index[0]
            rows = index[1]
            return self._inputs[rows][inner], self._targets[rows][inner]
        return self._inputs[index], self._targets[index]

In [None]:
def create_dataset(dataset, lookback):
    """Transform a time series into a prediction dataset

    Window ``w`` of the features covers time steps ``w .. w+lookback-1`` and
    the matching target window covers ``w+1 .. w+lookback``, i.e. the same
    window shifted one step into the future.

    Args:
        dataset: A numpy array of time series, first dimension is the time steps
        lookback: Size of window for prediction

    Returns:
        Tuple ``(features, targets)`` of tensors, each of shape
        ``(len(dataset) - lookback, lookback, *dataset.shape[1:])``. When the
        series is not longer than ``lookback`` both tensors have zero windows.
    """
    series = np.asarray(dataset)
    n_windows = max(len(series) - lookback, 0)
    # Row w of `steps` holds the time indices w, w+1, ..., w+lookback-1.
    # Indexing only the time axis keeps every feature column in each window;
    # the window start must not be reused as a column index.
    steps = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    features = series[steps]
    targets = series[steps + 1]
    # Building both tensors from one contiguous array each avoids the slow
    # list-of-arrays conversion.
    return torch.as_tensor(features), torch.as_tensor(targets)

In [None]:
# Build a train/validation DataLoader pair for every
# (flight_phase, engine_family) group.

# Columns shared by the feature and target frames; used as the join key.
id_cols = ["engine_id", "flight_datetime", "flight_phase", "engine_family"]
# Identifier / descriptive columns that are removed before building inputs.
meta_cols = [
    "engine_id", "aircraft_id", "flight_datetime",
    "flight_phase", "engine_family", "engine_type", "manufacturer",
    "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]

dataloaders = {}
for key, X_i in splitted_X.items():
    # Pair features and targets by key, not by position: the two dicts are
    # built from different frames and need not hold the same groups.
    if key not in splitted_y:
        continue
    X_i = validate(X_i)
    y_i = validate(splitted_y[key])
    input_cols = X_i.drop(columns=meta_cols).columns
    output_cols = y_i.drop(columns=id_cols).columns

    prepaired = (
        pd.merge(X_i, y_i, on=id_cols)
        .drop(columns=meta_cols)
        # Cast once, before splitting, so both parts get the same dtype.
        .astype({"engine_position": "float"})
    )
    if len(prepaired) < 2:
        # Not enough rows to form both a train and a validation part.
        continue

    train, test = train_test_split(prepaired, test_size=0.2)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # The DataLoaders consume the row-wise datasets directly. create_dataset
    # expects an array of time steps and must not be applied to them.
    train_dataset = DataFrameDataset(train, input_cols, output_cols)
    test_dataset = DataFrameDataset(test, input_cols, output_cols)

    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=1)
    test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=1)
    # "train_size" / "valid_size" hold the number of input / output columns;
    # the key names are kept because later cells read them.
    dataloaders[key] = {"train": train_dataloader,
                        "train_size": len(input_cols),
                        "valid": test_dataloader,
                        "valid_size": len(output_cols)}

In [None]:
# Build the DataLoaders for one group only, with fuel flow ("ZWF36_D") as the
# single target.

id_cols = ["engine_id", "flight_datetime", "flight_phase", "engine_family"]
meta_cols = [
    "engine_id", "aircraft_id", "flight_datetime",
    "flight_phase", "engine_family", "engine_type", "manufacturer",
    "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]

dataloaders = {}
# Pick the group by name instead of taking whichever entry happens to come
# first: the model cells below look up exactly this key in `dataloaders`.
key = ('CRUISE', 'CF34-8E')
X_i = validate(splitted_X[key])
y_i = validate(splitted_y[key])
input_cols = X_i.drop(columns=meta_cols).columns
output_cols = ["ZWF36_D"]

prepaired = (
    pd.merge(X_i, y_i[id_cols + output_cols], on=id_cols)
    .drop(columns=meta_cols)
    # Cast once, before splitting, so both parts get the same dtype.
    .astype({"engine_position": "float"})
)

train, test = train_test_split(prepaired, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train_dataset = DataFrameDataset(train, input_cols, output_cols)
test_dataset = DataFrameDataset(test, input_cols, output_cols)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=1)
# "train_size" / "valid_size" hold the number of input / output columns; the
# key names are kept because later cells read them.
dataloaders[key] = {"train": train_dataloader,
                    "train_size": len(input_cols),
                    "valid": test_dataloader,
                    "valid_size": len(output_cols)}

In [None]:
class RMSELoss(nn.Module):
    """
    Root-mean-square error: ``sqrt(MSE(yhat, y) + eps)``.

    :param eps: constant added under the square root (default ``1e-6``);
                presumably there to keep the gradient finite when the error
                is exactly zero.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self,yhat,y):
        """Return the scalar RMSE between predictions ``yhat`` and targets ``y``."""
        mean_squared = self.mse(yhat, y)
        return (mean_squared + self.eps).sqrt()

In [None]:
# Group the model is trained and evaluated on.
test_key = ('CRUISE', 'CF34-8E')
# "train_size" / "valid_size" hold the number of input / output columns.
input_size = dataloaders[test_key]["train_size"]
hidden_size1 = 64
hidden_size2 = 128
output_size = dataloaders[test_key]["valid_size"]
# Adam's usual step size. The previous value of 1000 makes every update
# overshoot, which is why the training loss never moved.
learning_rate = 1e-3
num_epochs = 15

In [None]:
class AirModel(nn.Module):
    """
    Two stacked ``nn.LSTMCell`` layers followed by a linear output layer.

    ``forward`` passes no hidden or cell state to the cells, so every call
    starts from the cells' default initial state and nothing is carried over
    from one batch to the next.

    :param in_features: width of the input vector; defaults to the
                        notebook-level ``input_size``.
    :param hidden1: hidden size of the first cell; defaults to ``hidden_size1``.
    :param hidden2: hidden size of the second cell; defaults to ``hidden_size2``.
    :param out_features: width of the prediction; defaults to ``output_size``.
    """

    def __init__(self, in_features=None, hidden1=None, hidden2=None, out_features=None):
        super().__init__()
        # Fall back to the notebook-level hyperparameters so that a bare
        # AirModel() keeps working exactly as before.
        in_features = input_size if in_features is None else in_features
        hidden1 = hidden_size1 if hidden1 is None else hidden1
        hidden2 = hidden_size2 if hidden2 is None else hidden2
        out_features = output_size if out_features is None else out_features

        self.lstm1 = nn.LSTMCell(in_features, hidden1)
        self.lstm2 = nn.LSTMCell(hidden1, hidden2)
        # The output width follows the number of targets instead of being
        # fixed at 1, so multi-target groups get one prediction per target.
        self.linear = nn.Linear(hidden2, out_features)

    def forward(self, x):
        """Map a batch ``x`` of input vectors to a batch of predictions."""
        h_t, c_t = self.lstm1(x)  # hidden and cell state produced by the first cell
        h_t2, c_t2 = self.lstm2(h_t)  # second cell is fed the first cell's hidden state
        output = self.linear(h_t2)  # project the last hidden state to the targets
        return output

In [None]:
# Network, loss and optimiser used by the training loop below.
model = AirModel()
criterion = RMSELoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
)

In [None]:
def _run_epoch(model, dataloader, criterion, optimizer=None):
    """
    Run one pass over ``dataloader`` and return the sample-weighted mean loss.

    :param model: network to train or evaluate.
    :param dataloader: yields ``(x_batch, y_batch)`` pairs.
    :param criterion: loss called as ``criterion(prediction, target)``.
    :param optimizer: when given, the pass trains the model with it; when
                      ``None`` the pass only evaluates, without gradients.
    :return: sum of ``batch loss * batch size`` divided by the dataset size.
    """
    training = optimizer is not None
    # Switch the mode once per pass rather than once per batch.
    model.train(training)
    total_loss = .0
    with torch.set_grad_enabled(training):
        for x_batch, y_batch in dataloader:
            if training:
                optimizer.zero_grad()
            outp = model(x_batch)
            loss = criterion(outp, y_batch)
            if training:
                loss.backward()
                optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            total_loss += loss.item() * x_batch.shape[0]
    return total_loss / len(dataloader.dataset)


for epoch in range(num_epochs):
    train_loss = _run_epoch(model, dataloaders[test_key]["train"], criterion, optimizer)
    print(f'Epoch {epoch+1}, Train_Loss: {train_loss:.4f}')

    valid_loss = _run_epoch(model, dataloaders[test_key]["valid"], criterion)
    print(f'Epoch {epoch+1}, Valid_Loss: {valid_loss:.4f}')

Epoch 1, Train_Loss: 6888.6151
Epoch 1, Valid_Loss: 9218.2325
Epoch 2, Train_Loss: 6553.0729
Epoch 2, Valid_Loss: 2371.8382
Epoch 3, Train_Loss: 6551.7599
Epoch 3, Valid_Loss: 4763.4931
Epoch 4, Train_Loss: 6542.9852
Epoch 4, Valid_Loss: 9865.6993
Epoch 5, Train_Loss: 6553.9445
Epoch 5, Valid_Loss: 7652.2018
Epoch 6, Train_Loss: 6551.0681
Epoch 6, Valid_Loss: 3773.0108
Epoch 7, Train_Loss: 6543.4293
Epoch 7, Valid_Loss: 6020.7443
Epoch 8, Train_Loss: 6551.8268
Epoch 8, Valid_Loss: 8741.5046
Epoch 9, Train_Loss: 6549.8943
Epoch 9, Valid_Loss: 8664.2141
Epoch 10, Train_Loss: 6550.0229
Epoch 10, Valid_Loss: 2866.1391
Epoch 11, Train_Loss: 6547.6099
Epoch 11, Valid_Loss: 5194.0581
Epoch 12, Train_Loss: 6545.5316
Epoch 12, Valid_Loss: 9476.8305
Epoch 13, Train_Loss: 6552.1875
Epoch 13, Valid_Loss: 8000.2696
Epoch 14, Train_Loss: 6550.6206
Epoch 14, Valid_Loss: 3463.1244
Epoch 15, Train_Loss: 6544.5391
Epoch 15, Valid_Loss: 5734.3843


Видим, что персептрон плохо подходит для прогнозирования параметра fuel flow.