In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple, Set, Callable
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def grouper(input: Dict[Tuple[str, ...],pd.DataFrame],
            keys: Tuple[str, ...]) -> Dict[Tuple[str, ...], pd.DataFrame]:
    """
    Helper function for recursive grouping dict of pandas DataFrames.
    """
    if not keys:
        return input

    to_tuple_transformer: Callable = lambda element: element if isinstance(element, tuple) else (element, )

    grouped_df: Dict[Tuple[str, ...], pd.DataFrame] = {
        (next_key, *to_tuple_transformer(curr_key)): next_df
        for curr_key, curr_df in input.items()
        for next_key, next_df in curr_df.groupby(keys[-1])
    }

    return grouper(grouped_df, keys[:-1])


def group(input: pd.DataFrame,
          keys: Tuple[str, ...] = ("engine_family", "flight_phase")) -> Dict[Tuple[str, ...], pd.DataFrame]:
    """
    Mock function for maintenance characteristics prediction.
    :param input: input DataFrame of aircraft and engine characteristics.
    :param keys: keys that are used for grouping output data.
    :return: output groups of DataFrames of predicted maintenance characteristics.
    """
    phase_df: Dict = {k: v for k, v in input.groupby(keys[-1])}

    return grouper(phase_df, keys[:-1])

In [None]:
def validate(df):
    return df.dropna(axis='columns', thresh=20).dropna()

In [None]:
X = pd.read_csv('/content/drive/MyDrive/S7/X.csv')
y = pd.read_csv('/content/drive/MyDrive/S7/y.csv')

In [None]:
splitted_X = group(X, ("flight_phase", "engine_family"))

In [None]:
merged_y = pd.merge(X[["engine_id", "flight_datetime", "flight_phase", "engine_family"]], y, on=["engine_id", "flight_datetime", "flight_phase"])
splitted_y = group(merged_y, ("flight_phase", "engine_family"))

In [None]:
class DataFrameDataset(Dataset):
    def __init__(self, df, incols, outcols):
        self.df = df
        self.incols = incols
        self.outcols = outcols
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        #print(self.incols)
        #print(self.outcols)
        for i in self.df.columns:
          (self.df[i]-self.df[i].min())/(self.df[i].max()-self.df[i].min())
        input = torch.tensor(self.df[self.incols].iloc[index], dtype=torch.float32)
        # input.type(dtype=torch.float32)
        output = torch.tensor(self.df[self.outcols].iloc[index], dtype=torch.float32)
        # output.type(dtype=torch.float32)
        return input, output

In [None]:
dataloaders = {}
for (key, X_i), (_, y_i) in zip(splitted_X.items(), splitted_y.items()):
    X_i = validate(X_i)
    y_i = validate(y_i)
    input_cols = X_i.drop(columns=[
        "engine_id", "aircraft_id", "flight_datetime",
        "flight_phase", "engine_family", "engine_type", "manufacturer",
        "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]).columns
    output_cols = y_i.drop(columns=["engine_id", "flight_datetime", "flight_phase", "engine_family"]).columns

    prepaired = pd.merge(X_i, y_i, on=["engine_id", "flight_datetime", "flight_phase", "engine_family"])
    prepaired.drop(columns=[
        "engine_id", "aircraft_id", "flight_datetime",
        "flight_phase", "engine_family", "engine_type", "manufacturer",
        "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"], inplace=True)
    
    train, test = train_test_split(prepaired, test_size=0.2)
    train = train.reset_index()
    test = test.reset_index()
    train = train.drop(columns='index')
    test = test.drop(columns='index')
    train['engine_position'] = train['engine_position'].astype('float')
    test = test['engine_position'].astype('float')
    train_dataset = DataFrameDataset(train, input_cols, output_cols)
    test_dataset = DataFrameDataset(test, input_cols, output_cols)

    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=1)
    test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=1)
    dataloaders[key] = {"train": train_dataloader,
                        "train_size": len(input_cols),
                        "valid": test_dataloader,
                        "valid_size": len(output_cols)}

In [None]:
dataloaders = {}
(key, X_i), (_, y_i) = next(zip(splitted_X.items(), splitted_y.items()))
X_i = validate(X_i)
y_i = validate(y_i)
input_cols = X_i.drop(columns=[
    "engine_id", "aircraft_id", "flight_datetime",
    "flight_phase", "engine_family", "engine_type", "manufacturer",
    "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"]).columns
output_cols = ["ZWF36_D"]

prepaired = pd.merge(X_i, y_i[["engine_id", "flight_datetime", "flight_phase", "engine_family", "ZWF36_D"]], on=["engine_id", "flight_datetime", "flight_phase", "engine_family"])
prepaired.drop(columns=[
    "engine_id", "aircraft_id", "flight_datetime",
    "flight_phase", "engine_family", "engine_type", "manufacturer",
    "aircraft_family", "aircraft_type", "aircraft_grp", "ac_manufacturer"], inplace=True)


train, test = train_test_split(prepaired, test_size=0.2)
train = train.reset_index()
test = test.reset_index()
train = train.drop(columns='index')
test = test.drop(columns='index')
train['engine_position'] = train['engine_position'].astype('float')
test['engine_position']= test['engine_position'].astype('float')
train_dataset = DataFrameDataset(train, input_cols, output_cols)
test_dataset = DataFrameDataset(test, input_cols, output_cols)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=1)
dataloaders[key] = {"train": train_dataloader,
                    "train_size": len(input_cols),
                    "valid": test_dataloader,
                    "valid_size": len(output_cols)}

0) приветствие

1) проблематика + ценность

2-4) что ценного сделали (решение проекта -- тех. часть, масштабирование)

5) демо + QR?

6) спасибо за внимание

In [None]:
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps
        
    def forward(self,yhat,y):
        loss = torch.sqrt(self.mse(yhat, y) + self.eps)
        return loss

In [None]:
test_key = ('CRUISE', 'CF34-8E')
input_size = dataloaders[test_key]["train_size"]
hidden_size1 = 128
output_size = dataloaders[test_key]["valid_size"]
learning_rate = 1000
num_epochs = 15

In [None]:
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(input_size, hidden_size1),
    nn.Tanh()
)

In [None]:
criterion = RMSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
    for k, dataloader in dataloaders[test_key].items():
        epoch_correct = 0
        epoch_all = 0


        if k != "train" and k != "valid":
            continue
        if k == 'train':
          train_loss = .0
          for x_batch, y_batch in dataloader:
                model.train()
                optimizer.zero_grad()
                outp = model(x_batch)
                loss = criterion(outp, y_batch)
                loss.backward()
                optimizer.step()
                train_loss += loss.item() * x_batch.shape[0]
          train_loss /= len(dataloader.dataset)
          print(f'Epoch {epoch+1}, Train_Loss: {train_loss:.4f}')

        elif k == 'valid':
          valid_loss = .0
          model.eval()
          with torch.no_grad():
            for x_batch, y_batch in dataloader:
                outp = model(x_batch)
                loss = criterion(outp, y_batch )
                valid_loss += loss.item() * x_batch.shape[0]
            valid_loss /= len(dataloader.dataset)
            print(f'Epoch {epoch+1}, Valid_Loss: {valid_loss:.4f}')

Epoch 1, Train_Loss: 23.9305
Epoch 1, Valid_Loss: 23.5957
Epoch 2, Train_Loss: 23.9520
Epoch 2, Valid_Loss: 23.5957
Epoch 3, Train_Loss: 23.9435
Epoch 3, Valid_Loss: 23.5957
Epoch 4, Train_Loss: 23.9731
Epoch 4, Valid_Loss: 23.5957
Epoch 5, Train_Loss: 23.9418
Epoch 5, Valid_Loss: 23.5957
Epoch 6, Train_Loss: 23.9524
Epoch 6, Valid_Loss: 23.5957
Epoch 7, Train_Loss: 23.9262
Epoch 7, Valid_Loss: 23.5957
Epoch 8, Train_Loss: 23.9698
Epoch 8, Valid_Loss: 23.5957
Epoch 9, Train_Loss: 23.9614
Epoch 9, Valid_Loss: 23.5957
Epoch 10, Train_Loss: 23.9692
Epoch 10, Valid_Loss: 23.5957
Epoch 11, Train_Loss: 23.9006
Epoch 11, Valid_Loss: 23.5957
Epoch 12, Train_Loss: 23.9553
Epoch 12, Valid_Loss: 23.5957
Epoch 13, Train_Loss: 23.9388
Epoch 13, Valid_Loss: 23.5957
Epoch 14, Train_Loss: 23.9303
Epoch 14, Valid_Loss: 23.5957
Epoch 15, Train_Loss: 23.9651
Epoch 15, Valid_Loss: 23.5957


Видим, что персептрон плохо подходить для прогнозирования параметра fuel flow