In [None]:
import pandas as pd
import numpy as np
import math 
import scipy.stats as st
from sklearn.preprocessing import Normalizer


import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [None]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# The matching legacy tensor constructors are imported alongside for code that uses them.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor

# Detect Google Colab by trying to import its helper package.
# Only ImportError is caught: a missing package means "not in Colab", while any
# other failure is a real problem that should not be silently swallowed.
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False

# In Colab the data is read from the mounted Google Drive, otherwise from a local folder.
if is_in_colab:
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/IDAO_2020/'
else:
    data_folder = r'./data/'
    

In [None]:
# Bash command that creates a folder on the mounted Google Drive for storing the data.
# Run it once after mounting the drive so the folder does not have to be created by hand.
# ! mkdir -p '/content/drive/My Drive/Colab/IDAO_2020/'

In [None]:
def save_model(path, model, optimizer, loss_history, train_history, val_history):
    """Write a training checkpoint with ``torch.save``.

    The checkpoint is a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict``, ``loss``, ``loss_history``, ``train_history``
    and ``val_history``.

    :param path: destination passed straight to ``torch.save`` (a file path or
        a writable file-like object)
    :param model: module whose ``state_dict()`` is stored
    :param optimizer: optimizer whose ``state_dict()`` is stored
    :param loss_history: sequence of recorded losses; its last element is stored
        under ``loss`` (``None`` when the sequence is empty)
    :param train_history: per-epoch training history; its length is stored as ``epoch``
    :param val_history: per-epoch validation history
    """
    # An empty history has no last element; store None instead of raising IndexError.
    last_loss = loss_history[-1] if len(loss_history) > 0 else None
    checkpoint = {
        'epoch': len(train_history),
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': last_loss,
        'loss_history': loss_history,
        'train_history': train_history,
        'val_history': val_history,
    }
    torch.save(checkpoint, path)
    print('successfully saved')
    
def load_model(path, model, optimizer, loss_history, train_history, val_history):
    """Restore a checkpoint written by ``save_model``.

    The model and optimizer are updated through ``load_state_dict``. Each history
    argument that is a list is overwritten in place with the stored values, so the
    caller's own lists reflect the checkpoint. Rebinding the parameter names, as
    the earlier version did, never reached the caller.

    :param path: source passed to ``torch.load`` (a file path or a readable file-like object)
    :param model: module that receives ``model_state_dict``
    :param optimizer: optimizer that receives ``optimizer_state_dict``
    :param loss_history: list to fill with the stored loss history (may be ``None``)
    :param train_history: list to fill with the stored train history (may be ``None``)
    :param val_history: list to fill with the stored validation history (may be ``None``)
    :return: tuple ``(loss_history, train_history, val_history)`` holding the restored values
    """
    # Deserialize onto the CPU so a checkpoint saved on a GPU machine can be opened
    # on a CPU-only one; load_state_dict then copies values into the existing
    # parameters / optimizer state on whatever device they already live on.
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    restored = []
    for target, key in ((loss_history, 'loss_history'),
                        (train_history, 'train_history'),
                        (val_history, 'val_history')):
        stored = checkpoint[key]
        if isinstance(target, list):
            # Slice assignment mutates the caller's list instead of a local name.
            target[:] = stored
            restored.append(target)
        else:
            restored.append(stored)

    print('successfully loaded')
    return tuple(restored)

In [None]:
# Read the training table, parsing the 'epoch' column as datetimes, then drop the
# first two columns by position.
# NOTE(review): presumably those two are a row id and the epoch itself - confirm against train.csv.
data = pd.read_csv(
    data_folder + 'train.csv',
    parse_dates=['epoch'],
).iloc[:, 2:]

In [None]:
# Label every row with its satellite id. The key is passed as a Series rather than
# a column name, so 'sat_id' also remains available as a regular column.
data.set_index(data['sat_id'], inplace=True)

In [None]:
def split_data(values, coeff=0.7, seed=None):
    """Randomly partition ``range(values)`` into train / validation / test index lists.

    :param values: number of items to split; the indices are ``0 .. values - 1``
    :param coeff: fraction of items that go to the train part
    :param seed: optional seed for a reproducible shuffle; when ``None`` the global
        ``np.random`` state is used
    :return: ``(train_indices, val_indices, test_indices)``. Validation gets
        ``floor(values * (1 - coeff) / 2)`` items and test receives whatever is left,
        so test may be one item larger than validation.
    """
    split = int(np.floor(coeff * values))
    split2 = int(np.floor(values * (1 - coeff) / 2))
    indices = list(range(values))
    if seed is None:
        np.random.shuffle(indices)
    else:
        # A dedicated generator keeps the split reproducible without touching global state.
        np.random.default_rng(seed).shuffle(indices)
    val_end = split + split2
    return indices[:split], indices[split:val_end], indices[val_end:]

# split_data() returns positions 0..n-1, while the frames below are selected by
# sat_id label. Translate positions into the actual ids so the split stays correct
# even when the ids are not exactly 0..n-1.
unique_sat_ids = data['sat_id'].unique()
train_indices, val_indices, test_indices = (
    [unique_sat_ids[position] for position in positions]
    for positions in split_data(len(unique_sat_ids))
)

In [None]:
# Select the rows of each subset by index label.
train_data, val_data, test_data = (
    data.loc[subset_labels]
    for subset_labels in (train_indices, val_indices, test_indices)
)

In [None]:
class Data_Sat(Dataset):
    """Dataset of fixed-size samples cut from per-satellite time series.

    ``data`` must have a ``sat_id`` column as its first column followed by 12 value
    columns. Values 0..5 are compared against values 6..11 in ``data_casting``; the
    original comments describe the latter as the simulation values.

    Usage: construct the dataset, then call ``generate_samples`` to fill
    ``self.samples``. Until then the dataset is empty.
    """

    def __init__(self, data, sequence_length=20, normalizer=None):
        """
        :param data: DataFrame with ``sat_id`` first and 12 value columns after it
        :param sequence_length: number of consecutive rows per sequence
        :param normalizer: optional object with ``fit_transform``; applied
            separately to each satellite's rows
        """
        self.sequence_length = sequence_length
        self.data = data
        self.satellite_dict = {}
        self.normalizer = normalizer
        # Defined up front so len(dataset) is 0, not an AttributeError,
        # before generate_samples() has been called.
        self.samples = []
        self.split_data()

    def split_data(self):
        """Cut every satellite's rows into sequences of ``sequence_length`` rows.

        Trailing rows that do not fill a whole sequence are discarded. The result
        for the k-th distinct satellite is stored in ``self.satellite_dict[k]`` as
        an array of shape ``[sequence_count, sequence_length, 12]``.
        """
        for ind, satellite in enumerate(self.data['sat_id'].unique()):
            # Drop the first column (sat_id) and keep the value columns.
            sat_data = self.data[self.data['sat_id'] == satellite].iloc[:, 1:]
            sequence_count = sat_data.shape[0] // self.sequence_length
            used_rows = sequence_count * self.sequence_length

            samples_sat = sat_data.iloc[:used_rows].values
            # Skip normalization for a satellite with no complete sequence:
            # there is nothing to transform and estimators may reject empty input.
            if self.normalizer is not None and used_rows > 0:
                samples_sat = self.normalizer.fit_transform(samples_sat)

            self.satellite_dict[ind] = samples_sat.reshape(sequence_count, self.sequence_length, 12)

    def generate_samples(self, max_sequence_count=10):
        """Build ``self.samples``: windows of ``max_sequence_count`` consecutive sequences.

        A satellite with more sequences than ``max_sequence_count`` is split into
        several evenly spaced (possibly overlapping) windows that are used during
        training as if they were different satellites. A satellite with exactly
        ``max_sequence_count`` sequences gives one window; shorter ones are skipped
        because they cannot fill a window.

        Every window is copied before ``data_casting`` shifts it. The windows
        overlap, so shifting views in place would corrupt neighbouring samples
        and the stored satellite data.
        """
        self.samples = []

        for sat in self.satellite_dict.values():
            sequence_count = sat.shape[0]
            if sequence_count < max_sequence_count:
                continue

            samples_count = math.ceil(sequence_count / max_sequence_count)
            if samples_count == 1:
                # Exactly one full window; the spacing formula would divide by zero.
                starts = [0]
            else:
                step = (sequence_count - max_sequence_count) / (samples_count - 1)
                starts = [round(step * sample) for sample in range(samples_count)]

            for start in starts:
                window = sat[start: start + max_sequence_count].copy()
                self.samples.append(self.data_casting(window))

    @staticmethod
    def data_casting(data):
        """Subtract the initial error from values 6..11, in place.

        The initial error of value ``i + 6`` is ``data[0, 0, i + 6] - data[0, 0, i]``,
        i.e. x_sim[0] - x[0] and likewise for the other five values. After the call
        the first time step of values 6..11 equals that of values 0..5.

        :param data: array indexed as ``[sequence, step, value]`` with at least 12 values
        :return: the same ``data`` object, modified in place
        """
        initial_error = data[0, 0, 6:12] - data[0, 0, :6]
        data[..., 6:12] -= initial_error
        return data

    def __len__(self):
        """
        Returns total number of samples
        """
        return len(self.samples)

    def __getitem__(self, index):
        """

        :param index: position of the sample in ``self.samples``
        :return: one-satellite sample [max_sequence_count, sequence_length, gt + in values]
                 as a float32 CPU tensor
        """
        return torch.as_tensor(self.samples[index], dtype=torch.float32)


In [None]:
import math
from tqdm import tqdm


def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None):
    """Run one pass over ``data`` and return the mean loss per processed sequence.

    Each dataset item is expected to be a tensor indexed as
    ``[max_sequence_count, sequence_length, values]``. The last six values of the
    final axis are fed to the model and the first six are the targets. Every
    sequence of a sample is processed as its own step, and in training mode an
    optimizer update follows each step.

    :param model: module called on the input slice
    :param loss_function: callable ``(prediction, target) -> scalar tensor``
    :param data: dataset accepted by ``torch.utils.data.DataLoader``
    :param batch_size: number of samples per batch
    :param optimizer: when given the epoch trains; when ``None`` it only
        evaluates (gradients disabled, ``model.train(False)``)
    :param name: label printed in front of the epoch summary
    :return: mean model loss over all processed sequences
    :raises ValueError: if ``data`` yields no sequences at all
    """
    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)
    loader = torch.utils.data.DataLoader(data, batch_size=batch_size)

    epoch_loss = 0.0
    epoch_SGP4_loss = 0.0
    # Count the loss evaluations directly. Deriving the divisor from the last batch
    # index and a dataset shape was off by one (and zero for a single batch) and
    # read the wrong axis.
    steps_count = 0

    with torch.autograd.set_grad_enabled(is_train):
        for sample in loader:
            # [batch, max_sequence_count, sequence_length, values]
            #   -> [max_sequence_count, sequence_length, batch, values]
            sample = sample.permute(1, 2, 0, 3)
            for sequence in sample:
                X_batch = sequence[..., 6:].to(device)
                y_batch = sequence[..., :6].to(device)

                prediction = model(X_batch)
                loss = loss_function(prediction, y_batch)
                # Baseline: error of the raw inputs against the targets.
                SGP4_loss = loss_function(X_batch, y_batch)

                epoch_loss += loss.item()
                epoch_SGP4_loss += SGP4_loss.item()
                steps_count += 1

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

    if steps_count == 0:
        raise ValueError('do_epoch received no sequences to process')

    epoch_loss /= steps_count
    epoch_SGP4_loss /= steps_count

    print(f'Epoch {name} - loss: {epoch_loss:.5f} , SGP4 loss: {epoch_SGP4_loss:.5f}')

    return epoch_loss


def fit(model, loss_function, optimizer, train_data, epochs_count=1, batch_size=1,
        val_data=None, val_batch_size=None):
    """Train the model, validating after every epoch when validation data is given.

    :param model: module to train
    :param loss_function: callable ``(prediction, target) -> scalar tensor``
    :param optimizer: optimizer used for the training passes
    :param train_data: training dataset
    :param epochs_count: number of epochs to run
    :param batch_size: training batch size
    :param val_data: optional validation dataset; skipped when ``None`` or empty
    :param val_batch_size: validation batch size, defaults to ``batch_size``
    :return: ``(train_history, val_history)`` - lists with one mean loss per epoch;
        ``val_history`` is empty when no validation was run
    """
    if val_data and val_batch_size is None:
        val_batch_size = batch_size

    # Keep the per-epoch losses instead of discarding them, so callers can plot
    # them or store them in a checkpoint.
    train_history = []
    val_history = []

    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        train_loss = do_epoch(model, loss_function, train_data, batch_size, optimizer, name_prefix + 'Train:')
        train_history.append(train_loss)

        if val_data:
            val_loss = do_epoch(model, loss_function, val_data, val_batch_size,
                                optimizer=None, name=name_prefix + '  Val:')
            val_history.append(val_loss)

    return train_history, val_history

In [None]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer applied to every time step.

    Input is passed to ``nn.LSTM`` with its default layout, i.e.
    ``[sequence_length, batch, input_dim]``; the output has shape
    ``[sequence_length, batch, output_dim]``.
    """

    def __init__(self, input_dim=6, output_dim=6, lstm_hidden_dim=28, lstm_layers_count=1):
        """
        :param input_dim: number of features per time step
        :param output_dim: number of predicted values per time step
        :param lstm_hidden_dim: size of the LSTM hidden state
        :param lstm_layers_count: number of stacked LSTM layers
        """
        super().__init__()
        # Honour input_dim / output_dim; they were previously accepted but ignored
        # in favour of a hard-coded 6.
        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers_count,
                            bias=True)

        self.linear = nn.Linear(lstm_hidden_dim, output_dim, bias=True)

    def forward(self, inputs):
        """Return the per-step prediction for ``inputs``; the LSTM state is discarded."""
        # Call the sub-modules themselves rather than .forward so registered hooks run.
        lstm_out, _ = self.lstm(inputs)
        return self.linear(lstm_out)


In [None]:
# Number of consecutive rows that make up one sequence.
sequence_duration = 20

# Training samples: windows of 100 sequences per sample.
train_dataset = Data_Sat(train_data, sequence_length=sequence_duration, normalizer=Normalizer())
train_dataset.generate_samples(max_sequence_count=100)

# Validation samples are built the same way, with their own normalizer instance.
val_dataset = Data_Sat(val_data, sequence_length=sequence_duration, normalizer=Normalizer())
val_dataset.generate_samples(max_sequence_count=100)

print('Samples count:', len(train_dataset))

# Model with default sizes, mean-squared-error loss and Adam with default settings.
model = LSTM().to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# Train for two epochs without a validation pass.
fit(
    model,
    loss_function,
    optimizer,
    train_dataset,
    epochs_count=2,
    batch_size=5,
)

In [None]:
# Train for two epochs, evaluating on the validation set after each one.
fit(
    model,
    loss_function,
    optimizer,
    train_dataset,
    epochs_count=2,
    batch_size=5,
    val_data=val_dataset,
)


