In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [0]:
# Select the compute device. FloatTensor is bound to the tensor type matching
# that device so later cells can build tensors without re-checking it.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor

# Detect Google Colab by probing for its helper package. Only ImportError is
# caught, so unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False
    

In [0]:
# Print information about the GPU allocated by Colab.
# Runs only when the notebook was detected to be running inside Colab.
if is_in_colab:
    # Shell magics: expose nvidia-smi on PATH and install the helper packages.
    !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
    !pip install gputil
    !pip install psutil
    !pip install humanize
    import psutil
    import humanize
    import os
    import GPUtil as GPU
    GPUs = GPU.getGPUs()
    # Only the first GPU reported by GPUtil is inspected.
    gpu = GPUs[0]
    def printm():
        """Print available host RAM, this process' RSS and the memory stats of `gpu`."""
        process = psutil.Process(os.getpid())
        print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
        print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

    printm()
    

In [0]:
# Local runs read from ./data/; on Colab the data lives on the mounted Google Drive.
data_folder = r'./data/'
if is_in_colab:
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/IDAO_2020/'

In [0]:
# Shell command that creates a folder on the mounted Google Drive for storing the data.
# Run it once after mounting the drive so the folder does not have to be created manually.
# ! mkdir -p '/content/drive/My Drive/Colab/IDAO_2020/'


In [0]:
def save_model(path, model, optimizer, loss_history, train_history, val_history):
    """
    Save a training checkpoint with ``torch.save``.

    :param path: destination accepted by ``torch.save`` (file path or file-like object)
    :param model: module whose ``state_dict()`` is stored
    :param optimizer: optimizer whose ``state_dict()`` is stored
    :param loss_history: sequence of loss values; its last element is stored as ``'loss'``
        (``None`` when the history is empty)
    :param train_history: per-epoch train history; its length is stored as ``'epoch'``
    :param val_history: per-epoch validation history
    """
    checkpoint = {
        # Number of finished epochs == number of recorded train entries.
        'epoch': len(train_history),
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Do not crash when saving before any loss has been recorded.
        'loss': loss_history[-1] if len(loss_history) > 0 else None,
        'loss_history': loss_history,
        'train_history': train_history,
        'val_history': val_history,
    }
    torch.save(checkpoint, path)
    print('successfully saved')
    
def load_model(path, model, optimizer, loss_history, train_history, val_history):
    """
    Restore a checkpoint written by ``save_model``.

    The model and optimizer are updated through ``load_state_dict``. The three
    history lists are updated IN PLACE (slice assignment), so the caller's lists
    receive the stored values; rebinding local names would be invisible to the
    caller. A history argument passed as ``None`` is replaced by a fresh list in
    the return value.

    :param path: source accepted by ``torch.load`` (file path or file-like object)
    :return: tuple ``(loss_history, train_history, val_history)``
    """
    # Load onto CPU first so a checkpoint saved on GPU can be opened on a
    # CPU-only machine; load_state_dict copies the values into the existing
    # parameters / optimizer state afterwards.
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    restored = []
    for target, key in ((loss_history, 'loss_history'),
                        (train_history, 'train_history'),
                        (val_history, 'val_history')):
        values = list(checkpoint[key])
        if target is not None:
            target[:] = values
            values = target
        restored.append(values)

    print('successfully loaded')
    return tuple(restored)

In [0]:
def add_delta_time(df, columns=None):
    """
    Add the time elapsed since the previous observation of the same satellite.

    ``df`` is sorted IN PLACE by ``['sat_id', 'epoch']`` and gains two columns:
    ``delta_time`` (timedelta) and ``delta_seconds`` (float, total seconds).
    The first observation of every satellite gets a zero delta.

    :param df: DataFrame with a ``sat_id`` column and a datetime ``epoch`` column
    :param columns: columns to return, in that order; ``None`` or empty returns all columns
    :return: ``df[columns]``
    """
    df.sort_values(by=['sat_id', 'epoch'], inplace=True)

    # After sorting, the previous row is the previous observation of the same
    # satellite unless the satellite id changes (or this is the very first row).
    same_satellite = df['sat_id'].eq(df['sat_id'].shift(1))
    delta_time = (df['epoch'] - df['epoch'].shift(1)).where(same_satellite, pd.Timedelta(0))

    df['delta_time'] = delta_time
    # total_seconds() keeps whole days; `.dt.seconds` would silently drop them.
    df['delta_seconds'] = delta_time.dt.total_seconds()

    if columns is None or len(columns) == 0:
        columns = df.columns
    return df[columns]

In [0]:
class Norm():
    """
    Normalizer.

    ``__init__`` stores the per-column mean, standard deviation and L2 norm of
    the given data; the other methods apply or revert a normalization using
    those stored statistics.
    """
    def __init__(self, df, ignore_column=None):
        """
        :param df: DataFrame the statistics are computed from
        :param ignore_column: column label(s) to leave untouched by every
            normalization (mean forced to 0, std and L2 norm forced to 1)
        """
        self.mean = df.mean()
        self.std = df.std()
        self.l2 = (df ** 2).sum(axis=0) ** 0.5
        if ignore_column:
            self.mean[ignore_column] = 0
            self.std[ignore_column] = 1
            self.l2[ignore_column] = 1

    @staticmethod
    def columns_check(columns, df_columns):
        """Return ``columns``, or ``df_columns`` when ``columns`` is None or empty."""
        # An explicit None/len test is used because the truth value of a
        # pandas Index is ambiguous.
        if columns is None or len(columns) == 0:
            return df_columns
        return columns

    def _as_frame(self, df, columns):
        """
        Resolve ``(frame, columns)`` for the inverse transforms.

        Non-DataFrame input (e.g. an array) is wrapped in a DataFrame, which
        requires ``columns``. Returns ``(None, None)`` after printing a message
        when neither is available.
        """
        if isinstance(df, pd.DataFrame):
            return df, self.columns_check(columns, df.columns)
        if columns is None or len(columns) == 0:
            print("df должен быть DataFrame или columns должен быть заполнен")
            return None, None
        return pd.DataFrame(data=df, columns=columns), columns

    def z_norm(self, df, columns=None):
        """Return ``(df - mean) / std`` for the selected columns."""
        columns = self.columns_check(columns, df.columns)
        return (df[columns] - self.mean[columns]) / self.std[columns]

    def l2_norm(self, df, columns=None):
        """Return the selected columns divided by their stored L2 norm."""
        columns = self.columns_check(columns, df.columns)
        return df[columns] / self.l2[columns]

    def back_z_norm(self, df, columns=None):
        """
        Revert ``z_norm``: return ``df * std + mean`` for the selected columns.

        ``df`` may be a DataFrame or array-like; array-like input requires
        ``columns``. Returns None (after printing a message) otherwise.
        """
        df, columns = self._as_frame(df, columns)
        if df is None:
            return None
        return df[columns] * self.std[columns] + self.mean[columns]

    def back_l2_norm(self, df, columns=None):
        """
        Revert ``l2_norm``: return ``df * l2`` for the selected columns.

        ``df`` may be a DataFrame or array-like; array-like input requires
        ``columns``. Returns None (after printing a message) otherwise.
        """
        df, columns = self._as_frame(df, columns)
        if df is None:
            return None
        return (df[columns] * self.l2[columns])

In [0]:
def split_data(values, coeff=0.9):
    """
    Randomly split the integers ``0 .. values - 1`` into two index lists.

    :param values: number of items to split
    :param coeff: fraction of items that goes to the first (train) list;
        the remainder goes to the second (validation) list
    :return: ``(train_indices, val_indices)``
    """
    boundary = int(np.floor(coeff * values))
    shuffled = list(range(values))
    # Uses the global NumPy RNG, so seed it beforehand for reproducibility.
    np.random.shuffle(shuffled)
    return shuffled[:boundary], shuffled[boundary:]


In [0]:
def split_folds(indices, n_folds):
    """
    Split a sequence of indices into ``n_folds`` contiguous, near-equal parts.

    Fold boundaries are computed with integer arithmetic, so the result always
    has exactly ``n_folds`` parts; accumulating a float step can otherwise
    produce an extra or a truncated fold through rounding error.

    :param indices: sliceable sequence (e.g. list) of indices
    :param n_folds: number of folds, must be >= 1
    :return: list of ``n_folds`` slices of ``indices`` (an empty list when
        ``indices`` is empty); folds may be empty when ``n_folds > len(indices)``
    :raises ValueError: if ``n_folds`` is smaller than 1
    """
    n_folds = int(n_folds)
    if n_folds < 1:
        raise ValueError('n_folds must be a positive integer')

    total = len(indices)
    if total == 0:
        return []

    bounds = [total * k // n_folds for k in range(n_folds + 1)]
    return [indices[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]


In [0]:
class Data_Sat(Dataset):
    """
    Dataset of fixed-shape satellite samples.

    ``split_data`` (called from ``__init__``) cuts each satellite's rows into
    sequences of ``sequence_length`` rows, stored in ``self.satellite_dict``.
    ``generate_samples`` must then be called to fill ``self.samples``, which is
    what ``__len__`` / ``__getitem__`` serve.
    """
    def __init__(self, data, sequence_length=20):
        """
        :param data: DataFrame whose FIRST column is ``sat_id``; all remaining
            columns are used as features
        :param sequence_length: number of rows per sequence
        """
        self.sequence_length = sequence_length
        self.data = data
        self.satellite_dict = {}
        # Defined up front so len(dataset) works before generate_samples().
        self.samples = []
        self.split_data()

    def split_data(self):
        """
        Split every satellite's rows into sequences of ``sequence_length`` rows
        and store them in ``self.satellite_dict`` under a running index, as an
        array of shape ``(sequence_count, sequence_length, n_features)``.
        The last sequence is zero-padded when the rows do not fill it.
        """
        sat_ids = self.data['sat_id']
        for ind, satellite in enumerate(sat_ids.unique()):
            # Positional boolean mask: works even when 'sat_id' is also the index name.
            sat_values = self.data[(sat_ids == satellite).values].iloc[:, 1:].values
            sequence_count = int(np.ceil(sat_values.shape[0] / self.sequence_length))

            samples_sat = np.zeros((sequence_count * self.sequence_length, sat_values.shape[1]))
            samples_sat[: sat_values.shape[0]] = sat_values

            self.satellite_dict[ind] = samples_sat.reshape(sequence_count, self.sequence_length, -1)

    def generate_samples(self, max_sequence_count=10, last_sequence=False):
        """
        Build ``self.samples``: windows of ``max_sequence_count`` consecutive
        sequences taken from each satellite, to be used as separate training
        samples (augmentation).

        A satellite contributes ``ceil(sequence_count / max_sequence_count)``
        evenly spaced (possibly overlapping) windows. Satellites with
        ``sequence_count <= max_sequence_count`` contribute NO samples.

        :param max_sequence_count: number of sequences per sample
        :param last_sequence: when False, the last sequence of each satellite
            (the one that may contain zero padding) is excluded
        """
        self.samples = []

        for sat in self.satellite_dict.values():
            sequence_count = sat.shape[0]
            if not last_sequence:
                sequence_count -= 1
            if sequence_count <= max_sequence_count:
                continue

            # Integer ceil-division; avoids depending on a later `import math` cell.
            samples_count = -(-sequence_count // max_sequence_count)
            step = (sequence_count - max_sequence_count) / (samples_count - 1)
            for sample in range(samples_count):
                next_step = round(step * sample)
                window = sat[next_step: next_step + max_sequence_count]
                self.samples.append(self.data_casting(window))

    @staticmethod
    def data_casting(data):
        """
        Return a COPY of ``data`` in which, for i in 1..6, feature ``i + 6`` is
        shifted by a constant so that its first value equals the first value of
        feature ``i`` (the initial offset between the two is removed).

        The input is not modified: windows are views into ``satellite_dict``
        and may overlap, so shifting in place would corrupt both the stored
        sequences and previously generated samples.

        NOTE(review): the original comment says the offset is subtracted from
        the simulated values; which of the two feature groups is the simulation
        depends on the column order of the input frame - confirm.
        """
        data = np.array(data, copy=True)
        for i in range(1, 7):
            data[..., i + 6] -= data[0, 0, i + 6] - data[0, 0, i]
        return data

    def __len__(self):
        """
        Returns total number of samples
        """
        return len(self.samples)

    def __getitem__(self, index):
        """

        :param index: position in ``self.samples``
        :return: one-satellite sample [max_sequence_count, sequence_length, gt + in values]
        """
        return FloatTensor(self.samples[index])
    

In [0]:
def smape(satellite_predicted_values, satellite_true_values):
    """
    Symmetric mean absolute percentage error:
    ``mean(|pred - true| / (|pred| + |true|))`` computed element-wise.

    Elements where both the prediction and the true value are zero contribute 0
    instead of NaN (0 / 0), so a single such element no longer turns the whole
    mean into NaN.

    :return: 0-dim tensor in ``[0, 1]``
    """
    abs_error = torch.abs(satellite_predicted_values - satellite_true_values)
    scale = torch.abs(satellite_predicted_values) + torch.abs(satellite_true_values)
    # Where scale == 0 the error is 0 too, so dividing by 1 yields the desired 0.
    safe_scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    return torch.mean(abs_error / safe_scale)

In [0]:
import math
from tqdm import tqdm


def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None, ):
    """
    Run one pass over ``data``; trains when ``optimizer`` is given, otherwise
    only evaluates (gradients disabled).

    Each item of ``data`` is indexed as
    [max_sequence_count, sequence_length, features]. For every batch the
    sequences are fed to the model one after another, carrying the (detached)
    hidden state from one sequence to the next; in training mode an optimizer
    step is taken after every sequence. Features ``[..., :7]`` are the model
    input and ``[..., 7:]`` the target.

    :param model: module exposing ``init_hidden(batch_size)`` and
        ``model(X, h, c) -> (prediction, (h, c))``
    :param loss_function: callable ``(prediction, target) -> loss tensor``
    :param data: dataset of samples as described above; must not be empty
    :param batch_size: DataLoader batch size
    :param optimizer: optimizer, or None for evaluation
    :param name: prefix shown in the progress bar
    :return: epoch score ``(1 - mean SMAPE) * 100`` as a float
    :raises ValueError: if ``data`` is empty
    """
    if len(data) == 0:
        raise ValueError('do_epoch received an empty dataset')

    epoch_loss = 0.0
    epoch_SGP4_loss = 0.0
    epoch_smape = 0.0
    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)
    loader = torch.utils.data.DataLoader(data, batch_size=batch_size)
    max_sequence_count = data[0].shape[0]
    batch_count = len(loader)
    # Number of (batch, sequence) steps the accumulated metrics are averaged over.
    steps_count = batch_count * max_sequence_count

    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batch_count) as progress_bar:
            for sample in loader:
                sample = sample.permute(1, 2, 0, 3)  # [max_sequence_count, sequence_length,  batch, gt + in values]
                h, c = model.init_hidden(sample.shape[2])
                for sequence in sample:
                    X_batch, y_batch = (sequence[..., :7]).to(device), (sequence[..., 7:]).to(device)

                    prediction, (h_1, c_1) = model(X_batch, h, c)

                    loss = loss_function(prediction, y_batch)
                    # Baseline: loss of the raw input features 1.. against the target.
                    SGP4_loss = loss_function(X_batch[..., 1:], y_batch)

                    # .item() keeps plain floats instead of accumulating tensors.
                    epoch_smape += smape(prediction.detach(), y_batch.detach()).item()
                    epoch_loss += loss.item()
                    epoch_SGP4_loss += SGP4_loss.item()

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    # Carry the state to the next sequence without its graph.
                    h, c = h_1.detach(), c_1.detach()
                progress_bar.update()
                progress_bar.set_description('{:>5s} Loss = {:.5f}'.format(
                    name, loss.item())
                )

            epoch_loss /= steps_count
            epoch_SGP4_loss /= steps_count
            epoch_smape /= steps_count
            score = (1 - epoch_smape) * 100

            # A perfect baseline would otherwise raise ZeroDivisionError.
            if epoch_SGP4_loss:
                loss_comparison = epoch_loss / epoch_SGP4_loss
            else:
                loss_comparison = float('nan')

            progress_bar.set_description(f'Epoch {name} - loss compar: {loss_comparison:.2f}, '
                                         f'score: {score:.2f}, loss: {epoch_loss:.5f}')

    return float(score)


def fit(model, data, folds, loss_function, default_state, optimizer=None, 
        epochs_count=1, batch_size=1, plot_draw=False):
    """
    Train the model with cross-validation, validating after every epoch.

    For every fold the optimizer is restored from ``default_state``, a new LR
    scheduler is created and the parameters of the model's direct children are
    re-initialised. The fold is used for validation and all other folds for
    training.

    Relies on the module-level ``sequence_length`` and ``max_sequence_count``
    settings when building the datasets.

    :param model: model to train (re-initialised on every fold)
    :param data: DataFrame indexed by the ids contained in ``folds``
    :param folds: list of id lists, one per fold
    :param loss_function: callable ``(prediction, target) -> loss tensor``
    :param default_state: optimizer ``state_dict`` restored at the start of each fold
    :param optimizer: optimizer used for training; required
    :param epochs_count: number of epochs per fold
    :param batch_size: DataLoader batch size
    :param plot_draw: draw the train/validation curves after each fold
    :return: ``(fold_train_history, fold_val_history)`` - the last-epoch train
        and validation score of every fold (NaN when none was recorded)
    :raises ValueError: if ``optimizer`` is None
    """
    if optimizer is None:
        raise ValueError('fit requires an optimizer')

    fold_train_history = []
    fold_val_history = []
    for j, fold in enumerate(folds):

        # Restore the optimizer to its initial state
        optimizer.load_state_dict(default_state)

        # The scheduler is re-created on every fold. It is stepped with the
        # validation SCORE (higher is better), hence mode='max'.
        scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.8, patience=10, threshold=1e-1)

        # Reset the model parameters on every fold
        for child_name, module in model.named_children():
            if hasattr(module, 'reset_parameters'):
                print('resetting ', child_name)
                module.reset_parameters()

        print('Fold: ', j+1, '\n')

        val_data = data.loc[fold]
        val_dataset = Data_Sat(val_data, sequence_length)
        val_dataset.generate_samples(max_sequence_count=max_sequence_count,  last_sequence=False)

        # Folds are selected by position, not by comparing their contents.
        train_ids = [index for k, nfold in enumerate(folds) if k != j for index in nfold]
        train_data = data.loc[train_ids]
        train_dataset = Data_Sat(train_data, sequence_length)
        train_dataset.generate_samples(max_sequence_count=max_sequence_count, last_sequence=False)

        has_train = len(train_dataset) > 0
        has_val = len(val_dataset) > 0

        train_history = []
        val_history = []

        for epoch in range(epochs_count):

            for param_group in optimizer.param_groups:
                print('\nLR: ', param_group['lr'])

            name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
            if has_train:
                epoch_train_score = do_epoch(model, loss_function, train_dataset, batch_size,
                                             optimizer, name_prefix + 'Train:')
                train_history.append(epoch_train_score)

            if has_val:
                stage_name = '  Val:' if has_train else ' Test:'
                epoch_val_score = do_epoch(model, loss_function, val_dataset, batch_size,
                                           optimizer=None, name=name_prefix + stage_name)
                val_history.append(epoch_val_score)
                scheduler.step(epoch_val_score)

        if plot_draw:
            draw_plot(train_history, val_history)

        # Histories are empty when a dataset produced no samples or epochs_count == 0.
        fold_val_history.append(val_history[-1] if val_history else float('nan'))
        fold_train_history.append(train_history[-1] if train_history else float('nan'))
    return fold_train_history, fold_val_history

In [0]:
def draw_plot(train_loss_history, val_loss_history):
    """
    Draw the two histories as 'Train' and 'Val' lines on a single line plot
    (x axis: position in the history, labelled "Epoch").
    """
    histories = pd.DataFrame(
        data=[train_loss_history, val_loss_history],
        index=['Train', 'Val'],
    ).T

    plt.figure(figsize=(15, 6))
    sns.set(style='darkgrid')
    sns.lineplot(data=histories, markers=["o", "o"], palette='bright')

    plt.title("Line Plot", fontsize=20)
    plt.xlabel("Epoch", fontsize=15)
    plt.ylabel("Loss", fontsize=15)
    plt.show()

In [0]:
class LSTM(nn.Module):
    """
    LSTM followed by a linear layer applied to every time step.

    Inputs are laid out as [sequence_length, batch, input_dim]
    (``nn.LSTM`` default, ``batch_first=False``).
    """
    def __init__(self, input_dim=7, output_dim=6, lstm_hidden_dim=20, 
                 lstm_layers_count=1, bidirectional=False, dropout=0):
        """
        :param input_dim: number of input features per time step
        :param output_dim: number of predicted values per time step
        :param lstm_hidden_dim: LSTM hidden size
        :param lstm_layers_count: number of stacked LSTM layers
        :param bidirectional: use a bidirectional LSTM
        :param dropout: dropout between LSTM layers
        """
        super().__init__()

        self.input_dim = input_dim
        self.lstm_layers_count = lstm_layers_count
        self.lstm_hidden_dim = lstm_hidden_dim
        # Stored on the instance: init_hidden must not depend on a notebook global.
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1

        self.lstm = nn.LSTM(input_size=self.input_dim,
                            hidden_size=self.lstm_hidden_dim,
                            num_layers=self.lstm_layers_count,
                            bidirectional=bidirectional,
                            bias=True,
                            dropout=dropout
                            )

        # A bidirectional LSTM emits both directions concatenated per time step.
        self.linear = nn.Linear(lstm_hidden_dim * self.num_directions, output_dim, bias=True)

    def init_hidden(self, batch_size):
        """
        Return zero-initialised ``(h, c)`` of shape
        [layers * directions, batch_size, hidden_dim], created on the same
        device and with the same dtype as the model parameters.
        """
        reference = next(self.parameters())
        shape = (self.lstm_layers_count * self.num_directions, batch_size, self.lstm_hidden_dim)
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, inputs, h, c):
        """
        :param inputs: tensor [sequence_length, batch, input_dim]
        :param h: hidden state as returned by ``init_hidden`` or a previous call
        :param c: cell state as returned by ``init_hidden`` or a previous call
        :return: ``(output [sequence_length, batch, output_dim], (h, c))``
        """
        lstm_out, (h_1, c_1) = self.lstm(inputs, (h, c))
        linear_out = self.linear(lstm_out)

        return linear_out, (h_1, c_1)

In [0]:
def predict(model, sat_data):
    """
    Predict the real values for one satellite.

    The sequences are fed to the model one after another with batch size 1,
    carrying the hidden state from one sequence to the next. Runs in eval mode
    with gradients disabled.

    :param model: module exposing ``init_hidden(batch_size)`` and
        ``model(inputs, h, c) -> (prediction, (h, c))``
    :param sat_data: tensor or array of shape
        (sequences_count, sequence_length, n_input_features)
    :return: tensor of shape (sequences_count * sequence_length, n_outputs)
        on the model's device
    """
    sequences_count, sequence_length, _ = sat_data.shape

    # Run on whatever device the model lives on (CPU for parameter-less models).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    sequences = torch.as_tensor(sat_data, dtype=torch.float32).to(target_device)

    model.eval()
    h, c = model.init_hidden(1)
    outputs = []
    # Inference only: no autograd graph needs to be kept.
    with torch.no_grad():
        for seq in sequences:
            predicted, (h, c) = model(seq[:, None, :], h, c)
            outputs.append(predicted.reshape(sequence_length, -1))

    if not outputs:
        return torch.zeros((0, 6), device=target_device)
    return torch.cat(outputs, dim=0)

In [0]:
# Data preparation: load the training set, add the per-satellite time delta and
# index the rows by satellite id (the id is kept as a column too).
columns = [
    'sat_id', 'delta_seconds',
    'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim',
    'x', 'y', 'z', 'Vx', 'Vy', 'Vz',
]
data = pd.read_csv(data_folder + 'train.csv', parse_dates=['epoch'])
data_with_dt = add_delta_time(data, columns).set_index(keys='sat_id', drop=False)


In [0]:
# Data normalization: L2-normalize every column except the satellite id.
normalizer = Norm(df=data_with_dt, ignore_column=['sat_id'])
norm_data = normalizer.l2_norm(df=data_with_dt)

In [0]:
# Data splitting: hold out test satellites, split the rest into CV folds.
np.random.seed(42)

# split_data shuffles the positions 0..n-1; map them onto the real satellite
# ids so the split stays valid when the ids are not a contiguous 0-based range.
sat_ids = sorted(data['sat_id'].unique().tolist())
train_positions, test_positions = split_data(len(sat_ids))
train_indices = [sat_ids[position] for position in train_positions]
test_indices = [sat_ids[position] for position in test_positions]

folds = split_folds(train_indices, 5)
test_data = norm_data.loc[test_indices]

In [0]:
# data settings
sequence_length = 50
max_sequence_count = 50

# train settings
batch_size = 5
epoch_count = 2
plot_draw = True

# optimizer settings
learning_rate = 1e-4
weight_decay = 0

# model settings
lstm_hidden_dim = 50
lstm_hidden_layers_count = 1
# Backward-compatible alias for the original (misspelled) name.
lstm_hidden_lauers_count = lstm_hidden_layers_count
bidirectional = False
dropout = 0

# Seed torch as well, so that weight initialisation is reproducible.
torch.manual_seed(42)

model = LSTM(lstm_hidden_dim=lstm_hidden_dim,
             lstm_layers_count=lstm_hidden_layers_count,
             bidirectional=bidirectional,
             dropout=dropout,
            ).to(device)

loss_function = torch.nn.MSELoss()
optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate, 
                        weight_decay=weight_decay
                    )
# Snapshot of the untouched optimizer, restored by fit() at the start of each fold.
# TODO: decide whether the optimizer should be reset on every fold.
default_state = optimizer.state_dict()

In [0]:
# Cross-validated training; returns the last-epoch scores of every fold.
tr_hist, val_hist = fit(
    model=model,
    data=norm_data,
    folds=folds,
    loss_function=loss_function,
    default_state=default_state,
    optimizer=optimizer,
    epochs_count=epoch_count,
    batch_size=batch_size,
    plot_draw=plot_draw,
)

In [0]:
# Average of the per-fold last-epoch scores.
print(
    'Mean_train_score: ', np.mean(tr_hist),
    ' Mean_val_score: ', np.mean(val_hist),
)

In [0]:
# Predict on the held-out test satellites and compute the score

test_dataset = Data_Sat(test_data, sequence_length)
test_dataset.generate_samples(max_sequence_count=max_sequence_count)

metric = 0

with torch.no_grad():
    for sat_data in test_dataset.satellite_dict.values():
        X = FloatTensor(sat_data[..., :7]).to(device)
        y = FloatTensor(sat_data[..., 7:]).view(-1, 6).to(device)
        # Rows consisting only of zeros are padding. Masking whole rows keeps
        # predictions and targets aligned even if a real target component is 0.
        real_rows = (y != 0).any(dim=1)
        predicts = predict(model, X)[real_rows]
        metric += smape(predicts, y[real_rows])

metric /= len(test_dataset.satellite_dict)
score = (1 - metric) * 100
print('Test score: ', int(score.cpu()))

In [0]:
#fit(model, loss_function, val_data=test_dataset)
#test_submit = pd.read_csv('data/Track 1/test.csv')