In [None]:
import pandas as pd
import numpy as np
import math 
import scipy.stats as st
from sklearn.preprocessing import Normalizer


import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [None]:
# Pick the compute device and import the tensor constructors that match it.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor

# The google.colab import succeeds only when that package is available, so a
# failed import is used as the "not running in Colab" signal. Only ImportError
# is caught so that unrelated failures are not silently swallowed.
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False

# In Colab the data is read from the mounted Google Drive, otherwise from ./data/.
if is_in_colab:
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/IDAO_2020/'
else:
    data_folder = r'./data/'
    

In [None]:
# Bash command that creates a directory on the mounted Google Drive for storing the data.
# Run it once after mounting the drive so that the folder does not have to be created manually.
# ! mkdir -p '/content/drive/My Drive/Colab/IDAO_2020/'

In [None]:
# Read train.csv from data_folder (parsing 'epoch' as dates) and drop the first
# two columns by position.
# NOTE(review): the rest of the notebook expects the remaining frame to be
# 'sat_id' followed by 12 numeric columns — confirm against the CSV header.
data = pd.read_csv(data_folder + 'train.csv', parse_dates=['epoch']).iloc[:,2:]

In [None]:
# Use the satellite id as the row index (in place) so that rows can be selected
# per satellite with data.loc[...]; the 'sat_id' column is still read afterwards.
data.set_index(keys=data['sat_id'], inplace=True)

In [None]:
def split_data(values, coeff=0.7, seed=None):
    """Randomly partition ``range(values)`` into train / validation / test lists.

    Parameters
    ----------
    values : int
        Number of items; the indices ``0 .. values - 1`` are shuffled and split.
    coeff : float, default 0.7
        Fraction of the indices assigned to the training part. Must lie in
        ``[0, 1]``.
    seed : int or None, default None
        When given, the shuffle uses a private ``np.random.RandomState(seed)``
        so the split is reproducible and the global NumPy RNG is untouched.
        When ``None`` the global NumPy RNG is used.

    Returns
    -------
    tuple of (list, list, list)
        ``(train_indices, val_indices, test_indices)``. The training part has
        ``floor(coeff * values)`` items, the validation part
        ``floor(values * (1 - coeff) / 2)`` items and the test part receives
        everything that is left.

    Raises
    ------
    ValueError
        If ``coeff`` is outside ``[0, 1]``.
    """
    if not 0 <= coeff <= 1:
        raise ValueError('coeff must be within [0, 1], got {}'.format(coeff))

    train_size = int(np.floor(coeff * values))
    val_size = int(np.floor(values * (1 - coeff) / 2))

    indices = list(range(values))
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]
    return train_indices, val_indices, test_indices

# Split at satellite level: split_data shuffles the positions 0..n-1, where n is
# the number of distinct satellites.
# NOTE(review): these positions are later used as index labels with data.loc,
# which assumes the sat_id values are exactly 0..n-1 — confirm against the data.
train_indices, val_indices, test_indices = split_data(len(data['sat_id'].unique()))

In [None]:
# Select the rows of each subset by looking the sampled ids up in the frame index.
train_data = data.loc[train_indices]
val_data = data.loc[val_indices]
test_data = data.loc[test_indices]

In [None]:
class Data_Sat(Dataset):
    """Dataset of fixed-length windows cut from per-satellite time series.

    The input frame must contain a ``sat_id`` column. For every satellite the
    first column is dropped by position and the remaining 12 columns are cut
    into consecutive sequences of ``sequence_duration`` rows (a trailing
    incomplete sequence is discarded). ``sample_generator`` then groups
    ``max_sequence_count`` consecutive sequences into one sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``sat_id`` column.
        NOTE(review): assumes ``sat_id`` is the first column and exactly 12
        numeric columns follow — confirm against the loader.
    sequence_duration : int, default 20
        Number of rows per sequence.
    normalizer : object or None, default None
        Optional transformer exposing ``fit_transform``; applied separately to
        the rows of every satellite.

    Attributes
    ----------
    satellite_dict : dict
        Maps the running position of a satellite (not its ``sat_id``) to an
        array of shape ``(sequence_count, sequence_duration, 12)``.
    samples : list
        Arrays of shape ``(max_sequence_count, sequence_duration, 12)``; empty
        until ``sample_generator`` is called.
    """

    # Width of a row after the first column is dropped, and the number of
    # leading columns that the trailing columns are aligned to in data_casting.
    N_COLUMNS = 12
    N_TARGETS = 6

    def __init__(self, data, sequence_duration=20, normalizer=None):
        self.sequence_duration = sequence_duration
        self.data = data
        self.satellite_dict = {}
        self.normalizer = normalizer
        # Created up front so len(dataset) works before sample_generator() runs.
        self.samples = []
        self.split_data()

    def split_data(self):
        """Fill ``satellite_dict`` with one ``(sequences, duration, 12)`` array per satellite."""
        for ind, satellite in enumerate(self.data['sat_id'].unique()):
            sat_data = self.data.query('sat_id==@satellite').iloc[:, 1:]
            sequence_count = sat_data.shape[0] // self.sequence_duration

            # Keep only the rows that form complete sequences.
            used_rows = sequence_count * self.sequence_duration
            samples_sat = sat_data.iloc[:used_rows].values
            if self.normalizer:
                samples_sat = self.normalizer.fit_transform(samples_sat)

            self.satellite_dict[ind] = samples_sat.reshape(
                sequence_count, self.sequence_duration, self.N_COLUMNS)

    def sample_generator(self, max_sequence_count=10):
        """Rebuild ``self.samples`` from windows of ``max_sequence_count`` sequences.

        Satellites with ``max_sequence_count`` sequences or fewer are skipped.
        For the others ``ceil(sequence_count / max_sequence_count)`` windows are
        taken at evenly spaced start positions from the first to the last
        possible start, so neighbouring windows may overlap.
        """
        self.samples = []

        for sat in self.satellite_dict.values():
            sequence_count = sat.shape[0]
            if sequence_count <= max_sequence_count:
                continue

            # sequence_count > max_sequence_count guarantees numb_samples >= 2,
            # so the division below never divides by zero.
            numb_samples = math.ceil(sequence_count / max_sequence_count)
            step = (sequence_count - max_sequence_count) / (numb_samples - 1)
            for sample in range(numb_samples):
                next_step = round(step * sample)
                window = sat[next_step: next_step + max_sequence_count]
                self.samples.append(self.data_casting(window))

    def data_casting(self, data):
        """Return a shifted copy of ``data`` whose columns 6..11 start at columns 0..5.

        Each of the columns 6..11 is offset by a constant so that its value at
        position ``[0, 0]`` equals the value of the matching column 0..5 there.
        The input is copied first: windows are views into ``satellite_dict`` and
        may overlap, so an in-place shift would corrupt the stored sequences
        and every sample sharing rows with this one.
        """
        data = data.copy()
        for i in range(self.N_TARGETS):
            data[..., i + self.N_TARGETS] -= data[0, 0, i + self.N_TARGETS] - data[0, 0, i]
        return data

    def __len__(self):
        '''
        Returns total number of samples
        '''
        return len(self.samples)

    def __getitem__(self, index):
        """Return sample ``index`` as a float32 CPU tensor."""
        return torch.tensor(self.samples[index], dtype=torch.float32)

# Cut the training satellites into 20-row sequences (normalised with a fresh
# Normalizer) and group them into samples of 100 consecutive sequences.
dataset = Data_Sat(train_data, sequence_duration=20, normalizer=Normalizer())
dataset.sample_generator(max_sequence_count=100)
print('Samples count:', len(dataset))

In [None]:
# Sanity check: show the tensor shape of one generated sample (index 1).
dataset[1].shape

In [None]:
import math
from tqdm import tqdm


def do_epoch(model, criterion, data, batch_size, optimizer=None, name=None):
    """Run one pass over ``data`` and return the summed loss.

    Every batch is permuted with ``permute(1, 2, 0, 3)`` and then iterated along
    its first axis; for each slice the last-axis columns ``6:`` are fed to the
    model and columns ``:6`` are used as the target.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train or evaluate.
    criterion : callable
        Loss called as ``criterion(prediction, target)``.
    data : iterable of torch.Tensor
        Batches with four dimensions.
        NOTE(review): presumably (batch, sequences, steps, 12) as produced by a
        DataLoader over Data_Sat — confirm with the caller.
    batch_size : int
        Unused; kept for interface compatibility.
    optimizer : torch.optim.Optimizer or None, default None
        When given the model is put in training mode and updated after every
        slice; when ``None`` the model is evaluated without gradients.
    name : str or None, default None
        Prefix shown in the progress bar.

    Returns
    -------
    float
        Sum of the loss values of all processed slices (``0`` when ``data`` is
        empty).
    """
    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    epoch_loss = 0

    # The bar advances once per batch, so its total is the number of batches.
    # Iterables without __len__ fall back to an open-ended bar.
    try:
        batches_count = len(data)
    except TypeError:
        batches_count = None

    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batches_count) as progress_bar:
            for sample in data:
                sample = sample.permute(1, 2, 0, 3)
                # Tracked separately so the description below never touches an
                # unbound variable when a batch yields no slices.
                last_loss = 0.0
                for sequence in sample:
                    X_batch = sequence[..., 6:].to(device)
                    y_batch = sequence[..., :6].to(device)

                    logits = model(X_batch)
                    loss = criterion(logits, y_batch)

                    last_loss = loss.item()
                    epoch_loss += last_loss

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                progress_bar.update()
                progress_bar.set_description('{:>5s} Loss = {:.5f}'.format(
                    name, last_loss)
                )

            progress_bar.set_description('{:>5s} Loss = {:.5f}'.format(
                name, epoch_loss)
            )

    return epoch_loss


def fit(model, criterion, optimizer, train_data, epochs_count=1, batch_size=1,
        val_data=None, val_batch_size=None):
    """Train ``model`` for ``epochs_count`` epochs, optionally validating after each.

    Parameters
    ----------
    model, criterion, optimizer
        Passed through to ``do_epoch``.
    train_data : iterable
        Training batches.
    epochs_count : int, default 1
        Number of passes over ``train_data``.
    batch_size : int, default 1
        Forwarded to ``do_epoch`` for the training pass.
    val_data : iterable or None, default None
        Validation batches; when given, one pass without an optimizer is run
        after every training epoch.
    val_batch_size : int or None, default None
        Forwarded to ``do_epoch`` for the validation pass; defaults to
        ``batch_size``.
    """
    if val_data is not None and val_batch_size is None:
        val_batch_size = batch_size

    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        do_epoch(model, criterion, train_data, batch_size, optimizer, name_prefix + 'Train:')

        if val_data is not None:
            # do_epoch returns a single number (the summed loss); unpacking it
            # into two names raised TypeError as soon as val_data was supplied.
            do_epoch(model, criterion, val_data, val_batch_size, None, name_prefix + '  Val:')

In [None]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer applied to every time step.

    Parameters
    ----------
    input_dim : int, default 6
        Number of features per time step fed to the LSTM.
    output_dim : int, default 6
        Number of values produced per time step by the linear layer.
    lstm_hidden_dim : int, default 28
        Hidden state size of the LSTM.
    lstm_layers_count : int, default 1
        Number of stacked LSTM layers.
    """

    def __init__(self, input_dim=6, output_dim=6, lstm_hidden_dim=28, lstm_layers_count=1):
        super().__init__()
        # input_dim / output_dim were previously ignored in favour of a
        # hard-coded 6; the defaults keep the old behaviour.
        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers_count,
                            bias=True)

        self.linear = nn.Linear(lstm_hidden_dim, output_dim, bias=True)

    def forward(self, inputs):
        """Map ``(seq_len, batch, input_dim)`` inputs to ``(seq_len, batch, output_dim)``.

        The LSTM is built without ``batch_first``, so the first axis is time.
        """
        # Call the sub-modules rather than .forward() so registered hooks run.
        lstm_out, _ = self.lstm(inputs)
        return self.linear(lstm_out)

In [None]:
# Assemble loss, network, optimiser and batch loader, then train for 20 epochs.
criterion = torch.nn.MSELoss()
model = LSTM().to(device)
optimizer = optim.Adam(model.parameters())
train_loader = torch.utils.data.DataLoader(dataset, batch_size=5)
fit(model, criterion, optimizer, train_loader, epochs_count=20)

In [None]:
sequence_duration = 20
val_dataset = Data_Sat(val_data, sequence_duration, Normalizer())
# scaler = StandardScaler()

# Evaluation only: switch the model to eval mode once and disable gradient
# tracking instead of detaching every result.
model.train(False)

# For every sequence of the validation satellite stored under key 1, compare
# the summed absolute error of the model output with that of the raw inputs
# (columns 6: taken as-is as a prediction of columns :6).
with torch.no_grad():
    for batch in val_dataset.satellite_dict[1]:
        batch = batch.reshape(sequence_duration, 1, -1)
        # Build the input directly on the model's device; the legacy
        # FloatTensor(ndarray) constructor does not take the device into account.
        X_batch = torch.as_tensor(batch[..., 6:], dtype=torch.float32, device=device)
        # Targets stay in NumPy; cast to float32 to match the model output dtype.
        y_batch = batch[..., :6].astype(np.float32).reshape(sequence_duration, -1)

        predict = model(X_batch).cpu().numpy().reshape(sequence_duration, -1)
        baseline = X_batch.cpu().numpy().reshape(sequence_duration, -1)

        with_model = predict - y_batch
        without_model = baseline - y_batch
        print(abs(with_model).sum())
        print(abs(without_model).sum())
        print("-"*40)
    
#     break
#     scaler.fit(predict.detach())
#     print(scaler.inverse_transform())