In [None]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
%matplotlib inline

from preprocessing import add_delta_time, Norm, split_data, split_folds, Data_Sat
from model_builder import LSTM, cross_validation, fit, load_model

In [None]:
# Pick the compute device and the matching FloatTensor constructor:
# the first CUDA GPU when torch can see one, otherwise the CPU.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor

# Detect Google Colab by probing for its `google.colab` package.
# Only ImportError is caught, so unrelated failures (or a keyboard interrupt)
# are no longer silently treated as "not running in Colab".
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False

In [None]:
# вывод информации о выданном с colab GPU
if is_in_colab:
    !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
    !pip install gputil
    !pip install psutil
    !pip install humanize
    import psutil
    import humanize
    import os
    import GPUtil as GPU
    GPUs = GPU.getGPUs()
    gpu = GPUs[0]
    def printm():
        process = psutil.Process(os.getpid())
        print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
        print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

    printm()

In [None]:
# Choose the folder that holds the data files: the local ./data/ directory,
# or - on Colab - a folder inside the Google Drive mounted at /content/drive.
if not is_in_colab:
    data_folder = r'./data/'
else:
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/IDAO_2020/'

In [None]:
# Data preparation: read train.csv (parsing `epoch` as datetimes), pass it through
# the project helper `add_delta_time` together with the column list below, and
# index the result by satellite id.
data = pd.read_csv(data_folder + 'train.csv', parse_dates=['epoch'])
columns = [
    'id', 'sat_id', 'delta_seconds',
    'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim',
    'x', 'y', 'z', 'Vx', 'Vy', 'Vz',
]
data_with_dt = add_delta_time(data, columns)
# drop=False keeps `sat_id` as a regular column in addition to being the index.
data_with_dt.set_index('sat_id', drop=False, inplace=True)


In [None]:
# Data normalization.
# `Norm` is a project helper; the second argument is presumably the list of
# columns to leave un-normalized (identifiers) - TODO confirm against preprocessing.py.
normalizer = Norm(data_with_dt, ['id', 'sat_id', ])
# `normalizer` is kept around: it is used again later to map predictions back (back_l2_norm).
norm_data = normalizer.l2_norm(data_with_dt)

In [None]:
# Data splitting
# Seed every RNG in play so a fresh "Restart & Run All" is repeatable:
# NumPy was already seeded here (presumably consumed by split_data / split_folds - confirm),
# and torch is seeded as well so that torch-side randomness (e.g. model weight
# initialisation further down) no longer varies from run to run.
np.random.seed(42)
torch.manual_seed(42)

# split_data receives the number of distinct satellites; split_folds cuts the
# training part into 2 folds for cross-validation.
train_indices, test_indices = split_data(len(data['sat_id'].unique()))
folds = split_folds(train_indices, 2)
# norm_data is indexed by sat_id, so .loc selects whole satellites.
test_data = norm_data.loc[test_indices]

In [None]:
# --- data settings ---
sequence_length = 100
max_sequence_count = 50

# --- training settings ---
batch_size = 10
epoch_count = 1
plot_draw = False

# --- optimizer settings ---
learning_rate = 1e-3
weight_decay = 0

# --- model settings ---
lstm_hidden_dim = 10
lstm_hidden_lauers_count = 1  # (sic) spelling kept so the global's name is unchanged
bidirectional = False
dropout = 0

# --- learning-rate scheduler settings ---
factor = 0.1
patience = 2
threshold = 1e-2

# Build the network and move it to the selected device.
model = LSTM(
    device,
    lstm_hidden_dim=lstm_hidden_dim,
    lstm_layers_count=lstm_hidden_lauers_count,
    bidirectional=bidirectional,
    dropout=dropout,
)
model = model.to(device)

loss_function = torch.nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# NOTE(review): mode='max' only makes sense if the quantity passed to scheduler.step()
# inside fit/cross_validation is a score to maximise rather than the MSE loss - confirm.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=factor,
    patience=patience,
    verbose=True,
    threshold=threshold,
)

In [None]:
# Run cross-validation over the prepared folds with the settings defined above;
# the call returns a training history and a validation history.
tr_hist, val_hist = cross_validation(
    model, norm_data, folds, loss_function,
    sequence_length, max_sequence_count,
    optimizer, scheduler,
    epochs_count=epoch_count,
    batch_size=batch_size,
    plot_draw=plot_draw,
)

In [None]:
# Summarise cross-validation: mean over everything in the train / validation histories.
print('Mean_train_score: ', np.mean(tr_hist), ' Mean_val_score: ', np.mean(val_hist))

In [None]:
# Build the dataset for the final training run from all training satellites.
train_dataset = Data_Sat(device, norm_data.loc[train_indices], sequence_length)
# The second positional argument (False) is a project-specific flag of
# generate_samples - TODO confirm its meaning in preprocessing.py.
train_dataset.generate_samples(max_sequence_count, False)

In [None]:
# Final training run on the whole training set (no validation dataset is supplied).
# Note: this rebinds `val_hist`, replacing the value returned by cross-validation.
train_hist, val_hist = fit(
    model,
    loss_function,
    batch_size=batch_size,
    epochs_count=epoch_count,
    optimizer=optimizer,
    scheduler=scheduler,
    train_dataset=train_dataset,
    val_dataset=None,
    plot_draw=False,
)

In [None]:
# Build the held-out dataset. The sequence cap now comes from the
# `max_sequence_count` setting instead of a hard-coded 50 that had to be kept
# in sync with the settings cell by hand.
test_dataset = Data_Sat(device, test_data, sequence_length)
test_dataset.generate_samples(max_sequence_count=max_sequence_count)

In [None]:
def smape(satellite_predicted_values, satellite_true_values):
    """
    Symmetric mean absolute percentage error: mean(|p - t| / (|p| + |t|)).

    All operations (subtraction, addition, division) are element-wise.
    An element where prediction and target are both zero contributes 0
    instead of the NaN that 0/0 would produce and spread through the mean.

    Args:
        satellite_predicted_values: tensor of predictions.
        satellite_true_values: tensor of targets, same shape as the predictions.

    Returns:
        Scalar tensor in [0, 1] for finite, non-empty inputs.
    """
    numerator = torch.abs(satellite_predicted_values - satellite_true_values)
    denominator = torch.abs(satellite_predicted_values) + torch.abs(satellite_true_values)
    # The denominator is zero only when both values are zero, in which case the
    # numerator is zero too; bumping the denominator to 1 there yields 0/1 == 0.
    safe_denominator = denominator + (denominator == 0)
    return torch.mean(numerator / safe_denominator)


In [None]:
def predict(model, sat_data):
    """
    Predict the real values for one satellite.

    Takes a model and data already split into sequences, and returns the
    predictions for all sequences stacked along the first axis.

    Args:
        model: network exposing `device`, `eval()`, `init_hidden(batch_size)` and a
            forward call that accepts a (sequence_length, 1, n_features) tensor.
        sat_data: array-like of shape (sequences_count, sequence_length, n_features).

    Returns:
        Tensor of shape (sequences_count * sequence_length, 6) on `model.device`,
        detached from the autograd graph.
    """
    sequences_count, sequence_length, _ = sat_data.shape
    result = torch.zeros((sequences_count*sequence_length, 6)).to(model.device)
    # Side effect: the model is left in evaluation mode.
    model.eval()
    # The hidden state is initialised once (batch size 1) before the loop; whether
    # it carries over between sequences depends on the model implementation.
    model.init_hidden(1)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i, seq in enumerate(sat_data):
            # Insert a batch axis of size 1. as_tensor accepts tensors and arrays alike
            # and removes the dependency on the notebook-global `FloatTensor`.
            inputs = torch.as_tensor(seq[:, None, :], dtype=torch.float32, device=model.device)
            predicted = model(inputs)

            predicted = predicted.view(sequence_length, -1).detach()
            result[i*sequence_length : (i+1)*sequence_length] = predicted
    return result

In [None]:
# Predict the held-out satellites and compute the score
metric = 0
test_predicts = []
for sat in test_dataset.satellite_dict:
    sat_data = test_dataset.satellite_dict[sat]
    # The first 7 features are the model inputs, the remaining ones the 6 targets.
    X = FloatTensor(sat_data[..., :7]).to(device)
    y = FloatTensor(sat_data[..., 7:]).view(-1, 6).to(device)
    # Rows whose targets are all zero are treated as empty and skipped.
    # The mask is per row: an element-wise `y != 0` mask would also drop a single
    # zero coordinate of a valid row and misalign (or break) the reshape to (-1, 6).
    valid_rows = (y != 0).any(dim=1)
    predicts = predict(model, X)[valid_rows]
    test_predicts.append(predicts.cpu().detach().numpy())
    metric += smape(predicts, y[valid_rows])

# Average SMAPE over satellites, then convert to a 0-100 score (higher is better).
metric /= len(test_dataset.satellite_dict)
score = (1-metric)*100
print(f'Test score: {float(score.cpu()):.2f}')

In [None]:
# Hand the per-satellite predictions to the dataset; `test_dataset.result` is read right after.
test_dataset.predict_to_df(test_predicts)
# Show the first rows after back_l2_norm - presumably the inverse of l2_norm, i.e. original units (confirm).
normalizer.back_l2_norm(test_dataset.result).head()

In [None]:
# First rows of the un-normalized data for the same test satellites, for visual comparison with the predictions.
data_with_dt.loc[test_indices].head()

In [None]:
# Checkpoint location: the file is placed directly inside the data folder
# (data_folder already ends with a path separator).
file_name = 'score_80_v1.model'
path = f'{data_folder}{file_name}'

In [None]:
# `save_model` is not among the names imported at the top of the notebook, so this
# cell raised NameError on a fresh kernel. It is imported here from the module that
# provides `load_model` - NOTE(review): assumes model_builder defines save_model; confirm.
from model_builder import save_model

# Persist the model together with optimizer/scheduler and the training histories.
save_model(path, model, optimizer, scheduler, train_hist, val_hist)