In [58]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from fastcore.test import test, operator

In [69]:
from pyproteonet.simulation.missing_values import simulate_mnars_thresholding, simulate_mcars
from pyproteonet.visualization import plot_hist
from pyproteonet.simulation.sampling import draw_normal_log_space
from pyproteonet.processing.aggregation import neighbor_sum
from pyproteonet.processing.dataset_transforms import normalize, logarithmize
from pyproteonet.processing.masking import train_test_non_missing_no_overlap_iterable
from pyproteonet.predictors import GnnPredictor
from pyproteonet.dgl.gnn_architectures import GAT
from pyproteonet.lightning.console_logger import ConsoleLogger

# Load Real-World Dataset as Template

In [61]:
from test_utils import load_maxlfq_benchmark

In [62]:
# Load the MaxLFQ benchmark through the test helper. Later cells read its
# `molecule_set` and `samples` attributes as the template for the simulation.
maxlfq_benchmark = load_maxlfq_benchmark()

In [63]:
# Parameters passed as log_mu / log_sigma to draw_normal_log_space further down.
# NOTE(review): presumably estimated from the benchmark's abundances; the
# provenance of these constants is not shown here, so confirm it.
log_mu = 0.05647178595714227
log_sigma = 2.519063763272205

# Simulate Simple Data without any Errors

In [64]:
# Sample the 'abundance_gt' column for protein groups on the benchmark's molecule
# set, requesting as many samples as the benchmark dataset contains.
ds = draw_normal_log_space(
    molecule_set=maxlfq_benchmark.molecule_set,
    log_mu=log_mu,
    log_sigma=log_sigma,
    num_samples=len(maxlfq_benchmark.samples),
    molecule='protein_group',
    column='abundance_gt',
)

# Aggregate protein-group 'abundance_gt' into the peptide 'abundance' column via
# the 'protein_group' mapping, writing the result into `ds` itself (inplace=True).
# only_unique=False: presumably shared peptides are included too (confirm against
# neighbor_sum's documentation).
neighbor_sum(
    ds,
    input_molecule='protein_group',
    column='abundance_gt',
    mapping='protein_group',
    result_molecule='peptide',
    result_column='abundance',
    only_unique=False,
    inplace=True,
)

In [66]:
# Logarithmize first, then normalize, to obtain the dataset the GNN trains on.
ds_log = logarithmize(ds)
ds_gnn = normalize(ds_log)

# Build train/test masked datasets over the peptide 'abundance' column.
train_mds, test_mds = train_test_non_missing_no_overlap_iterable(
    dataset=ds_gnn,
    train_frac=0.1,
    test_frac=0.2,
    molecule='peptide',
    column='abundance',
)

# The console logger collects validation metrics in `logger.logs`, which the
# final check reads.
logger = ConsoleLogger()

gat_model = GAT(in_dim=3, hidden_dim=40, out_dim=1, num_heads=20)
gnn_predictor = GnnPredictor(
    mapping='protein_group',
    value_columns=['abundance'],
    molecule_columns=[],
    target_column='abundance',
    model=gat_model,
    bidirectional_graph=True,
    missing_substitute_value=0.0,
    logger=logger,
)

# NOTE(review): the saved output of this cell reports `max_epochs=3`, while the
# code passes max_epochs=4; re-run the cell to refresh the recorded output.
gnn_predictor.fit(train_mds=train_mds, test_mds=test_mds, max_epochs=4)

# The last logged validation R^2 must exceed 0.9.
# NOTE(review): no random seed is set in the visible cells, so this threshold
# check depends on a stochastic training run.
last_val_r2 = logger.logs['val_r2'][-1]
test(last_val_r2, 0.9, operator.gt)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params
-------------------------------
0 | model | GAT  | 326 K 
-------------------------------
326 K     Trainable params
0         Non-trainable params
326 K     Total params
1.306     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

step5: val_loss:2.287769317626953 || val_MAE:1.212532639503479 || val_r2:0.011150834150612354 || epoch:0 || 


Validation: 0it [00:00, ?it/s]

step11: val_loss:0.42071810364723206 || val_MAE:0.5423712134361267 || val_r2:0.80494624376297 || epoch:1 || 


Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


step17: val_loss:0.1773095279932022 || val_MAE:0.3163752555847168 || val_r2:0.9200934767723083 || epoch:2 || 
