In [1]:
# %connect_info
%qtconsole

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from graphgym.custom_graphgym.loader.protein import ProteinDataset
import matplotlib.pyplot as plt
import os

In [3]:
os.chdir('/Users/cgu3/Documents/Grape-Pi')

Import data from csv fies

In [4]:

# dataset = ProteinDataset("data/yeast-ORBI", numeric_columns=['protein probability'], label_column=None)
# dataset = ProteinBatchDataset("data/yeast-LCQ", numeric_params=['protein probability'])
# dataset = ProteinBatchDataset("data/yeast-ORBI-mRNA", numeric_params=['protein probability', 'mRNA(M)'])
# dataset = ProteinBatchDataset("data/yeast-LCQ-mRNA", numeric_params=['protein probability', 'mRNA(M)'])
# dataset = ProteinDataset("data/single-soft-label", numeric_columns=['protein_probability', 'mRNA_TPM'], label_column=None)
protein_dataset = ProteinDataset("data/single", numeric_columns=['protein_probability', 'mRNA_TPM'], label_column=None)

In [5]:
protein_file = [f for f in os.listdir('data/single/raw/protein') if f.endswith('.csv')][0]
df = pd.read_csv(os.path.join('data/single/raw/protein', protein_file))
ids = df['protein.Accession'].values
# df = pd.read_csv('/Users/cgu3/Documents/data/SG_combined_protein_0982_with_protein_probability.csv')
df

Unnamed: 0,protein.Accession,protein_probability,protein_probability_soft_label,hard_label,gene_symbol,mRNA_TPM
0,A0A075B6H7,0.700122,0.267383,0,IGKV3-7,0.000000
1,A0A075B6H8,0.286947,0.056265,0,IGKV1D-42,0.000000
2,A0A075B6L6,0.943529,0.140940,0,TRBV7-3,0.000000
3,A0A075B6N1,0.289680,0.153119,0,TRBV19,0.000000
4,A0A075B6N2,0.259341,0.128836,0,TRBV20-1,0.370502
...,...,...,...,...,...,...
12532,Q9UI54,0.000000,0.000000,0,0,0.000000
12533,Q9Y3F1,0.000000,0.197423,0,0,0.000000
12534,Q9Y6C7,0.000000,0.093179,0,LINC00312,0.000000
12535,Q9Y6Z2,0.000000,0.000000,0,LINC01558,0.053100


In [6]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, labels, ids, train_mask=None, val_mask=None, test_mask=None):
        self.data = data
        self.labels = labels
        self.ids = ids

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample_data = self.data[index]
        sample_label = self.labels[index]
        sample_id = self.ids[index]
        return sample_data, sample_label, sample_id


In [7]:
from torch.utils.data import Dataset, Subset
dataset = CustomDataset(protein_dataset.x, protein_dataset.y, ids)

train_indices = torch.nonzero(protein_dataset.train_mask).squeeze().tolist()
train_dataset = Subset(dataset, train_indices)

val_indices = torch.nonzero(protein_dataset.val_mask).squeeze().tolist()
val_dataset = Subset(dataset, val_indices)

test_indices = torch.nonzero(protein_dataset.test_mask).squeeze().tolist()
test_dataset = Subset(dataset, test_indices)


In [8]:
from torch.utils.data import DataLoader
torch.manual_seed(12345)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

<torch._C.Generator at 0x126ed6c30>

In [9]:
import os
from torch import optim, nn, utils, Tensor
import pytorch_lightning as pl
from torchmetrics.classification import BinaryAUROC, BinaryAccuracy, BinaryF1Score
import torch


# define the LightningModule
class MLP(pl.LightningModule):
    def __init__(self, num_features, hidden_channels, num_classes, criterion):
        super().__init__()
        torch.manual_seed(12345)
        self.lin1 = nn.Linear(num_features, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, num_classes)
        # self.lin3 = nn.Linear(hidden_channels, num_classes)
        self.criterion = criterion
        self.val_auroc = BinaryAUROC()
        self.test_auroc = BinaryAUROC()
        self.val_accuracy = BinaryAccuracy()
        self.test_accuracy = BinaryAccuracy()
        self.val_F1 = BinaryF1Score()
        self.test_F1 = BinaryF1Score()

    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        x = self.lin2(x)
        return x

    def get_auc(self, out, target):
        return self.auroc(out, target)

    def training_step(self, batch, batch_idx):
        x, y, id = batch
        logits = self(x).squeeze(-1)  # Perform a single forward pass.

        loss = self.criterion(logits, y)  # Compute the loss solely based on the training nodes.
        values = {"loss": loss}
        self.log_dict(values, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y, ids = batch
        logits = self(x).squeeze(-1)  # Perform a single forward pass.
        self.val_accuracy.update(logits, y)
        loss = self.criterion(logits, y)  # Compute the loss solely based on the training nodes.
        self.val_auroc.update(logits, y)
        self.val_F1.update(logits, y)
        values = {"val_loss": loss, "val_acc": self.val_accuracy, "val_auroc": self.val_auroc, "val_F1": self.val_F1}
        self.log_dict(values, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y, ids = batch
        logits = self(x).squeeze(-1)
        self.test_accuracy.update(logits, y)
        loss = self.criterion(logits, y)
        self.test_auroc.update(logits, y)
        self.test_F1.update(logits, y)
        values = {"test_loss": loss, "test_acc": self.test_accuracy, "test_auroc": self.test_auroc, "test_F1": self.test_F1}
        self.log_dict(values, prog_bar=True, on_step=False, on_epoch=True)
        return loss
    
    def predict_step(self, batch, batch_idx):
        x, y, ids = batch
        ids = [item for sublist in ids for item in sublist]
        logits = self(x).squeeze(-1)
        pred_prob = torch.nn.functional.sigmoid(logits)
        return (ids, x[:, 0], x[:, 1], pred_prob, y)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer



In [10]:
# init the autoencoder
MLP_model = MLP(1, 1, 1, criterion = torch.nn.BCEWithLogitsLoss())
# MLP_model = MLP(dataset.num_node_features, 10, 2, criterion = torch.nn.CrossEntropyLoss())

In [11]:
import sys
from pytorch_lightning.callbacks import TQDMProgressBar


class MyProgressBar(TQDMProgressBar):
    def init_validation_tqdm(self):
        bar = super().init_validation_tqdm()
        if not sys.stdout.isatty():
            bar.disable = True
        return bar

    def init_predict_tqdm(self):
        bar = super().init_predict_tqdm()
        if not sys.stdout.isatty():
            bar.disable = True
        return bar

    def init_test_tqdm(self):
        bar = super().init_test_tqdm()
        if not sys.stdout.isatty():
            bar.disable = True
        return bar



In [12]:
import pytorch_lightning as pl
trainer = pl.Trainer(limit_train_batches=100, max_epochs=100, enable_progress_bar=False)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model=MLP_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


  | Name          | Type              | Params
----------------------------------------------------
0 | lin1          | Linear            | 2     
1 | lin2          | Linear            | 2     
2 | criterion     | BCEWithLogitsLoss | 0     
3 | val_auroc     | BinaryAUROC       | 0     
4 | test_auroc    | BinaryAUROC       | 0     
5 | val_accuracy  | BinaryAccuracy    | 0     
6 | test_accuracy | BinaryAccuracy    | 0     
7 | val_F1        | BinaryF1Score     | 0     
8 | test_F1       | BinaryF1Score     | 0     
----------------------------------------------------
4         Trainable params
0         Non-trainable params
4         Total params
0.000     Total estimated model params size (MB)


In [None]:
trainer.test(model=MLP_model, dataloaders=test_dataloader)

In [200]:
out = trainer.predict(MLP_model, dataloaders=test_dataloader)
out

/Users/cgu3/anaconda3/envs/grape-pi/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


IndexError: index 1 is out of bounds for dimension 1 with size 1

In [281]:
accession, raw_prob, mRNA, pred_prob, soft_label = zip(*out)
accession = [item for sublist in accession for item in sublist]
raw_prob = [item.item() for sublist in raw_prob for item in sublist]
mRNA = [item.item() for sublist in mRNA for item in sublist]
pred_prob = [item.item() for sublist in pred_prob for item in sublist]
soft_label = [item.item() for sublist in soft_label for item in sublist]
pd.DataFrame({'accession': accession, 'raw_prob': raw_prob, 'mRNA': mRNA, 'pred_prob': pred_prob, 'soft_label': soft_label})

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name             | Type              | Params
-------------------------------------------------------
0 | lin1             | Linear            | 20    
1 | lin2             | Linear            | 11    
2 | criterion        | BCEWithLogitsLoss | 0     
3 | train_collection | MetricCollection  | 0     
4 | val_collection   | MetricCollection  | 0     
5 | test_collection  | MetricCollection  | 0     
-------------------------------------------------------
31        Trainable params
0         Non-trainable params
31        Total params
0.000     Total estimated model params size (MB)


Epoch 10:  45%|████▍     | 47/105 [01:47<02:13,  2.29s/it, v_num=97, loss=-.203, val_loss=-.0114, val_acc=0.760, val_auc=0.830, val_f1=0.711] 
                                                                            

  rank_zero_warn(
  rank_zero_warn(


Epoch 10: 100%|██████████| 105/105 [00:00<00:00, 143.22it/s, v_num=97, loss=-.314] 
Epoch 11: 100%|██████████| 105/105 [00:00<00:00, 146.27it/s, v_num=97, loss=0.736, val_loss=-.0882, val_acc=0.764, val_auc=0.830, val_f1=0.721]  
Epoch 12: 100%|██████████| 105/105 [00:00<00:00, 152.38it/s, v_num=97, loss=-.784, val_loss=-.141, val_acc=0.766, val_auc=0.830, val_f1=0.724] 
Epoch 13: 100%|██████████| 105/105 [00:00<00:00, 153.06it/s, v_num=97, loss=0.166, val_loss=-.192, val_acc=0.765, val_auc=0.830, val_f1=0.723]
Epoch 14: 100%|██████████| 105/105 [00:00<00:00, 151.92it/s, v_num=97, loss=-.323, val_loss=-.241, val_acc=0.765, val_auc=0.830, val_f1=0.718] 
Epoch 15: 100%|██████████| 105/105 [00:00<00:00, 154.61it/s, v_num=97, loss=-.104, val_loss=-.298, val_acc=0.765, val_auc=0.830, val_f1=0.723]  
Epoch 16: 100%|██████████| 105/105 [00:00<00:00, 156.63it/s, v_num=97, loss=-.197, val_loss=-.352, val_acc=0.772, val_auc=0.830, val_f1=0.735]  
Epoch 17: 100%|██████████| 105/105 [00:00<00:00, 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [102]:
test_dataloader

<torch.utils.data.dataloader.DataLoader at 0x156841660>

In [54]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/

Launching TensorBoard...