In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold 
from earlystopping import *
from module_multiome import *
import gc

In [3]:
# load data
feature_path = '../dataset/'

# Cell-id tables for the train and test inputs.
# Fix: `test_df` previously re-read the *train* id file (copy-paste slip);
# the test ids live in 'test_multi_inputs_id.feather'.
train_df = pd.read_feather(feature_path+'train_multi_inputs_id.feather')
test_df = pd.read_feather(feature_path+'test_multi_inputs_id.feather')

# Pre-computed input features (train / test) and the training targets.
train_multi_X = np.load(feature_path+'train_multi_X.npy')
test_multi_X = np.load(feature_path+'test_multi_X.npy')
train_multi_y = np.load(feature_path+'train_multi_targets.npy')

# A = model inputs, B = targets the generator learns to reproduce.
A = train_multi_X
B = train_multi_y
A_tensor = torch.tensor(A, dtype=torch.float32)
B_tensor = torch.tensor(B, dtype=torch.float32)

# Paired (input, target) dataset consumed by the cross-validation loop.
dataset = TensorDataset(A_tensor, B_tensor)

# Displayed as the cell output: (n_cells, n_features), (n_cells, n_targets).
train_multi_X.shape, train_multi_y.shape

((105942, 500), (105942, 23418))

In [5]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Mean-squared-error loss used for every loss term in this notebook.
criterion = nn.MSELoss()

In [6]:
%%time
# Training
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

num_epochs = 50

for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

    trainloader = DataLoader(dataset, batch_size=8192, sampler=train_subsampler,num_workers=8)
    testloader = DataLoader(dataset, batch_size=8192, sampler=test_subsampler,num_workers=8)

    generator, discriminator = create_models()
    generator = generator.to(device)
    discriminator = discriminator.to(device)
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)

    early_stopping = EarlyStopping(patience=5, min_delta=0.01)

    for epoch in range(num_epochs):
        generator.train()
        discriminator.train()

        for data in trainloader:
            A_batch, B_batch = data
            A_batch, B_batch = A_batch.to(device), B_batch.to(device)

            # Train Discriminator
            d_optimizer.zero_grad()
            real_output = discriminator(B_batch)
            fake_B = generator(A_batch)
            fake_output = discriminator(fake_B.detach())
            d_loss_real = criterion(real_output, torch.ones_like(real_output))
            d_loss_fake = criterion(fake_output, torch.zeros_like(fake_output))
            d_loss = (d_loss_real + d_loss_fake) / 2
            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            fake_output = discriminator(fake_B)
            g_loss = criterion(fake_output, torch.ones_like(fake_output)) + criterion(fake_B, B_batch)
            g_loss.backward()
            g_optimizer.step()

        # validate
        generator.eval()
        val_loss = 0
        with torch.no_grad():
            for data in testloader:
                A_batch, B_batch = data
                A_batch, B_batch = A_batch.to(device), B_batch.to(device)
                fake_B = generator(A_batch)
                val_loss += criterion(fake_B, B_batch).item()
        
        val_loss /= len(testloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}, val_loss: {val_loss:.4f}')

        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping")
            break




FOLD 0
--------------------------------


  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/50], d_loss: 0.0000, g_loss: 4.8600, val_loss: 3.7620
Epoch [2/50], d_loss: 0.0000, g_loss: 3.8105, val_loss: 2.7394
Epoch [3/50], d_loss: 0.5003, g_loss: 2.3553, val_loss: 2.3485
Epoch [4/50], d_loss: 0.5000, g_loss: 2.2238, val_loss: 2.2151
Epoch [5/50], d_loss: 0.5000, g_loss: 2.1702, val_loss: 2.1691
Epoch [6/50], d_loss: 0.5000, g_loss: 2.1554, val_loss: 2.1543
Epoch [7/50], d_loss: 0.5000, g_loss: 2.1515, val_loss: 2.1491
Epoch [8/50], d_loss: 0.5000, g_loss: 2.1430, val_loss: 2.1410
Epoch [9/50], d_loss: 0.5000, g_loss: 2.1234, val_loss: 2.1166
Epoch [10/50], d_loss: 0.5000, g_loss: 2.1115, val_loss: 2.1089
Epoch [11/50], d_loss: 0.5000, g_loss: 2.1032, val_loss: 2.1062
Epoch [12/50], d_loss: 0.5000, g_loss: 2.0995, val_loss: 2.0996
Epoch [13/50], d_loss: 0.5000, g_loss: 2.1016, val_loss: 2.0963
Epoch [14/50], d_loss: 0.5000, g_loss: 2.0882, val_loss: 2.0930
Epoch [15/50], d_loss: 0.5000, g_loss: 2.0935, val_loss: 2.0917
Epoch [16/50], d_loss: 0.5000, g_loss: 2.0887, va

In [7]:
# Display the architecture of the current `generator` object. The training
# loop rebinds this name on every fold, so this is the last fold's model.
generator

Generator(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=500, out_features=500, bias=True)
  )
  (layer_norm): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
  (fc): Sequential(
    (0): Linear(in_features=500, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=23418, bias=True)
  )
)

In [8]:
def generate_B_from_A(new_A, model=None, batch_size=64):
    """Run a trained generator over `new_A` and return its predictions.

    Parameters
    ----------
    new_A : array-like
        Input features; converted to a float32 tensor and fed to the model in
        batches along the first axis.
    model : torch.nn.Module, optional
        Network used for inference. When omitted, the notebook-level
        `generator` is used, which matches the original behaviour.
    batch_size : int, default 64
        Number of rows sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-batch model outputs concatenated along axis 0.

    Notes
    -----
    The model is switched to eval mode and left in that mode.
    """
    if model is None:
        model = generator

    # Send inputs to whatever device the model lives on instead of forcing
    # `.cuda()`, which raised on CPU-only machines.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free module: nothing to infer a device from.
        model_device = torch.device("cpu")

    new_A_tensor = torch.tensor(new_A, dtype=torch.float32)
    dataloader = DataLoader(TensorDataset(new_A_tensor), batch_size=batch_size)

    model.eval()
    generated_B = []
    with torch.no_grad():
        # TensorDataset yields 1-tuples here because it wraps a single tensor.
        for (A_batch,) in dataloader:
            fake_B = model(A_batch.to(model_device))
            generated_B.append(fake_B.cpu().numpy())

    return np.concatenate(generated_B, axis=0)

# Predict the targets for the test inputs with the current `generator`,
# then display the resulting array as the cell output.
generated_B = generate_B_from_A(test_multi_X)
generated_B

array([[0.60343105, 0.3458233 , 0.34303367, ..., 1.4880004 , 1.259369  ,
        2.474244  ],
       [0.6014136 , 0.34502655, 0.3446757 , ..., 1.4828991 , 1.2593346 ,
        2.4766738 ],
       [0.59464055, 0.34384373, 0.34512666, ..., 1.4688619 , 1.2545477 ,
        2.4668083 ],
       ...,
       [0.4066004 , 0.2936568 , 0.30199626, ..., 0.94134074, 0.9706296 ,
        1.7565163 ],
       [0.4168693 , 0.29230085, 0.30114892, ..., 0.97669625, 0.9810226 ,
        1.7901096 ],
       [0.39712453, 0.29427043, 0.30096447, ..., 0.90688264, 0.96370304,
        1.729365  ]], dtype=float32)

In [9]:
# Sanity check: the second dimension should equal the number of target
# columns in the training targets (23418 in the shape printed at load time).
generated_B.shape

(55935, 23418)

In [None]:
# ---- Data collation: assemble the predictions into the submission table ----

In [10]:
# Reshape the Multiome predictions into long format:
# one row per (cell_id, gene_id) pair with its predicted `target`.
input_path = '../dataset/'

# Evaluation rows, annotated with the technology of each cell.
metadata = pd.read_csv(input_path+'metadata.csv')[['cell_id','technology']]
evaluation_ids = pd.read_csv(input_path+'evaluation_ids.csv')
evaluation_ids = evaluation_ids.merge(metadata, on=['cell_id'], how='left')

# multi
# Only the column labels of the training targets are needed (they name the
# prediction columns), so the frame itself is released straight away.
train_multi_targets = pd.read_hdf(input_path+'train_multi_targets.h5')
multi_targets = train_multi_targets.columns.values.tolist()

del train_multi_targets
gc.collect()

test_preds_multi = pd.DataFrame(generated_B, columns=multi_targets)

# Use `input_path` throughout so this cell does not depend on a path variable
# defined in another cell for the same directory.
test_multi_inputs_id = pd.read_feather(input_path+'test_multi_inputs_id.feather')
# Assign by position: a row-count mismatch between the predictions and the id
# table now raises instead of being silently padded with NaN by index alignment.
test_preds_multi['cell_id'] = test_multi_inputs_id['cell_id'].to_numpy()

# Keep only cells that are actually scored, then go from wide to long format.
test_preds_multi = test_preds_multi[test_preds_multi['cell_id'].isin(evaluation_ids['cell_id'])]
test_preds_multi = pd.melt(test_preds_multi,id_vars='cell_id')
test_preds_multi.columns = ['cell_id','gene_id','target']

del test_multi_inputs_id
gc.collect()


0

In [11]:
# Read the previously saved predictions from pred_cite.csv (long format:
# cell_id, gene_id, target) and show them as the cell output.
test_preds_cite = pd.read_csv(
    '../dataset/pred_cite.csv',
    index_col=0,  # the first CSV column is used as the row index
)
test_preds_cite

Unnamed: 0,cell_id,gene_id,target
0,c2150f55becb,CD86,0.599453
1,65b7edf8a4da,CD86,0.599510
2,c1b26cb1057b,CD86,0.586568
3,917168fa6f83,CD86,0.593099
4,2b29feeca86d,CD86,0.589955
...,...,...,...
6812815,a9b4d99f1f50,CD224,2.738467
6812816,0e2c1d0782af,CD224,2.715506
6812817,a3cbc5aa0ec3,CD224,5.346298
6812818,75b350243add,CD224,3.651095


In [12]:
# merge final results
# Stack the CITE and Multiome predictions (both long format) and attach them
# to the evaluation rows by (cell_id, gene_id).
test_preds = pd.concat([test_preds_cite,test_preds_multi])

# Drop a `target` column left over from a previous run of this cell; without
# this, re-running produces target_x/target_y and the selection below fails.
evaluation_ids = (
    evaluation_ids
    .drop(columns=['target'], errors='ignore')
    .merge(test_preds, on=['cell_id','gene_id'], how='left')
)

# A left merge leaves NaN where no prediction matched; report it rather than
# writing an incomplete submission silently.
n_missing = int(evaluation_ids['target'].isna().sum())
if n_missing:
    print(f'WARNING: {n_missing} evaluation rows have no prediction (target is NaN)')

evaluation_ids[['row_id','target']].to_csv('../dataset/submission.csv',index=False)