In [1]:
# Import required libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from models.Denoising import DenoisingAutoencoder   # Import the PyTorch DAE
from tqdm import tqdm
 

In [2]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
# =========================
# Data Preparation
# =========================
# Load the data
data_file = 'data/GEL_data/catalogues_Prostate_SBS.tsv'
mf_df = pd.read_table(data_file, index_col=0).astype('float32')

# Transform the count data into frequencies

mf_df = mf_df.div(mf_df.sum(axis=1), axis=0)

print(mf_df.head())

mf_tensor = torch.tensor(mf_df.values)
 
# Add noise
noise_factor = 0.5
noisy_data = mf_tensor + noise_factor * torch.randn_like(mf_tensor)
noisy_data = torch.clamp(noisy_data, 0., 1.)  # Clip values to [0, 1]
 
# Train-test split
test_split = 0.2
num_samples = mf_df.shape[0]
test_size = int(num_samples * test_split)
train_size = num_samples - test_size

 
train_data, test_data = torch.utils.data.random_split(
    TensorDataset(noisy_data, mf_tensor), [train_size, test_size])
 
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)
 
 

         GEL-2371244-11  GEL-2394208-11  GEL-2747967-11  GEL-2428569-11  \
A[C>A]A        0.001625        0.003074        0.003074        0.012119   
A[C>A]C        0.002203        0.003204        0.002804        0.018159   
A[C>A]G        0.002249        0.002571        0.001285        0.011568   
A[C>A]T        0.001567        0.002999        0.001908        0.016628   
C[C>A]A        0.001853        0.002236        0.003258        0.014950   

         GEL-2821784-11  GEL-2712389-11  GEL-2363308-11  GEL-2741618-11  \
A[C>A]A        0.002810        0.003117        0.005269        0.002722   
A[C>A]C        0.002804        0.003605        0.005341        0.003338   
A[C>A]G        0.002892        0.001607        0.005784        0.002571   
A[C>A]T        0.002385        0.001976        0.005043        0.002794   
C[C>A]A        0.003450        0.003067        0.004792        0.001981   

         GEL-2888061-11  GEL-2075446-11  ...  GEL-2596847-11  GEL-2000858-11  \
A[C>A]A        0.0

In [3]:
print(mf_df)
print(train_loader.dataset[0])

         GEL-2371244-11  GEL-2394208-11  GEL-2747967-11  GEL-2428569-11  \
A[C>A]A        0.001625        0.003074        0.003074        0.012119   
A[C>A]C        0.002203        0.003204        0.002804        0.018159   
A[C>A]G        0.002249        0.002571        0.001285        0.011568   
A[C>A]T        0.001567        0.002999        0.001908        0.016628   
C[C>A]A        0.001853        0.002236        0.003258        0.014950   
...                 ...             ...             ...             ...   
G[T>G]T        0.001564        0.002086        0.001391        0.007648   
T[T>G]A        0.001913        0.003689        0.003279        0.007925   
T[T>G]C        0.002048        0.002421        0.003166        0.007263   
T[T>G]G        0.001242        0.002130        0.003372        0.010648   
T[T>G]T        0.001128        0.001900        0.002731        0.007244   

         GEL-2821784-11  GEL-2712389-11  GEL-2363308-11  GEL-2741618-11  \
A[C>A]A        0.002810 

In [4]:
# =========================
# Model Initialization
# =========================
input_dim = mf_df.shape[1]
latent_dim = 100
model = DenoisingAutoencoder(input_dim, latent_dim).to(device)
 
# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()


In [5]:

# Check & Fix ?

# =========================
# Training Loop
# =========================
epochs = 50
 
for epoch in tqdm(range(epochs)):
    model.train()
    train_loss = 0.0
    for batch_noisy, batch_clean in train_loader:
        batch_noisy, batch_clean = batch_noisy.to(device), batch_clean.to(device)
        # Forward pass
        outputs = model(batch_noisy)
        loss = criterion(outputs, batch_clean)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Validation loss
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_noisy, batch_clean in test_loader:
            batch_noisy, batch_clean = batch_noisy.to(device), batch_clean.to(device)
            outputs = model(batch_noisy)
            loss = criterion(outputs, batch_clean)
            val_loss += loss.item()
    # print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, "
          # f"Val Loss: {val_loss / len(test_loader):.4f}")
 
# =========================
# Save Results
# =========================
# Save latent representations
model.eval()
encoded_data = model.encoder(mf_tensor.to(device)).cpu().detach().numpy()
encoded_df = pd.DataFrame(encoded_data, index=mf_df.index)
encoded_df.to_csv('encoded_representations.tsv', sep='\t')
 
print("Training completed and latent representations saved!")

100%|██████████| 50/50 [00:00<00:00, 72.94it/s]

Training completed and latent representations saved!





In [6]:
print(val_loss)

2.9583534342236817e-05
