In [10]:
# Import required libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from models.Denoising import DenoisingAutoencoder   # Import the PyTorch DAE
from tqdm import tqdm
 

In [11]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
# =========================
# Data Preparation
# =========================
# Load the data
data_file = 'data/GEL_data/catalogues_Prostate_SBS.tsv'
mf_df = pd.read_table(data_file, index_col=0).astype('float32')
mf_tensor = torch.tensor(mf_df.values)
 
# Add noise
noise_factor = 0.5
noisy_data = mf_tensor + noise_factor * torch.randn_like(mf_tensor)
noisy_data = torch.clamp(noisy_data, 0., 1.)  # Clip values to [0, 1]
 
# Train-test split
test_split = 0.2
num_samples = mf_df.shape[0]
test_size = int(num_samples * test_split)
train_size = num_samples - test_size

 
train_data, test_data = torch.utils.data.random_split(
    TensorDataset(noisy_data, mf_tensor), [train_size, test_size])
 
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)
 
 

In [12]:
# =========================
# Model Initialization
# =========================
input_dim = mf_df.shape[1]
latent_dim = 100
model = DenoisingAutoencoder(input_dim, latent_dim).to(device)
 
# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
 

In [14]:

# =========================
# Training Loop
# =========================
epochs = 10000
 
for epoch in tqdm(range(epochs)):
    model.train()
    train_loss = 0.0
    for batch_noisy, batch_clean in train_loader:
        batch_noisy, batch_clean = batch_noisy.to(device), batch_clean.to(device)
        # Forward pass
        outputs = model(batch_noisy)
        loss = criterion(outputs, batch_clean)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Validation loss
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_noisy, batch_clean in test_loader:
            batch_noisy, batch_clean = batch_noisy.to(device), batch_clean.to(device)
            outputs = model(batch_noisy)
            loss = criterion(outputs, batch_clean)
            val_loss += loss.item()
    # print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, "
          # f"Val Loss: {val_loss / len(test_loader):.4f}")
 
# =========================
# Save Results
# =========================
# Save latent representations
model.eval()
encoded_data = model.encoder(mf_tensor.to(device)).cpu().detach().numpy()
encoded_df = pd.DataFrame(encoded_data, index=mf_df.index)
encoded_df.to_csv('encoded_representations.tsv', sep='\t')
 
print("Training completed and latent representations saved!")

100%|██████████| 10000/10000 [05:40<00:00, 29.35it/s]

Training completed and latent representations saved!





In [17]:
print(loss)

tensor(28076.2793, device='cuda:0')
