## Import Required Libraries

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pandas as pd

## Define the Sparse Autoencoder Model

In [18]:
class SparseAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, sparsity_lambda=1e-3, sparsity_target=0.05):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.sparsity_lambda = sparsity_lambda
        self.sparsity_target = sparsity_target

    def forward(self, x):
        encoded = F.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded, encoded

    def sparsity_loss(self, encoded):
        # Calculate the mean activation per neuron in the hidden layer
        mean_activation = torch.mean(encoded, dim=0)
        
        # KL divergence between target sparsity and actual sparsity
        kl_divergence = self.sparsity_target * torch.log(self.sparsity_target / mean_activation) + \
                        (1 - self.sparsity_target) * torch.log((1 - self.sparsity_target) / (1 - mean_activation))
        
        return self.sparsity_lambda * torch.sum(kl_divergence)


## Prepare the Dataset

In [19]:
# Define the path where the data is located
data_path = r"C:/Users/divyas/Documents/hackathons/CMI_PB/Tasks/Task_IGg_PT/Data_Task_IGg_PT/"

# Load the datasets using the data path
X_train = pd.read_csv(data_path + "abtiter_data_X_train.csv", index_col=0)
y_train = pd.read_csv(data_path + "abtiter_data_y_train.csv", index_col=0)
X_test = pd.read_csv(data_path + "abtiter_data_X_test.csv", index_col=0)

# Display the shapes of the datasets
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (111, 35)
y_train shape: (111, 1)
X_test shape: (54, 35)


In [20]:
# Mapping categorical columns to numerical values
X_train['infancy_vac'] = X_train['infancy_vac'].map({'wP': 0, 'aP': 1})
X_train['biological_sex'] = X_train['biological_sex'].map({'Female': 0, 'Male': 1})

# Dropping the unnecessary columns
X_train = X_train.drop(columns=['dataset', 'timepoint', 'date_of_boost'])

# Display the modified dataframe
print(X_train.head())

X_test['infancy_vac'] = X_test['infancy_vac'].map({'wP': 0, 'aP': 1})
X_test['biological_sex'] = X_test['biological_sex'].map({'Female': 0, 'Male': 1})

# Dropping the unnecessary columns
X_test = X_test.drop(columns=['dataset', 'timepoint', 'date_of_boost'])

# Display the modified dataframe
print(X_test.head())

             IgG_PRN    IgG_FHA   IgG1_PT  IgG1_PRN  IgG1_FHA  IgG1_FIM2/3  \
subject_id                                                                   
1           2.602350  34.050956  7.334714  2.174783  3.013252     1.188744   
3           7.652635   1.096457  1.424098  3.161591  1.287515     0.322658   
4           5.670403   1.048276  3.888604  2.591155  1.269821     2.621216   
5           5.268274   0.084437  7.456313  2.760065  2.864834     7.487345   
6           0.090176   0.379290  0.084132  0.025479  0.654192     0.681225   

             IgG1_TT   IgG1_DT   IgG1_OVA   IgG2_PT  ...   IgG3_OVA  \
subject_id                                           ...              
1           1.428852  2.389153   0.665203  1.000000  ...   1.865388   
3           1.377390  1.523941  33.771912  1.000000  ...   1.119233   
4           1.675259  2.022924   5.777047  4.269877  ...   1.000000   
5           1.537432  2.250237   4.130732  6.070427  ...   5.446934   
6           0.874920  0.369

In [21]:
# Standardize the dataset (fit only on training data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the standardized data into tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

# Create a DataLoader for batching
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# You can now use train_loader for training your model
# For testing, just create a DataLoader for the test data
test_dataset = TensorDataset(X_test_tensor)  # y_test is not available for challenge set
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Example of iterating over the DataLoader
for inputs, labels in train_loader:
    # inputs and labels are your batches
    print(f'Inputs batch shape: {inputs.shape}, Labels batch shape: {labels.shape}')
    # You can now pass inputs and labels to your model for training


Inputs batch shape: torch.Size([32, 32]), Labels batch shape: torch.Size([32, 1])
Inputs batch shape: torch.Size([32, 32]), Labels batch shape: torch.Size([32, 1])
Inputs batch shape: torch.Size([32, 32]), Labels batch shape: torch.Size([32, 1])
Inputs batch shape: torch.Size([15, 32]), Labels batch shape: torch.Size([15, 1])


## Training the Sparse Autoencoder

In [25]:
# Define model, optimizer, and loss function
input_dim = X_train_scaled.shape[1]  # Input dimension based on X_train
hidden_dim = 64  # Adjust hidden dimensions based on your data

model = SparseAutoencoder(input_dim=input_dim, hidden_dim=hidden_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 50

# Training Loop
for epoch in range(num_epochs):
    epoch_loss = 0.0
    model.train()  # Set model to training mode
    for batch_data in train_loader:  # Remove the extra comma here

        # Unpack batch data correctly
        batch_data = batch[0]  # Extract tensor if batch is a tuple/list
    
       # Move to the correct device if necessary (e.g., for GPU support)
        # batch_data = batch_data.to(device)
        optimizer.zero_grad()
        
        # Forward pass
        reconstructed, encoded = model(batch_data)
        
        # Compute reconstruction loss
        reconstruction_loss = F.mse_loss(reconstructed, batch_data)
        
        # Compute sparsity loss
        sparsity_loss = model.sparsity_loss(encoded)
        
        # Total loss
        total_loss = reconstruction_loss + sparsity_loss
        
        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()
        
        epoch_loss += total_loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}")


NameError: name 'device' is not defined