## Import Required Libraries

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


## Define the Sparse Autoencoder Model

In [2]:
# Sparse Autoencoder definition
class SparseAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, sparsity_weight=1e-3):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.sparsity_weight = sparsity_weight

    def forward(self, x):
        encoded = F.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded, encoded

    def sparsity_loss(self, encoded):
        sparsity_loss = torch.mean(torch.abs(encoded))
        return self.sparsity_weight * sparsity_loss


## Prepare the Dataset

In [3]:
# Define the path where the data is located
data_path = r"C:/Users/divyas/Documents/hackathons/CMI_PB/Tasks/Task_IGg_PT/Data_Task_IGg_PT/"

# Load the datasets using the data path
X_train = pd.read_csv(data_path + "abtiter_data_X_train.csv", index_col=0)
y_train = pd.read_csv(data_path + "abtiter_data_y_train.csv", index_col=0)
X_test = pd.read_csv(data_path + "abtiter_data_X_test.csv", index_col=0)

# Display the shapes of the datasets
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (111, 35)
y_train shape: (111, 1)
X_test shape: (54, 35)


## Data Preprocessing

In [4]:
# Mapping categorical columns to numerical values
X_train['infancy_vac'] = X_train['infancy_vac'].map({'wP': 0, 'aP': 1})
X_train['biological_sex'] = X_train['biological_sex'].map({'Female': 0, 'Male': 1})

# Dropping the unnecessary columns
X_train = X_train.drop(columns=['dataset', 'timepoint', 'date_of_boost'])

# Display the modified dataframe
print(X_train.head())

X_test['infancy_vac'] = X_test['infancy_vac'].map({'wP': 0, 'aP': 1})
X_test['biological_sex'] = X_test['biological_sex'].map({'Female': 0, 'Male': 1})

# Dropping the unnecessary columns
X_test = X_test.drop(columns=['dataset', 'timepoint', 'date_of_boost'])

# Display the modified dataframe
print(X_test.head())

             IgG_PRN    IgG_FHA   IgG1_PT  IgG1_PRN  IgG1_FHA  IgG1_FIM2/3  \
subject_id                                                                   
1           2.602350  34.050956  7.334714  2.174783  3.013252     1.188744   
3           7.652635   1.096457  1.424098  3.161591  1.287515     0.322658   
4           5.670403   1.048276  3.888604  2.591155  1.269821     2.621216   
5           5.268274   0.084437  7.456313  2.760065  2.864834     7.487345   
6           0.090176   0.379290  0.084132  0.025479  0.654192     0.681225   

             IgG1_TT   IgG1_DT   IgG1_OVA   IgG2_PT  ...   IgG3_OVA  \
subject_id                                           ...              
1           1.428852  2.389153   0.665203  1.000000  ...   1.865388   
3           1.377390  1.523941  33.771912  1.000000  ...   1.119233   
4           1.675259  2.022924   5.777047  4.269877  ...   1.000000   
5           1.537432  2.250237   4.130732  6.070427  ...   5.446934   
6           0.874920  0.369

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
# Create dataset and split into train and val
dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


NameError: name 'random_split' is not defined

## Training the Sparse Autoencoder

In [None]:
input_dim = X_train.shape[1]
hidden_dim = 64  # Adjust for dimensionality reduction

model = SparseAutoencoder(input_dim=input_dim, hidden_dim=hidden_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 50

train_losses = []

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_data, _ in train_loader:
        optimizer.zero_grad()
        reconstructed, encoded = model(batch_data)
        reconstruction_loss = F.mse_loss(reconstructed, batch_data)
        sparsity_loss = model.sparsity_loss(encoded)
        total_loss = reconstruction_loss + sparsity_loss
        total_loss.backward()
        optimizer.step()
        epoch_loss += total_loss.item()
    avg_epoch_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_epoch_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}")


In [None]:
plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss for Sparse Autoencoder")
plt.legend()
plt.show()


In [None]:
# Extract embeddings for training data
model.eval()
with torch.no_grad():
    _, X_train_encoded = model(X_train_tensor)
    _, X_test_encoded = model(X_test_tensor)

# Convert embeddings to numpy for regression
X_train_encoded_np = X_train_encoded.numpy()
X_test_encoded_np = X_test_encoded.numpy()
y_train_np = y_train_tensor.numpy().ravel()


In [None]:
regressor = LinearRegression()
regressor.fit(X_train_encoded_np, y_train_np)
y_pred_train = regressor.predict(X_train_encoded_np)

# Predict on test data (if labels available, use y_test for evaluation)
y_pred_test = regressor.predict(X_test_encoded_np)
train_rmse = mean_squared_error(y_train_np, y_pred_train, squared=False)
print(f"Training RMSE: {train_rmse:.4f}")


In [None]:
plt.scatter(y_train_np, y_pred_train, alpha=0.7)
plt.xlabel("Actual IgG_PT")
plt.ylabel("Predicted IgG_PT")
plt.title("Training Set: Predicted vs Actual IgG_PT")
plt.plot([y_train_np.min(), y_train_np.max()], [y_train_np.min(), y_train_np.max()], 'k--')
plt.show()
