In [13]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from skorch import NeuralNetRegressor
from skorch.callbacks import Checkpoint
from skorch.callbacks import EarlyStopping
from skorch.callbacks import EpochScoring
from skorch.callbacks import LRScheduler
from skorch.helper import predefined_split
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import Dataset
from torch.utils.data import Subset

from gene_expression_prediction.data_processor import FeatureNames
from gene_expression_prediction.data_processor import ProcessedFeatures
from gene_expression_prediction.data_processor import process_all_cell_lines
from gene_expression_prediction.data_processor import save_processed_features

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [14]:
# TODO:
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE:
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you.
# However, you can resplit the data in any way you want.

# ---------------------------INSERT CODE HERE---------------------------
# Root directory holding the raw inputs; processed features are written back here too.
DATA_PATH = "/workspaces/Gene-Expression-Prediction/data"

# Binning configuration: 196,608 bp / 128 bp per bin = 1,536 bins per gene.
WINDOW_SIZE = 196_608
BIN_SIZE = 128
SAMPLE_N = None  # Set to e.g., 100 for quick testing, None for full dataset

# One ProcessedFeatures-style result per cell line, unpacked in X1, X2, X3 order.
processed_cell_lines = process_all_cell_lines(
    data_path=DATA_PATH,
    window_size=WINDOW_SIZE,
    bin_size=BIN_SIZE,
    sample_n=SAMPLE_N,
    save_to_disk=True,
    output_dir=DATA_PATH,
)
cell_line_x1, cell_line_x2, cell_line_x3 = processed_cell_lines

# ----------------------------------------------------------------------


GENE EXPRESSION FEATURE PROCESSING - ENFORMER STYLE
Data path:     /workspaces/Gene-Expression-Prediction/data
Window size:   196,608 bp (±98,304 bp from TSS)
Bin size:      128 bp
Total bins:    1536
Sample size:   All genes
Save to disk:  True

Loading data readers...
All required data paths have been successfully validated.

PROCESSING CELL LINE X1 (Training)

Uniform Binning Configuration (Enformer-style):
  Target window size:   196,608 bp
  Bin size:             128 bp
  Computed n_bins:      1,536 (window_size // bin_size)
  Actual coverage:      196,608 bp (n_bins * bin_size)
  Window center:        TSS ±98,304 bp
  Number of features:   7
  Number of genes:      16,284
  Tensor shape:         (16,284, 1,536, 7)
  Memory (est.):        0.70 GB



Processing X1:   0%|          | 0/16284 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# cell_line_x1 = load_processed_features(
#     "/workspaces/Gene-Expression-Prediction/data/processed_data_x1"
# )
# cell_line_x2 = load_processed_features(
#     "/workspaces/Gene-Expression-Prediction/data/processed_data_x2"
# )
# cell_line_x3 = load_processed_features(
#     "/workspaces/Gene-Expression-Prediction/data/processed_data_x3"
# )

In [4]:
def validate_combined_features(features: ProcessedFeatures):
    """Check that tensor, annotations and targets of a ProcessedFeatures line up.

    Prints a short report and verifies, in this order:
      1. the signal tensor has one row per annotated gene,
      2. (when targets exist) there is one target per annotated gene,
      3. (when targets exist) the target index equals the annotation
         ``gene_name`` column element by element.

    Returns:
        True when every check passes, False at the first failing check.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("VALIDATING COMBINED FEATURES ALIGNMENT")
    print(banner)

    annotations = features.gene_annotations
    expected_rows = len(annotations)
    annotated_names = annotations["gene_name"].values
    print(f"Number of genes: {expected_rows}")

    signal_shape = features.sequence_signal_tensor.shape
    print(f"Promoter tensor shape: {signal_shape}")
    print(f"Gene annotations shape: {annotations.shape}")

    targets = features.target_expression
    targets_shape = "None" if targets is None else targets.shape
    print(f"Target expression shape: {targets_shape}")

    # Check 1: one tensor row per gene.
    if signal_shape[0] != expected_rows:
        print("ERROR: Promoter tensor length doesn't match gene annotations!")
        return False

    # Checks 2 and 3 only apply when targets are present (test data has none).
    if targets is not None:
        if len(targets) != expected_rows:
            print("ERROR: Target expression length doesn't match gene annotations!")
            return False

        target_names = targets.index.values
        same_order = np.array_equal(annotated_names, target_names)
        if not same_order:
            print(
                "ERROR: Target expression index doesn't match gene_annotations order!"
            )
            print(f"  First 5 in annotations: {annotated_names[:5]}")
            print(f"  First 5 in targets: {target_names[:5]}")
            return False

    print("All alignments validated successfully!")
    print(banner + "\n")
    return True


def combine_cell_lines(
    cell_line_x1: ProcessedFeatures, cell_line_x2: ProcessedFeatures
) -> ProcessedFeatures:
    """
    Combines two ProcessedFeatures objects ensuring proper alignment.

    The signal tensors are concatenated along the gene axis (X1 rows first),
    and gene names / target indices are suffixed with ``_x1`` / ``_x2`` so the
    same gene in both cell lines stays distinguishable.

    Args:
        cell_line_x1: Features of the first cell line; must carry targets.
        cell_line_x2: Features of the second cell line; must carry targets.

    Returns:
        A ProcessedFeatures whose binning metadata (window_size, bin_size,
        n_bins) is copied from ``cell_line_x1``.

    Raises:
        ValueError: If either input has no target expression, if an annotated
            gene has no target entry, or if the combined object fails
            ``validate_combined_features``.
    """
    print("\nCombining cell line features...")

    if (
        cell_line_x1.target_expression is None
        or cell_line_x2.target_expression is None
    ):
        raise ValueError(
            "Both cell lines must provide target expression to be combined."
        )

    combined_tensor = np.concatenate(
        [cell_line_x1.sequence_signal_tensor, cell_line_x2.sequence_signal_tensor],
        axis=0,
    )

    annotations_x1 = cell_line_x1.gene_annotations.copy()
    annotations_x2 = cell_line_x2.gene_annotations.copy()
    annotations_x1["gene_name"] = annotations_x1["gene_name"] + "_x1"
    annotations_x2["gene_name"] = annotations_x2["gene_name"] + "_x2"
    combined_annotations = pd.concat(
        [annotations_x1, annotations_x2], ignore_index=True
    )

    targets_x1 = cell_line_x1.target_expression.copy()
    targets_x2 = cell_line_x2.target_expression.copy()
    targets_x1.index = targets_x1.index + "_x1"
    targets_x2.index = targets_x2.index + "_x2"
    combined_targets = pd.concat([targets_x1, targets_x2])

    # reindex() fills labels it cannot find with NaN and makes the index equal
    # to the annotation names by construction, so the alignment check further
    # down could never catch a gene without a target. Detect that case here.
    expected_names = combined_annotations["gene_name"]
    missing_mask = ~expected_names.isin(combined_targets.index)
    if missing_mask.any():
        missing_names = expected_names[missing_mask].tolist()
        raise ValueError(
            f"{len(missing_names)} annotated genes have no target expression "
            f"(first few: {missing_names[:5]}). Data is misaligned."
        )

    combined_targets = combined_targets.reindex(expected_names)

    combined_features = ProcessedFeatures(
        gene_annotations=combined_annotations,
        sequence_signal_tensor=combined_tensor,
        window_size=cell_line_x1.window_size,
        bin_size=cell_line_x1.bin_size,
        n_bins=cell_line_x1.n_bins,
        target_expression=combined_targets,
    )

    if not validate_combined_features(combined_features):
        raise ValueError("Combined features validation failed! Data is misaligned.")

    return combined_features


# Merge the X1 and X2 cell lines into one training set; combine_cell_lines
# raises ValueError if the merged object fails its alignment validation.
combined_features = combine_cell_lines(cell_line_x1, cell_line_x2)


Combining cell line features...

VALIDATING COMBINED FEATURES ALIGNMENT
Number of genes: 20
Promoter tensor shape: (20, 1536, 7)
Gene annotations shape: (20, 7)
Target expression shape: (20,)
All alignments validated successfully!



## Work Package 1.2 - Model Building

In [10]:
# TODO:
# Select the best model to predict gene expression from the obtained features in WP 1.1.
# pytorch_dataset.py

class GeneExpressionDataset(Dataset):
    """
    PyTorch Dataset for gene expression prediction.

    Handles a 3D input tensor (N, B, F) from ProcessedFeatures.
    Implements `asinh` transform and per-feature z-score normalization.

    Two modes, selected by ``normalize_params``:
      * training (``normalize_params is None``): normalization statistics are
        computed from this data and targets are log1p-transformed and z-scored;
      * test (``normalize_params`` given): the supplied statistics are applied
        and targets are filled with zeros as placeholders.
    """

    # Lower bound for every standard deviation used as a divisor, so constant
    # features or constant targets normalize to 0 instead of NaN/Inf.
    STD_EPSILON = 1e-8

    def __init__(self, processed_features: "ProcessedFeatures", normalize_params=None):
        """
        Args:
            processed_features: ProcessedFeatures object.
                                Assumes sequence_signal_tensor is (N_genes, N_bins, N_features).
            normalize_params: Optional dict with normalization params from training
                              (the dict returned by ``get_normalization_params``).

        Raises:
            ValueError: In training mode when ``target_expression`` is None.
        """
        is_training = normalize_params is None
        has_targets = processed_features.target_expression is not None

        if is_training and not has_targets:
            raise ValueError("Target expression required for training dataset.")

        # --- INPUT TENSOR PROCESSING ---
        # 1. Load 3D tensor: (N_genes, N_bins, N_features)
        pt = torch.from_numpy(processed_features.sequence_signal_tensor).float()

        # 2. Permute to (N_genes, N_features, N_bins), the channels-first layout
        #    expected by Conv1d.
        pt = pt.permute(0, 2, 1).contiguous()
        # Named explicitly (not `F`) to avoid shadowing the torch.nn.functional alias.
        n_samples, n_features, _ = pt.shape

        # 3. Apply asinh transformation
        # This stabilizes variance before normalization
        pt = torch.asinh(pt)

        if is_training:
            # --- TRAINING MODE: Compute normalization ---
            print("\n" + "=" * 60)
            print("TRAINING MODE: Computing per-feature normalization")
            print("=" * 60)

            # Per-feature statistics, reduced over the sample and bin axes.
            self.feature_means = pt.mean(dim=(0, 2))  # Shape (F,)
            # Clamp so a constant feature does not cause division by zero.
            self.feature_stds = pt.std(dim=(0, 2)).clamp_min(self.STD_EPSILON)

            print(f"Computed stats for {n_features} features (1 channel: mean)")
            print(f"  Mean stats tensor shape: {self.feature_means.shape}")
            print(f"  Std stats tensor shape:  {self.feature_stds.shape}")

            print("\nTarget normalization:")
            # Missing targets are treated as zero expression before log1p.
            target_values = processed_features.target_expression.fillna(0.0).values
            log_targets = np.log1p(target_values)

            self.target_log_mean = log_targets.mean()
            # Same guard as for the features: constant targets would otherwise
            # give std == 0 and turn every normalized target into NaN.
            self.target_log_std = np.maximum(log_targets.std(), self.STD_EPSILON)

            normalized_targets = (
                log_targets - self.target_log_mean
            ) / self.target_log_std
            self.targets = torch.from_numpy(normalized_targets).float()
            print(f"  log1p mean: {self.target_log_mean:.3f}, std: {self.target_log_std:.3f}")

        else:
            # --- TEST MODE: Apply training normalization ---
            print("\n" + "=" * 60)
            print("TEST MODE: Applying training normalization parameters")
            print("=" * 60)

            self.feature_means = normalize_params["feature_means"]
            self.feature_stds = normalize_params["feature_stds"]
            self.target_log_mean = normalize_params["target_log_mean"]
            self.target_log_std = normalize_params["target_log_std"]

            print(f"Loaded stats for {self.feature_means.shape[0]} features")
            self.targets = torch.zeros(n_samples)  # Dummy targets for test set

        # --- APPLY PER-FEATURE NORMALIZATION ---
        # Reshape the (F,) statistics to (1, F, 1) so they broadcast over the
        # sample and bin axes of the (N, F, B) tensor.
        mean_tensor = self.feature_means.unsqueeze(0).unsqueeze(2)
        std_tensor = self.feature_stds.unsqueeze(0).unsqueeze(2)

        pt = (pt - mean_tensor) / std_tensor

        # --- FINAL TENSOR ---
        # The tensor is already in the correct (N, F, B) shape
        self.promoter_tensor = pt

        print(f"\nDataset size: {n_samples} samples")
        print(f"Tensor shape: {tuple(self.promoter_tensor.shape)}")
        print("=" * 60 + "\n")

    def get_normalization_params(self):
        """Returns normalization parameters for use with test set."""
        return {
            "feature_means": self.feature_means,
            "feature_stds": self.feature_stds,
            "target_log_mean": self.target_log_mean,
            "target_log_std": self.target_log_std,
        }

    def denormalize_targets(self, normalized_predictions):
        """
        Converts normalized predictions back to original gene expression scale.

        Inverts the training transform (z-score, then log1p) and clips the
        result at zero, since expression values cannot be negative.

        Args:
            normalized_predictions: Tensor or array of model outputs in the
                normalized log space.

        Returns:
            NumPy array of non-negative values on the original scale.
        """
        if torch.is_tensor(normalized_predictions):
            normalized_predictions = normalized_predictions.cpu().numpy()

        log_predictions = (
            normalized_predictions * self.target_log_std
        ) + self.target_log_mean
        original_scale = np.expm1(log_predictions)
        return np.clip(original_scale, 0, None)

    def __len__(self) -> int:
        return len(self.targets)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.promoter_tensor[idx], self.targets[idx]

class PromoterAttentionCNN(nn.Module):
    """1D CNN with residual blocks, self-attention and an MLP regression head.

    Input is expected channels-first, ``(batch, n_features * n_channels, bins)``;
    the output is one scalar per sample, shape ``(batch,)``.

    Args:
        n_bins: Accepted for interface symmetry but not used by the network;
            the adaptive pooling layer fixes the sequence length at 32.
        n_features: Number of signal features per bin.
        n_channels: Number of channels per feature; the first convolution
            takes ``n_features * n_channels`` input channels.
    """

    def __init__(self, n_bins: int, n_features: int, n_channels: int):
        super().__init__()
        stem_width, mid_width, top_width = 64, 128, 256
        groups = 8
        pooled_length = 32
        input_width = n_features * n_channels

        # Stem convolution
        self.conv1 = nn.Conv1d(input_width, stem_width, kernel_size=7, padding="same")
        self.norm1 = nn.GroupNorm(groups, stem_width)
        self.dropout1 = nn.Dropout(0.1)

        # First residual stage (kernel 5), followed by 2x downsampling
        self.res1_conv1 = nn.Conv1d(stem_width, stem_width, kernel_size=5, padding="same")
        self.res1_norm1 = nn.GroupNorm(groups, stem_width)
        self.res1_conv2 = nn.Conv1d(stem_width, stem_width, kernel_size=5, padding="same")
        self.res1_norm2 = nn.GroupNorm(groups, stem_width)
        self.pool1 = nn.MaxPool1d(2)

        # Widen 64 -> 128
        self.conv2 = nn.Conv1d(stem_width, mid_width, kernel_size=5, padding="same")
        self.norm2 = nn.GroupNorm(groups, mid_width)
        self.dropout2 = nn.Dropout(0.1)

        # Second residual stage (kernel 3), followed by 2x downsampling
        self.res2_conv1 = nn.Conv1d(mid_width, mid_width, kernel_size=3, padding="same")
        self.res2_norm1 = nn.GroupNorm(groups, mid_width)
        self.res2_conv2 = nn.Conv1d(mid_width, mid_width, kernel_size=3, padding="same")
        self.res2_norm2 = nn.GroupNorm(groups, mid_width)
        self.pool2 = nn.MaxPool1d(2)

        # Widen 128 -> 256
        self.conv3 = nn.Conv1d(mid_width, top_width, kernel_size=3, padding="same")
        self.norm3 = nn.GroupNorm(groups, top_width)
        self.dropout3 = nn.Dropout(0.2)

        # Adaptive average pooling to a fixed length of 32 positions
        self.gap = nn.AdaptiveAvgPool1d(pooled_length)

        # Self-attention over the 32 pooled positions
        self.self_attn = nn.MultiheadAttention(
            embed_dim=top_width, num_heads=8, dropout=0.1, batch_first=True
        )
        self.attn_norm = nn.LayerNorm(top_width)

        # Regression head: 256 * 32 -> 512 -> 128 -> 1
        self.fc1 = nn.Linear(top_width * pooled_length, 512)
        self.fc_norm = nn.LayerNorm(512)
        self.fc_dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, 128)
        self.fc_dropout2 = nn.Dropout(0.1)
        self.fc3 = nn.Linear(128, 1)

    def _residual_block(self, x, conv1, norm1, conv2, norm2):
        """Two conv+norm layers whose output is added to the input before ReLU."""
        branch = conv1(x)
        branch = F.relu(norm1(branch))
        branch = norm2(conv2(branch))
        return F.relu(branch + x)

    def forward(self, x):
        """Map ``(batch, channels, bins)`` inputs to ``(batch,)`` predictions."""
        # Stem
        feats = self.dropout1(F.relu(self.norm1(self.conv1(x))))

        # Residual stage 1 + downsample
        feats = self._residual_block(
            feats, self.res1_conv1, self.res1_norm1, self.res1_conv2, self.res1_norm2
        )
        feats = self.pool1(feats)

        # Widen to 128 channels
        feats = self.dropout2(F.relu(self.norm2(self.conv2(feats))))

        # Residual stage 2 + downsample
        feats = self._residual_block(
            feats, self.res2_conv1, self.res2_norm1, self.res2_conv2, self.res2_norm2
        )
        feats = self.pool2(feats)

        # Widen to 256 channels
        feats = self.dropout3(F.relu(self.norm3(self.conv3(feats))))

        # Fixed-length pooling: (batch, 256, 32)
        feats = self.gap(feats)

        # Attention works on (batch, positions, channels)
        tokens = feats.transpose(1, 2)
        attended, _ = self.self_attn(tokens, tokens, tokens)
        tokens = self.attn_norm(tokens + attended)  # residual connection

        # MLP head on the flattened token grid
        hidden = torch.flatten(tokens, 1)
        hidden = self.fc_dropout(F.relu(self.fc_norm(self.fc1(hidden))))
        hidden = self.fc_dropout2(F.relu(self.fc2(hidden)))
        return self.fc3(hidden).squeeze(1)


# ----------------------------------------------------------------------

In [11]:
def spearman_epoch_scorer(net, dataset_valid, y=None):
    """
    skorch epoch scorer returning the Spearman correlation on the validation set.

    The ``y`` argument is deliberately ignored: the true values are rebuilt by
    iterating ``dataset_valid`` and reading the target of every ``(x, y)`` item,
    so they always cover the same samples as ``net.predict(dataset_valid)``.

    Returns:
        The correlation as a Python float, or 0.0 when it is undefined (NaN).
    """
    predicted = net.predict(dataset_valid).ravel()

    observed_values = []
    for _, target in dataset_valid:
        observed_values.append(target.item())
    observed = np.array(observed_values).ravel()

    rho, _p_value = spearmanr(observed, predicted)

    # spearmanr yields NaN when one side is constant (e.g. all predictions
    # identical); report that as "no correlation" rather than propagating NaN.
    return 0.0 if np.isnan(rho) else float(rho)


def validate_dataset(dataset):
    """Print NaN/Inf flags and value ranges for a dataset's inputs and targets.

    Reads ``dataset.promoter_tensor`` and ``dataset.targets`` and also reports
    how many targets are exactly 0.0. Purely informational: nothing is
    returned and nothing is raised for bad values.
    """
    print("\n=== Data Validation ===")

    inputs = dataset.promoter_tensor
    inputs_nan, inputs_inf = torch.isnan(inputs).any(), torch.isinf(inputs).any()
    print(f"Promoter - NaN: {inputs_nan}, Inf: {inputs_inf}")
    inputs_low, inputs_high = dataset.promoter_tensor.min(), dataset.promoter_tensor.max()
    print(f"Promoter range: [{inputs_low:.3f}, {inputs_high:.3f}]")

    labels = dataset.targets
    labels_nan, labels_inf = torch.isnan(labels).any(), torch.isinf(labels).any()
    print(f"Targets - NaN: {labels_nan}, Inf: {labels_inf}")
    print(f"Targets range: [{labels.min():.3f}, {labels.max():.3f}]")

    zero_count = (dataset.targets == 0.0).sum()
    print(f"Targets with value 0.0: {zero_count} / {len(dataset.targets)}")
    print("=====================\n")


class PearsonCorrelationLoss(nn.Module):
    """Differentiable Pearson correlation loss (1 - correlation).

    Args:
        eps: Constant added to the denominator so a zero-variance batch gives
            a correlation of 0 (loss 1) instead of dividing by zero.
    """

    def __init__(self, eps: float = 1e-8):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        y_pred = y_pred.view(-1)
        y_true = y_true.view(-1)

        vx = y_pred - torch.mean(y_pred)
        vy = y_true - torch.mean(y_true)

        # The derivative of sqrt at 0 is infinite, so a batch with zero variance
        # (a single sample, or identical predictions) would back-propagate NaN.
        # Clamping to the smallest positive normal value leaves every
        # non-degenerate batch unchanged and gives a zero gradient otherwise.
        tiny = torch.finfo(vx.dtype).tiny
        norm_x = torch.sqrt(torch.sum(vx**2).clamp_min(tiny))
        norm_y = torch.sqrt(torch.sum(vy**2).clamp_min(tiny))

        cost = torch.sum(vx * vy) / (norm_x * norm_y + self.eps)
        return 1 - cost


def train_with_skorch(
    full_ds: Dataset,
    model: nn.Module,
    *,
    device: torch.device,
    batch_size: int = 64,
    max_epochs: int = 10_000,
    learning_rate: float = 1e-3,
    num_workers: int = 0,
    patience: int = 10,
    min_delta: float = 1e-4,
    checkpoint_dir: str = "checkpoints",
    monitor_name: str = "valid_spearman",
):
    """Train ``model`` on an 80/20 random split of ``full_ds`` using skorch.

    The validation Spearman correlation (``monitor_name``) drives early
    stopping and checkpointing; after fitting, the best checkpoint is loaded
    back into the network.

    Args:
        full_ds: Dataset yielding ``(input, target)`` pairs.
        model: Module to train; it is optimized with AdamW and a Pearson
            correlation loss.
        device: Device to train on. The full device string (including any
            index such as ``cuda:1``) is forwarded to skorch.
        batch_size: Mini-batch size for training and validation.
        max_epochs: Upper bound on the number of epochs.
        learning_rate: Initial AdamW learning rate.
        num_workers: DataLoader worker processes for both iterators.
        patience: Early-stopping patience in epochs.
        min_delta: Early-stopping improvement threshold.
        checkpoint_dir: Directory where ``best_model.pt`` is written.
        monitor_name: Name of the validation score in the skorch history.

    Returns:
        ``(net, best_torch_model)``: the fitted skorch regressor and its
        underlying module, both holding the best checkpoint's weights.
    """
    N = len(full_ds)
    indices = list(range(N))
    # Fixed random_state keeps the train/validation split identical across runs.
    train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)

    train_ds = Subset(full_ds, train_idx)
    valid_ds = Subset(full_ds, val_idx)

    pin_memory = device.type == "cuda"
    callbacks = [
        # use_caching=False: the scorer recomputes predictions on the full
        # validation dataset itself.
        EpochScoring(
            spearman_epoch_scorer,
            lower_is_better=False,
            name=monitor_name,
            use_caching=False,
        ),
        EarlyStopping(
            monitor=monitor_name,
            patience=patience,
            threshold=min_delta,
            lower_is_better=False,
        ),
        # Saves the module weights whenever the monitored score hits a new best.
        Checkpoint(
            dirname=checkpoint_dir,
            monitor=f"{monitor_name}_best",
            f_params="best_model.pt",
        ),
        LRScheduler(
            policy=CosineAnnealingWarmRestarts,
            T_0=10,
            T_mult=2,
            eta_min=1e-6,
        ),
    ]

    net = NeuralNetRegressor(
        model,
        criterion=PearsonCorrelationLoss,
        optimizer=optim.AdamW,
        optimizer__lr=learning_rate,
        optimizer__weight_decay=1e-4,
        max_epochs=max_epochs,
        batch_size=batch_size,
        # str(device) keeps an explicit index ("cuda:1"); device.type drops it.
        device=str(device),
        train_split=predefined_split(valid_ds),
        callbacks=callbacks,
        iterator_train__num_workers=num_workers,
        iterator_valid__num_workers=num_workers,
        iterator_train__pin_memory=pin_memory,
        iterator_valid__pin_memory=pin_memory,
    )

    print(f"Combined dataset size: {N}")
    print(f"Training set size:   {len(train_idx)}")
    print(f"Validation set size: {len(val_idx)}")

    # Targets come from the dataset items, so no separate y is passed.
    net.fit(train_ds, y=None)

    # Restore the best-scoring weights instead of keeping the last epoch's.
    net.load_params(f_params=Path(checkpoint_dir) / "best_model.pt")
    best_torch_model = net.module_
    return net, best_torch_model

In [12]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

SEED = 42
BATCH_SIZE = 256
LEARNING_RATE = 3e-4
NUM_WORKERS = 2
MAX_EPOCHS = 100
PATIENCE = 15
MIN_DELTA = 1e-4
MODEL_SAVE_PATH = (
    "/workspaces/Gene-Expression-Prediction/data/best_model/best_model.pth"
)

# Seed the RNGs so weight initialization, dropout and shuffling are
# repeatable under Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create the output directory up front: a missing directory should fail now,
# not after the whole training run has finished.
Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

print("Creating dataset...")
full_ds = GeneExpressionDataset(combined_features)
validate_dataset(full_ds)

model = PromoterAttentionCNN(
    n_bins=combined_features.n_bins,
    # Take the channel count from the prepared (N, F, B) tensor instead of
    # hard-coding it, so the model always matches the data.
    n_features=full_ds.promoter_tensor.shape[1],
    n_channels=1,
)

# Run Training
print("Starting training...")
net, best_model = train_with_skorch(
    full_ds=full_ds,
    model=model,
    device=DEVICE,
    batch_size=BATCH_SIZE,
    max_epochs=MAX_EPOCHS,
    learning_rate=LEARNING_RATE,
    num_workers=NUM_WORKERS,
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    checkpoint_dir="checkpoints_promoter_only",
)

# Persist the best weights at a fixed absolute path, independent of the
# working directory the checkpoint callback wrote to.
torch.save(best_model.state_dict(), MODEL_SAVE_PATH)
print(f"\nSaved best model to: {MODEL_SAVE_PATH}")

Using device: cpu
Creating dataset...

TRAINING MODE: Computing per-feature normalization
Computed stats for 7 features (1 channel: mean)
  Mean stats tensor shape: torch.Size([7])
  Std stats tensor shape:  torch.Size([7])

Target normalization:
  log1p mean: 1.403, std: 1.773

Dataset size: 20 samples
Tensor shape: (20, 7, 1536)


=== Data Validation ===
Promoter - NaN: False, Inf: False
Promoter range: [-0.967, 13.568]
Targets - NaN: False, Inf: False
Targets range: [-0.791, 2.495]
Targets with value 0.0: 0 / 20

Starting training...
Combined dataset size: 20
Training set size:   16
Validation set size: 4
  epoch    train_loss    valid_loss    valid_spearman    cp      lr     dur
-------  ------------  ------------  ----------------  ----  ------  ------
      1        [36m1.1628[0m        [32m0.2898[0m            [35m0.6325[0m     +  0.0003  0.8147
      2        [36m0.4194[0m        0.3514            0.6325        0.0003  0.8240
      3        0.5893        0.4425         

## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [11]:
pred = None
test_genes = cell_line_x3.gene_annotations["gene_name"].values

print(f"\nPredicting on {len(test_genes)} test genes\n")

# Create training dataset (computes normalization)
print("Step 1: Loading training data and computing normalization...")
train_ds = GeneExpressionDataset(combined_features)

# Get normalization params
norm_params = train_ds.get_normalization_params()

# Create test dataset (applies training normalization)
print("\nStep 2: Creating test dataset with training normalization...")
test_ds = GeneExpressionDataset(cell_line_x3, normalize_params=norm_params)

# Create and load model.
# The architecture arguments mirror the training cell (n_bins from the
# features, one channel per feature); the feature count is read from the
# prepared (N, F, B) tensor so it cannot drift from the data.
print("\nStep 3: Loading trained model...")
model = PromoterAttentionCNN(
    n_bins=combined_features.n_bins,
    n_features=train_ds.promoter_tensor.shape[1],
    n_channels=1,
)

net = NeuralNetRegressor(
    model,
    device=DEVICE.type,
    batch_size=BATCH_SIZE * 2,
)

# Load the state dict that the training cell saved to MODEL_SAVE_PATH, so
# prediction does not depend on the working directory used for checkpoints.
net.initialize()
net.load_params(f_params=MODEL_SAVE_PATH)

# Predict (returns normalized predictions)
print("\nStep 4: Making predictions...")
pred_normalized = net.predict(test_ds)

# Denormalize back to original scale
print("\nStep 5: Denormalizing predictions...")
pred = train_ds.denormalize_targets(pred_normalized)

# Summary
print(f"\n{'=' * 60}")
print("PREDICTION SUMMARY")
print(f"{'=' * 60}")
print(f"  Shape: {pred.shape}")
print(f"  Range: [{pred.min():.3f}, {pred.max():.3f}]")
print(f"  Mean:  {pred.mean():.3f}")
print(f"  Median: {np.median(pred):.3f}")
print(f"{'=' * 60}\n")

# Validation
assert isinstance(pred, np.ndarray), "Prediction must be numpy array"
assert np.issubdtype(pred.dtype, np.number), "Prediction must be numeric"
assert pred.shape[0] == len(test_genes), "One prediction per gene"
assert not np.isnan(pred).any(), "No NaN values"
assert not np.isinf(pred).any(), "No Inf values"


Predicting on 1984 test genes

Step 1: Loading training data and computing normalization...

TRAINING MODE: Computing normalization parameters

Promoter normalization (global per channel):
  Mean channel: μ=1.015, σ=5.113
  Max channel:  μ=2.123, σ=8.314

Target normalization:
  Original range: [0.0, 19519.8]
  Log range: [0.000, 9.879]
  Normalized range: [-0.653, 4.481]

Dataset size: 32568 samples
Tensor shape: (32568, 14, 190)


Step 2: Creating test dataset with training normalization...

TEST MODE: Applying training normalization parameters

Using training normalization stats:
  Mean channel: μ=1.015, std=5.113
  Max channel:  μ=2.123, std=8.314

Dataset size: 1984 samples
Tensor shape: (1984, 14, 190)


Step 3: Loading trained model...

Step 4: Making predictions...

Step 5: Denormalizing predictions...

PREDICTION SUMMARY
  Shape: (1984,)
  Range: [0.000, 172654288.607]
  Mean:  273655.766
  Median: 0.000



#### Store Predictions in the Required Format

In [12]:
# Store predictions in a ZIP.
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".
save_dir = "/workspaces/Gene-Expression-Prediction/data/output"
file_name = "gex_predicted.csv"  # PLEASE DO NOT CHANGE THIS
zip_name = "Tokar_David_Project1.zip"
save_path = f"{save_dir}/{zip_name}"
# Write a zip archive containing a single CSV named `file_name`.
compression_options = {"method": "zip", "archive_name": file_name}

# `pred` starts as None in the prediction cell; a DataFrame built from None
# would silently produce an empty column, so stop here instead.
if pred is None:
    raise RuntimeError("No predictions available: run the prediction cell first.")

submission_df = pd.DataFrame({"gene_name": test_genes, "gex_predicted": pred})

# to_csv does not create missing directories.
Path(save_dir).mkdir(parents=True, exist_ok=True)
submission_df.to_csv(save_path, index=False, compression=compression_options)
print(f"File saved to: {save_path}")
print("\nPreview of the first 5 rows of the submission file:")
print(submission_df.head())

File saved to: /workspaces/Gene-Expression-Prediction/data/output/Tokar_David_Project1.zip

Preview of the first 5 rows of the submission file:
    gene_name  gex_predicted
0       CAPN9            0.0
1        ILF2            0.0
2  ST6GALNAC5            0.0
3  MROH7-TTC4            0.0
4        AGO4            0.0
