In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import issparse
import gc

def zscore(x):
    """Applies z-score normalization to each row of the input matrix."""
    x_zscore = []
    for i in range(x.shape[0]):
        x_row = x[i]
        mean = np.mean(x_row)
        std = np.std(x_row)
        if std == 0:
            x_zscore.append(x_row - mean)
        else:
            x_zscore.append((x_row - mean) / std)
    return np.array(x_zscore)


# --- Step 1: Load Data Required for Submission ---
print("--- Step 1: Loading necessary IDs and test predictions ---")

# Load the official ID conversion table
try:
    evaluation_ids = pd.read_csv('evaluation_ids.csv')
except FileNotFoundError:
    print("Error: 'evaluation_ids.csv' not found. This file is required.")
    raise

# Load the test set cell IDs to ensure correct order
PATH_TEST_INP = "test_multi_inputs.h5ad"
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)
test_cell_ids = adata_test_inp.obs_names
del adata_test_inp # Free up memory
gc.collect()

# Load the target gene ID list
PATH_TRAIN_TGT = "train_multi_targets.h5ad"
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
target_ids = adata_train_tgt.var_names
del adata_train_tgt # Free up memory
gc.collect()

# Open the test set prediction files using memory-mapping mode
print("Loading test set prediction arrays...")
sub_preds_cos = np.load('sub_preds_cos_multi.npy', mmap_mode='r')
sub_preds_mse = np.load('sub_preds_mse_multi.npy', mmap_mode='r')


# --- Step 2: Process Test Set Predictions in Chunks and Create Submission File ---
print("\n--- Step 2: Processing test set predictions in chunks to create submission file ---")
N_CHUNKS = 2
chunk_size = len(test_cell_ids) // N_CHUNKS
SUBMISSION_PATH = 'D:\\submission_multi.csv'

for i in range(N_CHUNKS):
    print(f"\n--- Processing Chunk {i+1}/{N_CHUNKS} ---")
    
    # a) Determine the cells and indices for the current chunk
    start_idx = i * chunk_size
    # Ensure the last chunk includes all remaining cells
    end_idx = (i + 1) * chunk_size if i < N_CHUNKS - 1 else len(test_cell_ids)
    
    print(f"Processing cells from index {start_idx} to {end_idx-1}...")
    chunk_cell_ids = test_cell_ids[start_idx:end_idx]
    
    # b) Load only the required prediction data slice for the current chunk
    chunk_sub_preds_cos = sub_preds_cos[start_idx:end_idx]
    chunk_sub_preds_mse = sub_preds_mse[start_idx:end_idx]
    
    # c) Perform Z-Score and ensembling on the small chunk
    chunk_cos_z = zscore(chunk_sub_preds_cos)
    chunk_mse_z = zscore(chunk_sub_preds_mse)
    chunk_ensembled = chunk_cos_z * 0.55 + chunk_mse_z * 0.45
    
    # d) Create a long-format prediction table for the current chunk
    chunk_pred_df = pd.DataFrame(chunk_ensembled, index=chunk_cell_ids, columns=target_ids)
    chunk_pred_long = chunk_pred_df.reset_index().rename(columns={'index': 'cell_id'}).melt(
        id_vars='cell_id', 
        var_name='gene_id',
        value_name='target'
    )
    
    # e) Merge the chunk's predictions with the official IDs
    chunk_submission_df = evaluation_ids.merge(chunk_pred_long, on=['cell_id', 'gene_id'], how='inner')
    final_chunk = chunk_submission_df[['row_id', 'target']]
    
    # f) Write to the CSV file in parts (chunk by chunk)
    if i == 0:
        # First chunk: Write normally with a header
        print(f"Writing initial chunk to '{SUBMISSION_PATH}'...")
        final_chunk.to_csv(SUBMISSION_PATH, index=False, mode='w', header=True)
    else:
        # Subsequent chunks: Use append mode without a header
        print(f"Appending chunk to '{SUBMISSION_PATH}'...")
        final_chunk.to_csv(SUBMISSION_PATH, index=False, mode='a', header=False)

    # Clean up memory to prepare for the next chunk
    del chunk_sub_preds_cos, chunk_sub_preds_mse, chunk_cos_z, chunk_mse_z, chunk_ensembled, chunk_pred_df, chunk_pred_long, chunk_submission_df, final_chunk
    gc.collect()

print(f"\n--- Submission file '{SUBMISSION_PATH}' created successfully in chunks. ---")

# (Optional) Read the final file for a quick check
final_submission_check = pd.read_csv(SUBMISSION_PATH)
print("Final submission file head:")
print(final_submission_check.head())
print("Final submission file tail:")
print(final_submission_check.tail())

--- Step 1: Loading necessary IDs and test predictions ---
Loading test set prediction arrays...

--- Step 2: Processing test set predictions in chunks to create submission file ---

--- Processing Chunk 1/2 ---
Processing cells from index 0 to 27966...
Writing initial chunk to 'D:\submission_multi.csv'...

--- Processing Chunk 2/2 ---
Processing cells from index 27967 to 55934...
Appending chunk to 'D:\submission_multi.csv'...

--- Submission file 'D:\submission_multi.csv' created successfully in chunks. ---
Final submission file head:
    row_id    target
0  6812820 -0.680910
1  6812821  7.534191
2  6812822  0.041575
3  6812823  0.444929
4  6812824 -0.676568
Final submission file tail:
            row_id    target
58931355  65740663 -0.605605
58931356  65740664  6.464191
58931357  65740665  0.767095
58931358  65740666 -0.679870
58931359  65740667 -0.047109


In [None]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import issparse
import gc

def zscore(x):
    """Applies z-score normalization to each row of the input matrix."""
    x_zscore = []
    for i in range(x.shape[0]):
        x_row = x[i]
        mean = np.mean(x_row)
        std = np.std(x_row)
        if std == 0:
            x_zscore.append(x_row - mean)
        else:
            x_zscore.append((x_row - mean) / std)
    return np.array(x_zscore)


# --- Step 1: Load Data Required for Submission ---
print("--- Step 1: Loading necessary IDs and test predictions ---")

# Load the official ID conversion table
try:
    evaluation_ids = pd.read_csv('evaluation_ids.csv')
except FileNotFoundError:
    print("Error: 'evaluation_ids.csv' not found. This file is required.")
    raise

# Load the test set cell IDs to ensure correct order
PATH_TEST_INP = "test_cite_inputs.h5ad"
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)
test_cell_ids = adata_test_inp.obs_names
del adata_test_inp # Free up memory
gc.collect()

# Load the target gene ID list
PATH_TRAIN_TGT = "train_cite_targets.h5ad"
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
target_ids = adata_train_tgt.var_names
del adata_train_tgt # Free up memory
gc.collect()

# Open the test set prediction files using memory-mapping mode
print("Loading test set prediction arrays...")
sub_preds_cos = np.load('sub_preds_cos_cite.npy', mmap_mode='r')


# --- Step 2: Process Test Set Predictions in Chunks and Create Submission File ---
print("\n--- Step 2: Processing test set predictions in chunks to create submission file ---")
N_CHUNKS = 2
chunk_size = len(test_cell_ids) // N_CHUNKS
SUBMISSION_PATH = 'D:\\submission_cite.csv'

for i in range(N_CHUNKS):
    print(f"\n--- Processing Chunk {i+1}/{N_CHUNKS} ---")
    
    # a) Determine the cells and indices for the current chunk
    start_idx = i * chunk_size
    # Ensure the last chunk includes all remaining cells
    end_idx = (i + 1) * chunk_size if i < N_CHUNKS - 1 else len(test_cell_ids)
    
    print(f"Processing cells from index {start_idx} to {end_idx-1}...")
    chunk_cell_ids = test_cell_ids[start_idx:end_idx]
    
    # b) Load only the required prediction data slice for the current chunk
    chunk_sub_preds_cos = sub_preds_cos[start_idx:end_idx]
    #chunk_sub_preds_mse = sub_preds_mse[start_idx:end_idx]
    
    # c) Perform Z-Score and ensembling on the small chunk
    chunk_cos_z = zscore(chunk_sub_preds_cos)
    #chunk_mse_z = zscore(chunk_sub_preds_mse)
    chunk_ensembled = chunk_cos_z# * 0.55 + chunk_mse_z * 0.45
    
    # d) Create a long-format prediction table for the current chunk
    chunk_pred_df = pd.DataFrame(chunk_ensembled, index=chunk_cell_ids, columns=target_ids)
    chunk_pred_long = chunk_pred_df.reset_index().rename(columns={'index': 'cell_id'}).melt(
        id_vars='cell_id', 
        var_name='gene_id',
        value_name='target'
    )
    
    # e) Merge the chunk's predictions with the official IDs
    chunk_submission_df = evaluation_ids.merge(chunk_pred_long, on=['cell_id', 'gene_id'], how='inner')
    final_chunk = chunk_submission_df[['row_id', 'target']]
    
    # f) Write to the CSV file in parts (chunk by chunk)
    if i == 0:
        # First chunk: Write normally with a header
        print(f"Writing initial chunk to '{SUBMISSION_PATH}'...")
        final_chunk.to_csv(SUBMISSION_PATH, index=False, mode='w', header=True)
    else:
        # Subsequent chunks: Use append mode without a header
        print(f"Appending chunk to '{SUBMISSION_PATH}'...")
        final_chunk.to_csv(SUBMISSION_PATH, index=False, mode='a', header=False)

    # Clean up memory to prepare for the next chunk
    #del chunk_sub_preds_cos, chunk_sub_preds_mse, chunk_cos_z, chunk_mse_z, chunk_ensembled, chunk_pred_df, chunk_pred_long, chunk_submission_df, final_chunk
    gc.collect()

print(f"\n--- Submission file '{SUBMISSION_PATH}' created successfully in chunks. ---")

# (Optional) Read the final file for a quick check
final_submission_check = pd.read_csv(SUBMISSION_PATH)
print("Final submission file head:")
print(final_submission_check.head())
print("Final submission file tail:")
print(final_submission_check.tail())

--- Step 1: Loading necessary IDs and test predictions ---
Loading test set prediction arrays...

--- Step 2: Processing test set predictions in chunks to create submission file ---

--- Processing Chunk 1/2 ---
Processing cells from index 0 to 24330...
Writing initial chunk to 'D:\submission_cite.csv'...


NameError: name 'chunk_sub_preds_mse' is not defined

In [None]:
import pandas as pd
import os

# --- File Paths ---
# Input files generated by your CITEseq and Multiome notebooks
CITE_SUBMISSION_PATH = 'submission_cite.csv'
MULTI_SUBMISSION_PATH = 'submission_multi.csv'

# Final output file
FINAL_SUBMISSION_PATH = 'D:\\submission.csv'

print("--- Starting final submission file assembly ---")

# --- Step 1: Check if input files exist ---
if not os.path.exists(CITE_SUBMISSION_PATH):
    raise FileNotFoundError(f"Required input file not found: {CITE_SUBMISSION_PATH}")
if not os.path.exists(MULTI_SUBMISSION_PATH):
    raise FileNotFoundError(f"Required input file not found: {MULTI_SUBMISSION_PATH}")

# --- Step 2: Load the two separate submission files ---
print(f"Loading CITEseq predictions from '{CITE_SUBMISSION_PATH}'...")
df_cite = pd.read_csv(CITE_SUBMISSION_PATH)

print(f"Loading Multiome predictions from '{MULTI_SUBMISSION_PATH}'...")
df_multi = pd.read_csv(MULTI_SUBMISSION_PATH)

print(f"Loaded {len(df_cite)} rows from CITEseq and {len(df_multi)} rows from Multiome.")

# --- Step 3: Concatenate the two DataFrames ---
# ignore_index=True creates a new, clean index for the combined DataFrame.
print("Concatenating the two files...")
final_submission = pd.concat([df_cite, df_multi], ignore_index=True)

# --- Step 4: Sanity Checks  ---
print(f"Total rows in the final submission: {len(final_submission)}")

# Check for any duplicate row_ids, which would indicate an error.
num_duplicates = final_submission['row_id'].duplicated().sum()
if num_duplicates > 0:
    print(f"WARNING: Found {num_duplicates} duplicate row_ids in the combined file! This should be investigated.")
else:
    print("Sanity check passed: No duplicate row_ids found.")

# --- Step 5: Save the final combined file ---
final_submission.to_csv(FINAL_SUBMISSION_PATH, index=False)

print(f"\nSUCCESS: Final '{FINAL_SUBMISSION_PATH}' created successfully.")
print("Final submission file head:")
print(final_submission.head())
print("\nFinal submission file tail:")
print(final_submission.tail())

--- Starting final submission file assembly ---
Loading CITEseq predictions from 'submission_cite.csv'...
Loading Multiome predictions from 'submission_multi.csv'...
Loaded 65744180 rows from CITEseq and 58931360 rows from Multiome.
Concatenating the two files...
Total rows in the final submission: 124675540


KeyboardInterrupt: 