In [1]:
import numpy as np 
import pandas as pd 
import glob
from tqdm.notebook import tqdm
import os

In [2]:
def std(x):
    return (x - np.mean(x)) / np.std(x)

In [3]:
SUBMISSIONS = {
    
    # LB: 0.812
    '/kaggle/input/keraskeras/submission_keras_keras.csv': 1.,
    
    # LB 0.812
    '/kaggle/input/kerasridge/submission_keras_ridge.csv': 1., 
    
    # LB 0.811
    '/kaggle/input/kerascat/submission_keras_cat.csv': 0.95, 
    
    # LB 0.811
    '/kaggle/input/lgbkeras/submission_lgb_keras.csv': 0.95, 
    
    # LB 0.811
    '/kaggle/input/tabnetkeras/submission_tabnet_keras.csv': 0.95
}

In [4]:
cell_ids = pd.read_parquet('../input/multimodal-single-cell-as-sparse-matrix/evaluation.parquet').cell_id

In [5]:
def gen_std_submission(path, cell_ids):
    """
    Standardize submission per cell_id
    """
    df = pd.read_csv(path)
    df['cell_id'] = cell_ids    
    vals = []
    for idx, g in tqdm(df.groupby('cell_id', sort=False), desc=f'Standardizing {path}', miniters=1000):
        vals.append(std(g.target).values)
    vals = np.concatenate(vals)
    return vals

In [6]:
def gen_ensemble(technology):
    ensemble = None
    for path in tqdm([path for path in SUBMISSIONS.keys()], desc='Process submission'):
        weight = SUBMISSIONS[path]
        if ensemble is None:
            ensemble = gen_std_submission(path, cell_ids) * weight
        else:
            ensemble += gen_std_submission(path, cell_ids) * weight
    return ensemble

In [7]:
PRED_SEGMENTS = [(0, 6812820), (6812820, 65744180)]
ensemble = []
for tech, (from_idx, to_idx) in tqdm(list(zip(['citeseq', 'multiome'], PRED_SEGMENTS)), desc='Technology'):  
    ensemble.append(gen_ensemble(tech)[from_idx: to_idx])
    
ensemble = np.concatenate(ensemble)

Technology:   0%|          | 0/2 [00:00<?, ?it/s]

Process submission:   0%|          | 0/5 [00:00<?, ?it/s]

Standardizing /kaggle/input/keraskeras/submission_keras_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/kerasridge/submission_lolo_1.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/kerascat/submission_keras_cat.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/lgbkeras/submission_lgb_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/tabnetkeras/submission_tabnet_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Process submission:   0%|          | 0/5 [00:00<?, ?it/s]

Standardizing /kaggle/input/keraskeras/submission_keras_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/kerasridge/submission_lolo_1.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/kerascat/submission_keras_cat.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/lgbkeras/submission_lgb_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing /kaggle/input/tabnetkeras/submission_tabnet_keras.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

In [8]:
df_submit = pd.read_parquet('../input/multimodal-single-cell-as-sparse-matrix/sample_submission.parquet')
df_submit['target'] = ensemble
df_submit.to_csv('submission.csv', index=False)
df_submit

Unnamed: 0,row_id,target
0,0,0.458832
1,1,-0.787458
2,2,-1.965860
3,3,-1.467523
4,4,5.404624
...,...,...
65744175,65744175,21.254747
65744176,65744176,-2.985962
65744177,65744177,-2.954488
65744178,65744178,1.239855
