+ I used `TruncatedSVD(n_components=128, random_state=42)` both on train and test data.
+ CatBoostRegressor is used. To save time, only trained on fold_0. Maybe more folds will help to improve the score.

In [1]:
import os, gc, pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style
from matplotlib.ticker import MaxNLocator

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

import scipy
import scipy.sparse

import gc
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    Pearson's correlation coefficient is computed between each row of
    `y_true` and the row at the same position in `y_pred`; the score is
    the plain average of those per-row coefficients.

    It is assumed that the predictions are not constant.

    Parameters
    ----------
    y_true, y_pred : pd.DataFrame or array with a `.shape`
        Two-dimensional data with one row per sample. DataFrames
        (including subclasses) are replaced by their `.values` so that
        rows are always addressed by position.

    Returns
    -------
    The average of each sample's Pearson correlation coefficient.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # isinstance instead of an exact type comparison: a DataFrame subclass
    # would otherwise stay a frame and `y_true[i]` below would select the
    # column labelled i instead of row i.
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    n_rows = len(y_true)
    # [1, 0] picks the cross-correlation entry of the 2x2 matrix from np.corrcoef
    row_corrs = (np.corrcoef(y_true[i], y_pred[i])[1, 0] for i in range(n_rows))
    return sum(row_corrs) / n_rows


# Preprocessing and cross-validation

We first load all of the training input data for Multiome. It should take less than a minute.

In [3]:
%%time
# Load the Multiome training inputs from the pre-converted sparse .npz file.
train_inputs = scipy.sparse.load_npz("../input/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_values.sparse.npz")

CPU times: user 35.7 s, sys: 3.47 s, total: 39.2 s
Wall time: 58.3 s


In [4]:
# Downcast the stored values to float16 (presumably to save memory — confirm the
# precision loss is acceptable); copy=False avoids a copy when none is needed.
train_inputs = train_inputs.astype('float16', copy=False)

##  TruncatedSVD

In [5]:
%%time
# Reduce the inputs to 128 components. Despite its name, `pca` is a TruncatedSVD;
# the same fitted object is reused later to transform the test inputs.
pca = TruncatedSVD(n_components=128, random_state=42)
# `train_inputs` is overwritten: from here on it is the reduced 128-column output.
train_inputs = pca.fit_transform(train_inputs)
# Fraction of variance kept by the 128 components
print(pca.explained_variance_ratio_.sum())

0.010997659
CPU times: user 22min, sys: 20.7 s, total: 22min 21s
Wall time: 22min 1s


In [6]:
%%time
# Load the Multiome training targets (sparse); kept in full for validation scoring.
train_targets = scipy.sparse.load_npz("../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_values.sparse.npz")

CPU times: user 15.8 s, sys: 1.27 s, total: 17.1 s
Wall time: 23.8 s


In [7]:
%%time
# Reduce the targets to 128 components so the regressor predicts 128 outputs.
# Careful with the names: `train_target` (singular) is the reduced matrix used for
# fitting, `train_targets` (plural) stays the original sparse matrix.
pca2 = TruncatedSVD(n_components=128, random_state=42)
train_target = pca2.fit_transform(train_targets)
# Fraction of target variance kept by the 128 components
print(pca2.explained_variance_ratio_.sum())

0.11779101
CPU times: user 5min 21s, sys: 5.03 s, total: 5min 26s
Wall time: 5min 18s


In [8]:
def save(name, model):
    """Pickle `model` into the file `name`, overwriting any existing file."""
    with open(name, 'wb') as handle:
        pickle.Pickler(handle).dump(model)

In [9]:
# Persist both fitted reducers: `pca` (inputs) and `pca2` (targets).
save('pca.pkl', pca)
save('pca2.pkl', pca2)

In [10]:
from catboost import CatBoostRegressor
# Hyper-parameters of the single multi-output regressor: MultiRMSE lets one
# model predict all reduced target components at once.
params = {'learning_rate': 0.1, 
          'depth': 7, 
          'l2_leaf_reg': 4, 
          'loss_function': 'MultiRMSE', 
          'eval_metric': 'MultiRMSE', 
          'task_type': 'CPU', 
          'iterations': 200,
          # NOTE(review): fit() is called without an eval_set in the training cell,
          # so the overfitting detector presumably never triggers — confirm.
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bayesian', 
          'allow_const_label': True, 
          'random_state': 1
         }
# One estimator object, shared by every fold of the training loop.
model = CatBoostRegressor(**params)

In [11]:
# Number of row chunks the training rows and the test rows are split into
# (1 = process everything in a single chunk).
n = 1

In [12]:
# Shuffle the row order once (seeded) before cutting it into `n` chunks.
np.random.seed(42)
all_row_indices = np.arange(train_inputs.shape[0])
np.random.shuffle(all_row_indices)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

index = 0   # number of fold models saved so far; the prediction cells read it
score = []  # validation correlation of every fold

d = train_inputs.shape[0]//n  # rows per chunk; remainder rows (if any) are not used
for i in range(0, n*d, d):
    print(f'start [{i}:{i+d}]')
    ind = all_row_indices[i:i+d]
    for idx_tr, idx_va in kf.split(ind):
        # kf.split yields positions inside `ind`; map them back to real row ids
        # and slice each matrix once. This selects exactly the same rows as
        # `matrix[ind][idx]` without rebuilding the whole shuffled chunk per fold.
        rows_tr, rows_va = ind[idx_tr], ind[idx_va]

        Xtr, Ytr = train_inputs[rows_tr], train_target[rows_tr]
        print('Train...')
        # The same estimator object is fitted again for every fold.
        model.fit(Xtr, Ytr)
        del Xtr, Ytr
        gc.collect()

        # Score in the original target space: project the reduced predictions
        # back through the SVD components and compare with the
        # un-reduced sparse targets.
        Xva = train_inputs[rows_va]
        Yva = train_targets[rows_va]
        s = correlation_score(Yva.todense(), model.predict(Xva)@pca2.components_)
        score.append(s)
        print(index, s)
        del Xva, Yva
        gc.collect()

        # Save every fold model; prediction later averages model00..model{index-1}.
        pkl_filename = f"model{index:02d}.pkl"
        index += 1
        with open(pkl_filename, 'wb') as file:
            pickle.dump(model, file)
    gc.collect()

if score:
    print(f'mean validation correlation over {len(score)} folds: {np.mean(score):.5f}')

start [0:105942]
Train...
0:	learn: 74.4837457	total: 21.9s	remaining: 1h 12m 42s
1:	learn: 72.3356217	total: 40.7s	remaining: 1h 7m 12s
2:	learn: 70.4881701	total: 59.9s	remaining: 1h 5m 33s
3:	learn: 68.7941270	total: 1m 18s	remaining: 1h 4m 30s
4:	learn: 67.3404626	total: 1m 38s	remaining: 1h 3m 49s
5:	learn: 65.9576768	total: 1m 57s	remaining: 1h 3m 10s
6:	learn: 64.7607744	total: 2m 15s	remaining: 1h 2m 28s
7:	learn: 63.7088748	total: 2m 35s	remaining: 1h 2m 6s
8:	learn: 62.7753348	total: 2m 54s	remaining: 1h 1m 35s
9:	learn: 61.9206851	total: 3m 13s	remaining: 1h 1m 16s
10:	learn: 61.2032398	total: 3m 32s	remaining: 1h 54s
11:	learn: 60.5609569	total: 3m 51s	remaining: 1h 25s
12:	learn: 59.9598257	total: 4m 12s	remaining: 1h 27s
13:	learn: 59.4175905	total: 4m 30s	remaining: 59m 58s
14:	learn: 58.9458298	total: 4m 50s	remaining: 59m 39s
15:	learn: 58.5225515	total: 5m 9s	remaining: 59m 14s
16:	learn: 58.0944311	total: 5m 27s	remaining: 58m 49s
17:	learn: 57.7122787	total: 5m 47s	

In [13]:
# Training data is no longer needed; release it before loading the test set.
del train_target, train_inputs, train_targets
gc.collect()

21

# Predicting

In [14]:
%%time
# Load the Multiome test inputs and project them with the reducer fitted on train.
# NOTE(review): the training inputs were cast to float16 before the SVD was fitted,
# the test inputs are not — confirm this train/test difference is intended.
multi_test_x = scipy.sparse.load_npz("../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_values.sparse.npz")
multi_test_x = pca.transform(multi_test_x)

CPU times: user 1min 35s, sys: 6.54 s, total: 1min 42s
Wall time: 1min 57s


In [16]:
test_len = multi_test_x.shape[0]
# Chunk size by ceiling division so the chunks cover every test row. With floor
# division the last `test_len % n` rows were never put into a chunk when n > 1,
# and their predictions stayed at zero.
d = -(-test_len//n)
# At most n consecutive, non-empty chunks; chunk i holds rows [i*d, i*d+d).
x = [multi_test_x[start:start+d] for start in range(0, test_len, max(d, 1))]
del multi_test_x
gc.collect()

103

In [17]:
# Number of fold models written by the training loop (model00.pkl, model01.pkl, ...)
index

5

In [18]:
# Average the fold models' predictions and map them back to the original targets.
# The back-projection `@ pca2.components_` is linear, so the reduced predictions
# are averaged first and projected once per chunk. This replaces one full-size
# matmul per fold and avoids accumulating the running mean in float16.
components = pca2.components_                      # (n_components, n_targets)
preds = np.zeros((test_len, components.shape[1]), dtype='float16')
for i,xx in enumerate(x):
    reduced_sum = np.zeros((xx.shape[0], components.shape[0]))
    for ind in range(index):
        print(ind, end=' ')
        # These pickles were written by the training loop of this notebook;
        # never unpickle files from an untrusted source.
        with open(f'model{ind:02}.pkl', 'rb') as file:
            model = pickle.load(file)
        reduced_sum += model.predict(xx)
        gc.collect()
    print('')
    if index:  # with no saved models the predictions simply stay at zero
        reduced_mean = (reduced_sum / index).astype(components.dtype, copy=False)
        # Rows i*d .. i*d+d of `preds` belong to chunk i (same layout as `x`).
        preds[i*d:i*d+d,:] = reduced_mean @ components
    del xx
gc.collect()

0 1 2 3 4 


0

In [19]:
# The test chunks are no longer needed once `preds` is filled.
del x
gc.collect()

21

In [20]:
# Keep a raw copy of the full prediction matrix next to the submission.
np.save('preds.npy', preds)

# Creating submission

We load the cells that will have to appear in submission.

In [21]:
%%time
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("../input/multimodal-single-cell-as-sparse-matrix/evaluation.parquet")
# Convert the string columns to more efficient categorical types
# (both id columns repeat heavily across the evaluation rows)
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

CPU times: user 32.3 s, sys: 10.9 s, total: 43.2 s
Wall time: 37.4 s


In [22]:
# Prepare an empty series which will be filled with predictions.
# It is indexed by every column of `eval_ids` and starts out as all-NaN float32.
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

We load the `index`  and `columns` of the original dataframe, as we need them to make the submission.

In [23]:
%%time
# Column labels of the training targets: the column order of `preds`.
# allow_pickle=True presumably because the id arrays are stored as Python objects;
# only use it on trusted files.
y_columns = np.load("../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

# Row labels of the test inputs: the row order of `preds`.
test_index = np.load("../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]

CPU times: user 37.6 ms, sys: 7.95 ms, total: 45.5 ms
Wall time: 101 ms


We assign the predicted values to the correct row in the submission file.

In [24]:
# Lookup tables from id to position: cell id -> row of `preds`, gene id -> column
# of `preds`. The asserts verify that no id occurs twice.
cell_dict = {cell: pos for pos, cell in enumerate(test_index)}
assert len(test_index) == len(cell_dict)

gene_dict = {gene: pos for pos, gene in enumerate(y_columns)}
assert len(y_columns) == len(gene_dict)

In [25]:
# Translate every evaluation row's ids into positions in `preds`;
# -1 marks an id that is not present in the Multiome lookup tables.
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

# Rows for which both the cell and the gene were found, i.e. rows this notebook predicts.
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

In [26]:
# Fancy-index `preds` with paired (row, column) positions: one predicted value
# per valid evaluation row, written to the same positions of `submission`.
submission.iloc[valid_multi_rows] = preds[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]

In [27]:
# Everything needed for the id lookup can go; only `submission` and `preds` remain.
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()

134

In [28]:
# Rows that were not matched above are still NaN at this point.
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86                    NaN
1         c2150f55becb  CD274                   NaN
2         c2150f55becb  CD270                   NaN
3         c2150f55becb  CD155                   NaN
4         c2150f55becb  CD112                   NaN
                                             ...   
65744175  2c53aa67933d  ENSG00000134419    6.195312
65744176  2c53aa67933d  ENSG00000186862    0.032776
65744177  2c53aa67933d  ENSG00000170959    0.044586
65744178  2c53aa67933d  ENSG00000107874    1.187500
65744179  2c53aa67933d  ENSG00000166012    5.132812
Name: target, Length: 65744180, dtype: float32

# Merging with CITEseq predictions

In [29]:
# Replace the MultiIndex by a plain positional index called `row_id`,
# which is the index the CITEseq submission is keyed on.
submission = submission.reset_index(drop=True).rename_axis('row_id')

In [30]:
# Predictions of the CITEseq notebook as a Series of `target` values keyed by row_id.
cite_submission = (
    pd.read_csv("../input/msci-citeseq-keras-quickstart/submission.csv")
    .set_index("row_id")["target"]
)

In [31]:
# Every row this notebook left empty takes its value from the CITEseq submission.
missing = submission.isnull()
submission[missing] = cite_submission[missing]

In [32]:
# Merged result: Multiome predictions plus the values copied from the CITEseq file.
submission

row_id
0           0.094605
1          -0.162362
2          -0.405332
3          -0.302582
4           1.114355
              ...   
65744175    6.195312
65744176    0.032776
65744177    0.044586
65744178    1.187500
65744179    5.132812
Name: target, Length: 65744180, dtype: float32

In [34]:
# Sanity check before writing the file: this must display False (no row left unfilled).
submission.isnull().any()

False

In [35]:
# Writes the index (row_id) and the `target` column.
submission.to_csv("submission.csv")

In [36]:
# Peek at the first lines of the written file to check the header and format.
!head submission.csv

row_id,target
0,0.09460453
1,-0.16236241
2,-0.40533188
3,-0.30258211
4,1.1143554
5,2.5725958
6,0.24209185
7,-0.799001
8,-0.676689
