# **Main Multiome Notebook**

This is the main notebook for the multiome part of the project, where the task is to predict gene expression levels from TF-IDF-normalized chromatin accessibility data.

The target includes 23000 genes. A Kaggle notebook can hardly fit all the target values in the available memory, and it is not feasible to fit an individual model for each of the 23000 targets. So, I calculate TruncatedSVD components of the target data, predict these components, and then calculate the predicted targets by applying the reverse of the TruncatedSVD transformation. To further improve the results, I build 4 models predicting TruncatedSVD components calculated with different random seeds and then calculate the average prediction.

In this Jupyter notebook, data from several sources is joined together and then used to create predictions for the test dataset. The sources are:

* Pre-calculated Truncated SVD values from chromatin accessibility data (see Prepare_SVD_for_multiome notebook).
* Source data for three input features to be used as is.
* Metadata - the donor ID and the day each cell was analyzed; a few features are built from the metadata information.
* Target values for the train set.

In [None]:
# Importing the libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 

import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in kaggle_input_files:
    print(file_path)

import gc, pickle, scipy.sparse
from sklearn.decomposition import PCA, TruncatedSVD
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from humanize import naturalsize

In [None]:
# Need this library to read the *.h5 data
!pip install --quiet tables

In [None]:
# Root directory of the competition dataset; every path below points inside it.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"


def _competition_file(file_name):
    """Return the path of ``file_name`` inside ``DATA_DIR``."""
    return os.path.join(DATA_DIR, file_name)


# Cell metadata.
FP_CELL_METADATA = _competition_file("metadata.csv")

# CITE files.
FP_CITE_TRAIN_INPUTS = _competition_file("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _competition_file("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _competition_file("test_cite_inputs.h5")
FP_CITE_TEST_INPUTS_FIX = _competition_file("test_cite_inputs_day_2_donor_27678.h5")

# Multiome files.
FP_MULTIOME_TRAIN_INPUTS = _competition_file("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = _competition_file("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = _competition_file("test_multi_inputs.h5")

# Submission template and evaluation ids.
FP_SUBMISSION = _competition_file("sample_submission.csv")
FP_EVALUATION_IDS = _competition_file("evaluation_ids.csv")

In [None]:
# Import the specially prepared TruncatedSVD data for the train rows.
# Only 128 components are used, as cross-validation showed other components add little value to the model.
# The row and column limits are handed to the CSV reader so that the test rows and
# the unused components are never parsed, instead of loading the whole file and slicing it.
# Assumes svd.csv has at least N_SVD_COLUMNS columns (usecols raises otherwise).
N_TRAIN_CELLS = 105942  # the train rows come first in svd.csv
N_SVD_COLUMNS = 129     # the unnamed leading column + 128 SVD components
svd_x = pd.read_csv(
    '../input/raw-features-for-multiome/svd.csv',
    dtype='float32',
    nrows=N_TRAIN_CELLS,
    usecols=list(range(N_SVD_COLUMNS)),
)
svd_x = svd_x.add_prefix('svd_x_')
# The unnamed leading column of the CSV is not a feature.
del svd_x['svd_x_Unnamed: 0']
gc.collect()

In [None]:
# Read a single row of the train target table: only its column labels are kept,
# the row itself is discarded right away.
target_names = pd.read_hdf(FP_MULTIOME_TRAIN_TARGETS, start=0, stop=1).columns

gc.collect()

In [None]:
%%time
# Load the target values, which were prepared beforehand as a sparse matrix.
train_targets_path = "../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_values.sparse.npz"
train_targets = scipy.sparse.load_npz(train_targets_path)

In [None]:
def save_pca(name, model):
    """Pickle ``model`` into the file ``name``.

    Args:
        name: Path of the output file. It is opened in binary write mode,
            so an existing file is overwritten.
        model: Object to serialize with ``pickle``.
    """
    with open(name, 'wb') as out_file:
        pickle.Pickler(out_file).dump(model)

In [None]:
# For targets, the TruncatedSVD components are calculated here in the main notebook and every
# fitted model is pickled, so that it is possible to perform the reverse operation later.
# To achieve better results, the components are calculated 4 times (once per random seed);
# models are fitted for each set later and their predictions are averaged.
svd_seeds = [2, 3, 4, 5]
target_frames = []
for i in svd_seeds:
    file_name = 'pca_targets_' + str(i) + '.pkl'
    prefix = 'svd_y_' + str(i) + '_'
    pca_targets = TruncatedSVD(n_components=64, random_state=i)
    t_targets = pca_targets.fit_transform(train_targets)
    save_pca(file_name, pca_targets)
    # Column names look like 'svd_y_<seed>_<component>'.
    target_i = pd.DataFrame(t_targets, dtype='float32').add_prefix(prefix)
    target_frames.append(target_i)

# Join the per-seed components side by side in a single concat. This does not depend on
# which seed comes first and avoids re-copying the accumulated frame on every iteration.
target_total = pd.concat(target_frames, axis=1)

del t_targets, train_targets, target_i, target_frames
gc.collect()
print(target_total.shape)

In [None]:
%%time

# Load the cell metadata and keep the multiome rows that belong to the train set,
# i.e. cells that are neither from donor 27678 nor measured on day 10.
md_df = (
    pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
    .loc[lambda meta: meta['technology'] == "multiome"]
    .astype({'day': 'int8'})
    .drop(columns='technology')
    .loc[lambda meta: (meta['donor'] != 27678) & (meta['day'] != 10)]
)
print(md_df.shape)
gc.collect()

In [None]:
# Load the pre-selected important input features (train set); they are used as is.
very_imp_cols = [
    'svd_x_chr1:630875-631689',
    'svd_x_chr1:633700-634539',
    'svd_x_chr17:22520955-22521852',
]
df_imp_cols = pd.read_parquet(
    '../input/imp-features-for-multiome/train_corr_features.parquet'
).loc[:, very_imp_cols]
print(df_imp_cols.shape)

In [None]:
# Now join all the train data into a single dataframe.
# The important features are matched to the metadata by cell_id. The SVD input features
# and the SVD targets are attached by row position only.
# NOTE(review): this assumes the rows of svd_x and target_total are in the same cell order
# as the filtered metadata - confirm against the notebooks that produced them.
md_df = md_df.merge(df_imp_cols, how='left', on='cell_id')
train_parts = [md_df.reset_index(), svd_x, target_total]
part_lengths = [len(part) for part in train_parts]
if len(set(part_lengths)) != 1:
    # pd.concat(axis=1) aligns on the index, so unequal row counts would not fail
    # but silently pad the shorter parts with NaN.
    raise ValueError(f"Train parts have different numbers of rows: {part_lengths}")
df = pd.concat(train_parts, axis=1)
print(df.shape)


del md_df, svd_x, target_total, df_imp_cols, train_parts
gc.collect()

In [None]:
# Check the dataframe size: total bytes first, then the same value in a human-readable form.
# `deep` is a boolean flag; the string 'True' only worked because a non-empty string is truthy.
size = df.memory_usage(deep=True).sum()
print(size)
print(naturalsize(size))

In [None]:
# Now import the prepared TruncatedSVD data for the test dataset.
# The train rows and the unused components are skipped by the CSV reader itself,
# instead of loading the whole file and slicing it afterwards.
# Assumes svd.csv has at least N_SVD_COLUMNS columns (usecols raises otherwise).
N_TRAIN_CELLS = 105942  # the train rows come first in svd.csv; the test rows follow
N_SVD_COLUMNS = 129     # the unnamed leading column + 128 SVD components
svd_test = pd.read_csv(
    '../input/raw-features-for-multiome/svd.csv',
    dtype='float32',
    skiprows=range(1, N_TRAIN_CELLS + 1),  # line 0 is the header and must be kept
    usecols=list(range(N_SVD_COLUMNS)),
)
svd_test = svd_test.add_prefix('svd_x_')
# The unnamed leading column of the CSV is not a feature.
del svd_test['svd_x_Unnamed: 0']
# Rows are already numbered from 0 here, so no index reset is needed.
print(svd_test.shape)
gc.collect()

In [None]:
# Load the cell metadata and keep the multiome rows that belong to the test dataset,
# i.e. cells from donor 27678 or measured on day 10.
md_df = (
    pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
    .loc[lambda meta: meta['technology'] == "multiome"]
    .astype({'day': 'int8'})
    .drop(columns='technology')
    .loc[lambda meta: (meta['donor'] == 27678) | (meta['day'] == 10)]
)
print(md_df.shape)

In [None]:
# Load the pre-selected important input features (test dataset); they are used as is.
very_imp_cols = [
    'svd_x_chr1:630875-631689',
    'svd_x_chr1:633700-634539',
    'svd_x_chr17:22520955-22521852',
]
df_imp_cols = pd.read_parquet(
    '../input/imp-features-for-multiome/test_corr_features.parquet'
).loc[:, very_imp_cols]
print(df_imp_cols.shape)

In [None]:
# Now join all the test data into a single dataframe.
# The important features are matched to the metadata by cell_id. The SVD input features
# are attached by row position only.
# NOTE(review): this assumes the rows of svd_test are in the same cell order as the
# filtered metadata - confirm against the notebook that produced svd.csv.
md_df = md_df.merge(df_imp_cols, how='left', on='cell_id')
test_parts = [md_df.reset_index(), svd_test]
part_lengths = [len(part) for part in test_parts]
if len(set(part_lengths)) != 1:
    # pd.concat(axis=1) aligns on the index, so unequal row counts would not fail
    # but silently pad the shorter part with NaN.
    raise ValueError(f"Test parts have different numbers of rows: {part_lengths}")
df_test = pd.concat(test_parts, axis=1)
print(df_test.shape)
del md_df, svd_test, df_imp_cols, test_parts
gc.collect()

In [None]:
def _catboost_submit_params(learning_rate, n_estimators):
    """Build one CatBoost parameter set.

    The three sets below share everything except the learning rate and the
    number of trees, so only those two values are passed in.
    """
    return {
        "learning_rate": learning_rate,
        "eval_metric": 'RMSE',
        "max_depth": 7,
        "verbose": 100,
        "n_estimators": n_estimators,
        "task_type": 'GPU',
    }


# Highest learning rate and most trees.
cat_params_submit_fast = _catboost_submit_params(0.06, 800)
# Intermediate setting.
cat_params_submit_middle = _catboost_submit_params(0.04, 600)
# Lowest learning rate and fewest trees.
cat_params_submit_slow = _catboost_submit_params(0.03, 400)

In [None]:
# Builds the metadata features; the same function is used for the train and the test frame.
# Note: "get_dummies" cannot be used here because one of the donors is only present in the test set.
def add_metadata_features(d_frame):
    """Add donor indicator columns and a day column to ``d_frame``.

    For each of the donors 13176, 31800 and 32606 a column
    ``svd_x_donor_<id>`` is written that holds 1 where ``donor`` equals that
    id and 0 elsewhere. ``svd_x_day`` is a copy of the ``day`` column.

    Args:
        d_frame: DataFrame with ``donor`` and ``day`` columns; it is modified
            in place.

    Returns:
        The same ``d_frame`` object, with the new columns added.
    """
    donor_flag_cols = {
        donor_id: f'svd_x_donor_{donor_id}'
        for donor_id in (13176, 31800, 32606)
    }
    # Start every indicator at 0 ...
    for flag_col in donor_flag_cols.values():
        d_frame[flag_col] = 0
    # ... then switch on the rows of the matching donor.
    for donor_id, flag_col in donor_flag_cols.items():
        d_frame.loc[d_frame['donor'] == donor_id, flag_col] = 1
    d_frame['svd_x_day'] = d_frame['day']
    return d_frame

In [None]:
# Build one CatBoost model per target TruncatedSVD component and predict that component
# for the test set.
# Note that stronger parameters are used for the first TruncatedSVD components.
# For the last components fewer iterations and a smaller learning rate are used to prevent overfitting.
df = add_metadata_features(df)
df_test = add_metadata_features(df_test)
# Inputs are all 'svd_x_' columns, targets are all 'svd_y_' columns.
x_cols = [col for col in df.columns if col.startswith('svd_x_')]
y_cols = [col for col in df.columns if col.startswith('svd_y_')]
X = df[x_cols].values
Y = df[y_cols].values
Xt = df_test[x_cols].values

# Predictions are collected here and attached to df_test in one step after the loop.
# Inserting them one column at a time fragments the frame and triggers
# pandas' PerformanceWarning.
test_predictions = {}
for i, col_name in enumerate(y_cols):
    print('Training_column: ' + str(i))
    # The trailing number of the column name is the component index within its seed.
    num = int(col_name.rsplit('_', 1)[-1])
    if num < 16:
        params = cat_params_submit_fast
    elif num < 32:
        params = cat_params_submit_middle
    else:
        params = cat_params_submit_slow
    model = CatBoostRegressor(**params)
    model.fit(X, Y[:, i].copy())
    test_predictions[col_name] = model.predict(Xt)

df_test = pd.concat(
    [df_test, pd.DataFrame(test_predictions, index=df_test.index)],
    axis=1,
)
del test_predictions

In [None]:
# Free the training data and the last fitted model.
del df, X, Y, Xt, model
# Remove the input feature columns from the test frame in a single call, rather than
# deleting them one by one while iterating over the frame's own columns.
# errors='ignore' keeps the original tolerance for names that are not present.
df_test = df_test.drop(columns=x_cols, errors='ignore')
gc.collect()

In [None]:
# Save the final results: the predicted target components of the test set,
# written together with the column produced by reset_index().
predicted_components = df_test[y_cols].reset_index()
predicted_components.to_feather('multiome_multi.ftr')