# **Prepare SVD for multiome**

In this Jupyter notebook, the train and test datasets are stacked together and a TruncatedSVD is then computed on the combined data. This is done twice: first for the raw data, and then for the data normalized by the organizers. Only the SVD features made from the normalized data were used in the final submission.

In the Kaggle environment it is more convenient to do this in a separate notebook, as it would be a waste of both time and GPU quota to recalculate the TruncatedSVD every time before fitting the model.

In [None]:
# Importing the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import gc, pickle, scipy.sparse
from humanize import naturalsize
from sklearn.decomposition import TruncatedSVD


In [None]:
# PyTables (the `tables` package) is needed to read the *.h5 files.
# The leading "!" runs pip as a shell command from inside the notebook.
!pip install --quiet tables

In [None]:
# Root directory of the competition data set.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"


def _in_data_dir(file_name):
    """Return the path of *file_name* inside DATA_DIR."""
    return os.path.join(DATA_DIR, file_name)


# Cell metadata.
FP_CELL_METADATA = _in_data_dir("metadata.csv")

# CITE-seq inputs and targets.
FP_CITE_TRAIN_INPUTS = _in_data_dir("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _in_data_dir("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _in_data_dir("test_cite_inputs.h5")

# Multiome inputs and targets.
FP_MULTIOME_TRAIN_INPUTS = _in_data_dir("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = _in_data_dir("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = _in_data_dir("test_multi_inputs.h5")

# Submission template and evaluation ids.
FP_SUBMISSION = _in_data_dir("sample_submission.csv")
FP_EVALUATION_IDS = _in_data_dir("evaluation_ids.csv")

In [None]:
# The raw multiome train dataset is too large to be loaded into RAM at once, but it is also sparse.
# So I load the dataset in chunks and convert each chunk to a sparse matrix.
# The function below does exactly that.


def read_convert_hdf_in_chunks(link, chunk_size, sparse_matrice=None):
    """Read an HDF5 table in row chunks and return it as one sparse matrix.

    Each chunk of at most ``chunk_size`` rows is read with ``pd.read_hdf``
    and converted to CSR. All chunks are stacked vertically at the end, so
    the whole table is never held in memory in dense form.

    Parameters
    ----------
    link : str or path-like
        Location of the HDF5 file, passed straight to ``pd.read_hdf``.
    chunk_size : int
        Number of rows to read per chunk. Must be positive.
    sparse_matrice : scipy sparse matrix, optional
        Rows that were loaded earlier. The rows read from ``link`` are
        appended below them, so the column counts have to match.

    Returns
    -------
    scipy sparse matrix
        ``sparse_matrice`` (when given) followed by every row of the file.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (the read loop would never end).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    # Collect the pieces and stack them once at the end; re-stacking the
    # accumulated matrix after every chunk copies all earlier rows each time.
    # `is None` is required here: `==` on a sparse matrix is an element-wise
    # comparison, not an identity test.
    blocks = [] if sparse_matrice is None else [sparse_matrice]

    i = 0
    while True:
        df_chunk = pd.read_hdf(link, start=i, stop=i + chunk_size)
        sparse_chunk = scipy.sparse.csr_matrix(df_chunk.values)
        blocks.append(sparse_chunk)
        print(i)  # progress: first row index of the chunk just read
        i += chunk_size
        # A short (or empty) chunk means the end of the table was reached.
        if sparse_chunk.shape[0] < chunk_size:
            break

    if len(blocks) == 1:
        return blocks[0]
    return scipy.sparse.vstack(blocks)
    

In [None]:
%%time
# Loading raw data inputs.
# The absolute path is used (as for the raw test inputs) so that the load
# does not depend on the notebook's current working directory.

sparse_X = read_convert_hdf_in_chunks('/kaggle/input/open-problems-raw-counts/train_multi_inputs_raw.h5', 5000)
print(sparse_X.shape[0])
gc.collect()

In [None]:
%%time
# Read the raw test inputs the same way and append them below the train rows
# that are already in sparse_X.
sparse_X = read_convert_hdf_in_chunks(
    '/kaggle/input/open-problems-raw-counts/test_multi_inputs_raw.h5',
    chunk_size=5000,
    sparse_matrice=sparse_X,
)
print(sparse_X.shape[0])  # row count of the stacked matrix
gc.collect()

In [None]:
# Export the per-row sums of the stacked raw matrix.
# Maybe they will be useful as a feature.
total_counts = sparse_X.sum(axis=1)
counts_index = list(range(total_counts.shape[0]))
# Flatten the column of row sums into a one-dimensional sequence.
total_counts = total_counts.flat
df_total_counts = pd.DataFrame(
    data={'total_counts': total_counts},
    index=counts_index,
)
df_total_counts.to_feather('total_counts_multiome.ftr')

In [None]:
%%time
# Reduce the stacked raw matrix to 64 components with truncated SVD.

print(f"Shape of both before SVD: {sparse_X.shape}")
svd = TruncatedSVD(
    n_components=64,
    random_state=1,
)
# From here on sparse_X holds the transformed (reduced) data.
sparse_X = svd.fit_transform(sparse_X)
print(f"Shape of both after SVD:  {sparse_X.shape}")

In [None]:
# Write the raw-data SVD features to a CSV file.
df_svd = pd.DataFrame(data=sparse_X)
df_svd.to_csv('svd_raw.csv')
print('Raw data SVD ready')

In [None]:
# Drop the raw-data objects so their memory can be reclaimed.
del df_svd, sparse_X
gc.collect()

In [None]:
%%time
# The normalized data goes through the same steps, using the same function.
# First the train inputs: read in chunks and convert to a sparse matrix.

sparse_X = read_convert_hdf_in_chunks(
    FP_MULTIOME_TRAIN_INPUTS,
    chunk_size=5000,
)
print(sparse_X.shape[0])  # number of rows loaded
gc.collect()

In [None]:
%%time
# Then the normalized test inputs, appended below the train rows.

sparse_X = read_convert_hdf_in_chunks(
    FP_MULTIOME_TEST_INPUTS,
    chunk_size=5000,
    sparse_matrice=sparse_X,
)
print(sparse_X.shape[0])  # row count of the stacked matrix
gc.collect()

In [None]:
%%time
# Reduce the stacked normalized matrix with truncated SVD.
# Normalized data is more important, so more components (256) are prepared.

print(f"Shape of both before SVD: {sparse_X.shape}")
svd = TruncatedSVD(
    n_components=256,
    random_state=1,
)
# From here on sparse_X holds the transformed (reduced) data.
sparse_X = svd.fit_transform(sparse_X)
print(f"Shape of both after SVD:  {sparse_X.shape}")

In [None]:
# Write the normalized-data SVD features to a CSV file.
df_svd = pd.DataFrame(data=sparse_X)
df_svd.to_csv('svd.csv')
print('All the SVD ready')