# **Prepare SVD for CITE**

In this Jupyter notebook, the train and test datasets are stacked together and a TruncatedSVD is then computed on the combined data.
The data from the fix to the test set, which the organizers released in the middle of the competition, is joined in as well.

In the Kaggle environment it is more convenient to do this in a separate notebook, because recalculating the TruncatedSVD every time before fitting the model would waste both time and GPU quota.

In [None]:
# Importing the libraries

import numpy as np
import pandas as pd

import os
# List every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

import gc, pickle, scipy.sparse
from more_itertools import sliced
from sklearn.decomposition import TruncatedSVD


In [None]:
# Need this library to read *.h5 files
!pip install --quiet tables

In [None]:
# Root directory of the competition dataset; every path below is built from it.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"


def _data_path(filename):
    """Return the full path of *filename* inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


# Cell metadata.
FP_CELL_METADATA = _data_path("metadata.csv")

# CITE inputs/targets, plus the test-input fix file for day 2 / donor 27678.
FP_CITE_TRAIN_INPUTS = _data_path("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _data_path("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _data_path("test_cite_inputs.h5")
FP_CITE_TEST_INPUTS_FIX = _data_path("test_cite_inputs_day_2_donor_27678.h5")

# Multiome inputs/targets.
FP_MULTIOME_TRAIN_INPUTS = _data_path("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = _data_path("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = _data_path("test_multi_inputs.h5")

# Submission template and evaluation ids.
FP_SUBMISSION = _data_path("sample_submission.csv")
FP_EVALUATION_IDS = _data_path("evaluation_ids.csv")

In [None]:
# Read the CITE training inputs (HDF5 file) into the DataFrame `df`,
# then report that loading has completed.
df = pd.read_hdf(FP_CITE_TRAIN_INPUTS)
print("import finished")

In [None]:
%%time
# Remove constant columns, then convert the DataFrame to a CSR sparse matrix
# chunk by chunk, so that only one dense chunk is converted at a time.
# The DataFrame is deleted afterwards to free RAM.

# Columns with at most one distinct value are treated as constant.
constant_cols = df.columns[df.nunique() <= 1]
# Keep the full original column index; the test-fix step compares against it.
all_df_cols = df.columns

df = df.drop(columns=constant_cols)
print(f"Original df shape: {str(df.shape):14} {df.size*4/1024/1024/1024:2.3f} GByte")
gc.collect()

n = 2000  # chunk row size (reused when converting the test data)

# Build all chunks first and stack them once. Calling vstack inside the loop
# re-copies the whole accumulated matrix for every chunk, which is quadratic
# in the number of chunks.
sparse_chunks = [
    scipy.sparse.csr_matrix(df.iloc[start:start + n].values)
    for start in range(0, len(df), n)
]
sparse_X = scipy.sparse.vstack(sparse_chunks, format="csr")

del df, sparse_chunks
gc.collect()

print(f"sparse df shape: {str(sparse_X.shape):14} {sparse_X.size*4/1024/1024/1024:2.3f} GByte")

In [None]:
# Load the test inputs, convert them to sparse chunks and append them below
# the rows already held in sparse_X.
# The first 7476 rows are not read, because they are replaced by the data fix.
df_test = pd.read_hdf(FP_CITE_TEST_INPUTS, start=7476).drop(columns=constant_cols)
print(f"Original Xt shape: {str(df_test.shape):14} {df_test.size*4/1024/1024/1024:2.3f} GByte")
gc.collect()

# Convert chunk by chunk, then stack everything in a single vstack call.
# Stacking inside the loop would re-copy the (large) accumulated matrix for
# every chunk. This form also works when the test frame has no rows.
test_chunks = [
    scipy.sparse.csr_matrix(df_test.iloc[start:start + n].values)
    for start in range(0, len(df_test), n)
]
sparse_X = scipy.sparse.vstack([sparse_X, *test_chunks], format="csr")

del df_test, test_chunks
gc.collect()
print(f"Total sparse shape: {str(sparse_X.shape):14} {sparse_X.size*4/1024/1024/1024:2.3f} GByte")


In [None]:
# Same operation for the test data fix. It is relatively small, so no chunking
# is needed here.
# The fix file contains some additional columns that are present in neither
# the train nor the test dataset, so those are removed as well.

df_test_fix = pd.read_hdf(FP_CITE_TEST_INPUTS_FIX)

# Train columns that survived the constant-column removal, in train order.
kept_cols = all_df_cols[~all_df_cols.isin(constant_cols)]

# Columns that exist only in the fix file (order of appearance preserved).
extra_cols = df_test_fix.columns[~df_test_fix.columns.isin(all_df_cols)]
if len(extra_cols) > 0:
    # Keep the list of dropped columns up to date, as the per-column loop did.
    constant_cols = np.append(constant_cols, extra_cols)

# Selecting kept_cols drops both the constant and the extra columns, and also
# puts the remaining columns in train order so the stacked rows line up
# column-for-column. A KeyError here means the fix file lacks a train column.
df_test_fix = df_test_fix[kept_cols]

print(f"Original Xt_fix shape: {str(df_test_fix.shape):14} {df_test_fix.size*4/1024/1024/1024:2.3f} GByte")
gc.collect()
sparse_fix = scipy.sparse.csr_matrix(df_test_fix.values)
# NOTE(review): the fix rows are appended after the remaining test rows, so
# they sit at the end of the matrix, not at the positions of the 7476 skipped
# test rows. Confirm that the consumer of the output expects this order.
sparse_X = scipy.sparse.vstack([sparse_X, sparse_fix], format="csr")

del df_test_fix, sparse_fix
gc.collect()
print(f"Total sparse shape with fix: {str(sparse_X.shape):14} {sparse_X.size*4/1024/1024/1024:2.3f} GByte")

In [None]:
%%time

# Run truncated singular value decomposition on the stacked matrix.
# The transformed result is stored back into `sparse_X`, which the save step
# reads afterwards.
N_SVD_COMPONENTS = 512
print(f"Shape of both before SVD: {sparse_X.shape}")
svd = TruncatedSVD(random_state=1, n_components=N_SVD_COMPONENTS)
sparse_X = svd.fit_transform(sparse_X)
print(f"Shape of both after SVD:  {sparse_X.shape}")

In [None]:
# Write the SVD output to 'svd.csv' without the index column.
df_svd = pd.DataFrame(data=sparse_X)
df_svd.to_csv("svd.csv", index=False)