# Data Preprocessing

In [6]:
import numpy as np
from scipy.io import mmread, mmwrite
from scipy import sparse
import os
import pandas as pd

# Directory (relative to the working directory) that holds the input files
# read by this notebook and receives the normalized matrix it writes.
data_path = '../data/'

In [None]:
# Run only once
# !unzip '../data/COVID-Data.zip' -d '../data'

### Normalize data  
Per instructions given to us by the data source provider, the data was normalized as follows: 
1. Remove rows that sum to 0. 
2. Take each column and divide each entry by the column sum. 
3. Multiply the matrix by 10000. 
4. Take $\log(1+x)$  
This is a RAM-intensive operation, so variables were deleted after use. 

In [7]:
# Normalize the raw matrix: drop rows that sum to 0, divide every entry by
# its column sum, scale by 10^4 and take log(1 + x). Large intermediates are
# deleted as soon as they are no longer needed to limit peak memory.
mtx = mmread(os.path.join(data_path, 'matrix.mtx'))

# Compressed Sparse Row: row sums and row selection are the cheap operations
# in this layout, so the row filter runs here and no second format
# conversion is needed before the column scaling.
csr_mtx = mtx.tocsr()
# Save memory
del mtx

row_sums = np.asarray(csr_mtx.sum(axis=1)).ravel()
# Keep only the rows whose sum is not 0 (original row order is preserved)
csr_mtx = csr_mtx[np.flatnonzero(row_sums != 0)]
del row_sums

column_sums = csr_mtx.sum(axis=0)
# We don't want to divide by 0, so those divisors are set to 1. Assuming the
# entries are non-negative, a zero column sum means the column is all zeros,
# so the divided values are 0 either way.
column_sums[column_sums == 0] = 1
# Broadcasts the (1 x n_columns) reciprocal row across every row.
normal = csr_mtx.multiply(1 / column_sums)
del column_sums
del csr_mtx

normalized_data = normal.multiply(10 ** 4).log1p()
del normal

# Write to file
mmwrite(os.path.join(data_path, 'normalized_data.mtx'), normalized_data)

### Metadata

In [8]:
# We can ignore meta.csv because it is encompassed in meta_celltypes.csv.
# Load the metadata table from the data directory into a DataFrame.
df_meta = pd.read_csv(os.path.join(data_path, 'meta_celltypes.csv'))

In [6]:
# Remove the two metadata columns that the cells below never reference.
# NOTE(review): 'Unnamed: 0' is presumably an index column saved into the
# CSV -- confirm against the file.
df_meta = df_meta.drop(['sample_number', 'Unnamed: 0'], axis=1)

### Read from sparse file

In [110]:
# Reload the normalized matrix written by the normalization cell (mmread
# returns it in COO format) and wrap it in a DataFrame with sparse columns.
mat = mmread(os.path.join(data_path, 'normalized_data.mtx'))
df = pd.DataFrame.sparse.from_spmatrix(mat)

<24557x47174 sparse matrix of type '<class 'numpy.float64'>'
	with 113466611 stored elements in COOrdinate format>

In [7]:
# Tidy layout: one row per cell, one column per gene.
df_transposed = df.transpose()

# Attach the annotations taken from the metadata table; pandas aligns each
# assigned Series with the frame on the index.
df_transposed['cell_type'] = df_meta['cell_type'].copy()
df_transposed['treatment'] = df_meta['sample_id'].copy()
df_transposed['cell_barcode'] = df_meta['cell_barcode'].copy()
df_transposed.index.name = "cells"

# Discard every cell whose treatment is recorded as "unknown".
# NOTE(review): an earlier comment here also mentioned dropping NA cell
# types ("the ones with no_max"); only the treatment filter is applied.
df_transposed = df_transposed[df_transposed["treatment"].ne("unknown")]

In [197]:
# Let us only consider cells that are treated with the drugs.
# NOTE(review): `drugs` is not defined in the cells shown here; it is assumed
# to be a list-like of treatment names -- confirm. isin() performs the same
# membership test as `x in drugs`, but in one vectorized pass instead of a
# Python-level call per row.
df_transposed = df_transposed[df_transposed['treatment'].isin(drugs)]

# Map the '.1'-suffixed treatment labels onto their unsuffixed names so both
# spellings end up under a single label.
df_transposed['treatment'] = df_transposed['treatment'].replace({
    'CYCLOSPORINE_ALL CYTO.1': 'CYCLOSPORINE_ALL CYTO',
    'ALPROSTADIL_ALL CYTO.1': 'ALPROSTADIL_ALL CYTO',
    'DASATINIB_ALL CYTO.1': 'DASATINIB_ALL CYTO',
    'ALL CYTO.1': 'ALL CYTO',
})


In [200]:
# Save the filtered table as CSV. The path is relative, so the file is
# written to the current working directory rather than to data_path.
df_transposed.to_csv('covid_four_drugs.csv')