## Notebook to convert CIRI2 output to quantified matrix

the circRNA quantifications from CIRI2 are based on the RNAB data, so use the RNAB sample info

In [1]:
!date

Mon Jan 30 17:45:16 UTC 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame

#### set notebook variables

In [3]:
# naming: the cohort and modality drive every derived file name below
cohort = 'foundin'
modality = 'CIRC'
set_name = f'{cohort}_daALL_{modality}'

# directories
# plain string: there is nothing to interpolate in the working directory path
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'

# in files
# CIRI2 circRNA count table and the RNAB sample info it is paired with
in_ciri2_file = f'{quants_dir}/circRNA_counts.tsv'
in_info_file = f'{info_dir}/{cohort}_RNAB_sample_info.csv'

# out files
quants_file = f'{quants_dir}/{set_name}.csv'
out_info_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'

# variables
# DEBUG gates the optional display() previews in the cells below
DEBUG = False
# raw sample ID -> corrected sample ID, applied when the assay IDs are split
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

### load data

In [4]:
%%time
# the CIRI2 table is whitespace delimited; the separator is a regex, so use a
# raw string to avoid Python treating '\s' as an (invalid) string escape
quants_df = (
    read_csv(in_ciri2_file, sep=r'\s+')
    # use the same 'feature' column name the following cells index on
    .rename(columns={'circRNA_ID': 'feature'})
)
print(f'shape of input {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

shape of input (67603, 234)
CPU times: user 801 ms, sys: 108 ms, total: 909 ms
Wall time: 902 ms


### transpose the quants matrix

In [5]:
# move the feature IDs into the index, then flip the matrix so that
# rows are the assay IDs and columns are the features
quants_df = quants_df.set_index('feature').T
print(f'quants shape {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

quants shape (233, 67603)


### fill any missing values with zeros

In [6]:
# any missing entry in the quants matrix becomes zero
quants_df = quants_df.fillna(0)

### simplify the RNAB ids
to be consistent with the other modalities
for these IDs the day prefix also needs to be updated from 'd' to 'da'

#### split name index to find info

In [7]:
def split_id_parts(df: DataFrame, replace_ids=None, debug=None) -> DataFrame:
    """Split the underscore-delimited assay IDs in ``df.index`` into parts.

    Every index value must split on '_' into exactly five fields, which are
    named assay, sampleid, cdi, day and version. A simplified ``assayid`` of
    the form ``{assay}_{sampleid}_{day}`` is added as a sixth column.

    Args:
        df: frame whose index holds the full assay IDs.
        replace_ids: dict of raw value -> corrected value applied to the
            split fields; defaults to the notebook-level ``replace_id_dict``.
        debug: when true, display a random sample of the result; defaults
            to the notebook-level ``DEBUG``.

    Returns:
        DataFrame with one row per entry of ``df.index``, in the same order,
        with columns assay, sampleid, cdi, day, version and assayid.

    Raises:
        ValueError: if the IDs do not split into exactly five fields.
    """
    # fall back to the notebook configuration only when not given explicitly
    if replace_ids is None:
        replace_ids = replace_id_dict
    if debug is None:
        debug = DEBUG

    id_fields = ['assay', 'sampleid', 'cdi', 'day', 'version']
    id_parts = df.index.str.split('_', expand=True).to_frame()
    if id_parts.shape[1] != len(id_fields):
        raise ValueError(
            f'expected {len(id_fields)} "_" separated ID fields '
            f'but found {id_parts.shape[1]}'
        )
    id_parts.columns = id_fields

    # fix the duplicate control sample ID
    id_parts = id_parts.replace(replace_ids)

    # for the other duplicates append the version to keep the sample IDs
    # distinct; done element-wise so it does not depend on index labels
    is_replicate = id_parts['sampleid'].str.startswith('PPMI3966B')
    id_parts['sampleid'] = id_parts['sampleid'].where(
        ~is_replicate, id_parts['sampleid'] + id_parts['version']
    )

    # update the day prefix; a couple already have 'da' so collapse those to
    # 'd' first, then expand every 'd' to 'da' (literal, not regex, matching)
    id_parts['day'] = (
        id_parts['day']
        .str.replace('da', 'd', regex=False)
        .str.replace('d', 'da', regex=False)
    )

    id_parts['assayid'] = (
        id_parts['assay'] + '_' + id_parts['sampleid'] + '_' + id_parts['day']
    )
    print(id_parts.shape)
    if debug:
        # never ask for more rows than exist
        display(id_parts.sample(min(5, len(id_parts))))
    return id_parts

In [8]:
# derive the ID parts and simplified assay IDs from the quants row labels
id_parts = split_id_parts(quants_df)
if DEBUG:
    # preview only the PPMI3966 rows, the ones the ID fixes touch
    is_ppmi3966 = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[is_ppmi3966])

(233, 6)


In [9]:
# counts are sorted descending, so a top count of 1 means every assay ID is unique
id_parts['assayid'].value_counts()

RNAB_PPMI18567_da0        1
RNAB_PPMI51971_da25       1
RNAB_PPMI51625_da25       1
RNAB_PPMI51625_da65       1
RNAB_PPMI51714_da0        1
                         ..
RNAB_PPMI3966B1v5_da25    1
RNAB_PPMI3966B1v6_da25    1
RNAB_PPMI3966B1v8_da25    1
RNAB_PPMI3966B1v9_da25    1
RNAB_PPMI90456_da65       1
Name: assayid, Length: 233, dtype: int64

#### get counts by day

In [10]:
# number of quantified samples for each day label
days = id_parts.loc[:, 'day'].value_counts()
display(days)

da25    84
da65    75
da0     74
Name: day, dtype: int64

In [11]:
# id_parts was derived from quants_df.index, so its rows are in the same order
# and the simplified IDs can be assigned positionally
quants_df.index = id_parts['assayid']
# Index.set_names returns a new Index instead of renaming in place, so calling
# it without using the result does nothing; assign the name directly instead
quants_df.index.name = 'assayid'
if DEBUG:
    display(quants_df.head())

### save formatted data

In [12]:
%%time
# write the samples x features matrix, keeping the assay IDs as the first column
quants_df.to_csv(path_or_buf=quants_file)

CPU times: user 20.8 s, sys: 811 ms, total: 21.6 s
Wall time: 21.8 s


### the RNAB info file IDs were already fixed when processing the RNAB data

check whether the RNAB assay IDs in the info file need to be fixed as well
this check is a little out of place here, but it fits

In [13]:
%%time
# the first CSV column supplies the row labels of the sample info
info_df = read_csv(filepath_or_buffer=in_info_file, index_col=0)
info_shape = info_df.shape
print(f'info shape {info_shape}')
if DEBUG:
    display(info_df.head())

info shape (305, 112)
CPU times: user 12.1 ms, sys: 14 µs, total: 12.1 ms
Wall time: 10.6 ms


#### check the corrected PPMI3966 IDs

In [14]:
# the info file IDs are split into three fields here
# NOTE: this rebinds id_parts, replacing the frame built from the quants IDs
info_id_fields = ['assay', 'sampleid','day']
split_info_index = info_df.index.str.split('_', expand=True)
id_parts = split_info_index.to_frame()
id_parts.columns = info_id_fields
if DEBUG:
    is_ppmi3966 = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[is_ppmi3966])

#### get counts by day

In [15]:
# number of info-file samples for each day label
days = id_parts.loc[:, 'day'].value_counts()
display(days)

da25    107
da0     100
da65     98
Name: day, dtype: int64

### save the info file

In [16]:
# write the RNAB sample info out unchanged under the CIRC modality file name
info_df.to_csv(path_or_buf=out_info_file)

In [17]:
!date

Mon Jan 30 17:45:39 UTC 2023
