## Notebook to convert DaPars2 output to a quantified matrix and feature annotation

The Loci column holds the features; their names are position-based.

In [None]:
# shell escape: print the current date/time at the top of the notebook output
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame
from dask.dataframe import read_csv as dd_read_csv

#### set notebook variables

In [None]:
# naming: set_name is used for the quantified matrix file name
cohort = 'foundin'
modality = 'PDUI'
set_name = f'{cohort}_daALL_{modality}'

# directories (wrk_dir has no placeholders, so it is a plain string)
wrk_dir = '/home/gibbsr/working/foundin/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'

# in files
if modality == 'PDUI':
    dapars2_file = f'{quants_dir}/DaPars2_result.all_chromosomes.txt'
    info_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
else:
    # fail here with a clear message instead of a NameError when the
    # undefined input file names are first used further down
    raise ValueError(f'no input files defined for modality {modality!r}')

# out files
quants_file = f'{quants_dir}/{set_name}.csv'
features_file = f'{quants_dir}/{cohort}_{modality}.features.csv'

# variables
# DEBUG toggles the display() previews in the cells below
DEBUG = False
# sample ID values to rewrite when the assay IDs are simplified
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

### load data

In [None]:
%%time
# read the tab-separated DaPars2 result with dask's read_csv, then compute()
# the lazy dask frame so the rest of the notebook works on the result in memory
quants_dd = dd_read_csv(dapars2_file, sep='\t')
quants_df = quants_dd.compute()
print(f'shape of input {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### drop duplicate rows based on Loci (feature)

In [None]:
# a Loci value identifies a feature, so keep a single row for each one
quants_df = quants_df.drop_duplicates(subset=['Loci'])
print(f'shape of non-duplicate input {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### split off and reformat feature information

In [None]:
# columns that annotate a feature; they are copied out here and removed from
# the quantified matrix further down
annot_cols = ['Gene', 'fit_value', 'Predicted_Proximal_APA']
feature_cols = ['Loci', *annot_cols]
features_df = quants_df.loc[:, feature_cols].copy()
print(f'features shape {features_df.shape}')
print(f'quants shape {quants_df.shape}')
if DEBUG:
    for frame in (features_df, quants_df):
        display(frame.head())

#### extract gene name as feature column

In [None]:
# Gene is '|'-delimited: field 1 is kept as the gene name, field 2 as the chromosome
gene_fields = features_df['Gene'].str.split('|', expand=True)
features_df['Gene'] = gene_fields[1]
features_df['chrom'] = gene_fields[2]
# Loci: the text after ':' is '<begin>-<end>'; both are kept as parsed (strings)
loci_range = features_df['Loci'].str.split(':', expand=True)[1]
loci_bounds = loci_range.str.split('-', expand=True)
features_df['start'] = loci_bounds[0]
features_df['stop'] = loci_bounds[1]
# index the annotation table by feature name
features_df = features_df.set_index('Loci')
print(f'features shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

### transpose the quants matrix

In [None]:
# remove the annotation columns, index by feature, then flip the table so the
# remaining (non-annotation) columns become rows and the Loci become columns
quants_df = (
    quants_df
    .drop(columns=annot_cols)
    .set_index('Loci')
    .transpose()
)
print(f'quants shape {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### fill any missing values with zeros

In [None]:
# missing quantifications are stored as 0
quants_df = quants_df.fillna(0)

### simplify the RNAB ids
to be consistent with other modalities

#### split name index to find info

In [None]:
def split_id_parts(df: DataFrame, replace_ids=None,
                   dup_prefix: str = 'PPMI3966B', debug=None) -> DataFrame:
    """Split the '_'-delimited assay IDs in the index of ``df`` into parts.

    Every index label must split into five fields, in order: assay, sampleid,
    cdi, day and version. A simplified ``assayid`` column of the form
    ``<assay>_<sampleid>_<day>`` is added to the result.

    Args:
        df: frame whose index holds the full assay IDs; only the index is read.
        replace_ids: mapping of old value -> new value applied to the split
            parts, used to collapse the duplicated control sample ID.
            Defaults to the notebook-level ``replace_id_dict``.
        dup_prefix: sample IDs that still start with this prefix after the
            replacement get their version appended so they remain distinct.
        debug: when true, display a random sample of the result. Defaults to
            the notebook-level ``DEBUG``.

    Returns:
        DataFrame with columns assay, sampleid, cdi, day, version and assayid,
        one row per input label in input order. Its index is the one produced
        by splitting the labels (the original, unmodified parts).

    Raises:
        ValueError: if the labels do not split into exactly five parts.
    """
    # fall back to the notebook settings only when the caller gave nothing
    if replace_ids is None:
        replace_ids = replace_id_dict
    if debug is None:
        debug = DEBUG

    part_names = ['assay', 'sampleid', 'cdi', 'day', 'version']
    id_parts = df.index.str.split('_', expand=True).to_frame()
    if id_parts.shape[1] != len(part_names):
        raise ValueError(
            f'expected {len(part_names)} "_"-separated ID parts '
            f'({", ".join(part_names)}) but found {id_parts.shape[1]}'
        )
    id_parts.columns = part_names

    # fix the duplicate control sample ID
    if replace_ids:
        id_parts = id_parts.replace(replace_ids)

    # for the other duplicates add version; a boolean mask is used instead of
    # a label lookup so rows are matched by position even if parts repeat
    is_dup = id_parts['sampleid'].str.startswith(dup_prefix)
    id_parts.loc[is_dup, 'sampleid'] = (
        id_parts.loc[is_dup, 'sampleid'] + id_parts.loc[is_dup, 'version']
    )

    id_parts['assayid'] = (
        id_parts['assay'] + '_' + id_parts['sampleid'] + '_' + id_parts['day']
    )
    print(id_parts.shape)
    if debug:
        # sample() raises when asked for more rows than exist
        display(id_parts.sample(min(5, len(id_parts))))
    return id_parts

In [None]:
# derive the simplified assay IDs from the row labels of the quantified matrix
id_parts = split_id_parts(quants_df)
if DEBUG:
    # show the rows whose sample ID starts with 'PPMI3966'
    ppmi3966_rows = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[ppmi3966_rows])

In [None]:
# occurrences of each simplified assay ID; a count above 1 means two rows
# would end up with the same ID
id_parts['assayid'].value_counts()

In [None]:
# relabel the rows with the simplified assay IDs; id_parts was built from this
# frame's index, so the rows are in the same order
quants_df.index = id_parts['assayid']
# Index.set_names returns a new Index rather than renaming in place, so the
# result has to be assigned back for the name to be set
quants_df.index = quants_df.index.set_names('assayid')
if DEBUG:
    display(quants_df.head())

### save formatted data

In [None]:
%%time
# write the quantified matrix (rows relabelled by assay ID, one column per
# Loci) and the per-feature annotation table as CSV files
quants_df.to_csv(quants_file)
features_df.to_csv(features_file)

### fix the RNAB assay IDs in the info file as well
Doing this here is a little out of place, but it fits because the same ID simplification is applied.

In [None]:
%%time
info_df = read_csv(info_file, index_col=0)
print(f'info shape {info_df.shape}')
if DEBUG:
    display(info_df.head())

In [None]:
# derive the simplified assay IDs from the row labels of the sample info table
id_parts = split_id_parts(info_df)
if DEBUG:
    # show the rows whose sample ID starts with 'PPMI3966'
    ppmi3966_rows = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[ppmi3966_rows])

In [None]:
# keep the original assay IDs in a column before the index is replaced;
# this must run before the next line or the original IDs are lost
info_df['ori_assayid'] = info_df.index.values
# relabel the rows with the simplified assay IDs built from this frame's index
info_df.index = id_parts['assayid']
if DEBUG:
    display(info_df.head())

In [None]:
# NOTE: this writes back to the same path the sample info was read from, so
# the input file is replaced; the original IDs are kept in 'ori_assayid'
info_df.to_csv(info_file)

In [None]:
# shell escape: print the current date/time at the end of the notebook output
!date