## Notebook to convert exceRpt output to quantified matrix

Combine the exceRpt miRNA, piRNA, and tRNA data into a single quantified matrix for RNAS. Also create a feature annotation table covering the three analytes, and a sample info file for RNAS.

In [1]:
!date

Fri Feb  3 15:49:23 UTC 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame, concat

#### set notebook variables

In [3]:
# naming
# cohort and modality are combined into set_name, which names the output quants file
cohort = 'foundin'
modality = 'RNAS'
set_name = f'{cohort}_daALL_{modality}'

# directories
# every directory below is derived from wrk_dir
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# in files
in_qc_file = f'{info_dir}/exceRpt_QCresults.txt'
# the RNAB sample info is reused as the starting point for the RNAS sample info
in_info_file = f'{info_dir}/{cohort}_RNAB_sample_info.csv'
mirbase_gff = f'{public_dir}/mirbase_hsa.gff3'
gtrnadb_bed = f'{public_dir}/GtRNAdb_hg38-tRNAs.bed'

# out files
quants_file = f'{quants_dir}/{set_name}.csv'
out_info_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
features_file = f'{quants_dir}/{cohort}_{modality}_features.csv'

# variables
# DEBUG: when True, cells display previews of their intermediate tables
DEBUG = False
# exact-value replacement applied to the split sample ID parts
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}
# one exceRpt ReadsPerMillion file is read per analyte listed here
excerpt_analytes = ['miRNA', 'piRNA', 'tRNA']

#### utility functions

In [4]:
def pirna_id_to_annot(df: DataFrame, verbose: bool=False) -> DataFrame:
    """Build a piRNA feature annotation table from the quant matrix index.

    Each index label is a '|'-delimited string of up to four fields
    (ID, GB, altID, ref); ref is itself ':'-delimited as
    species:chrom:start:end:strand.

    Args:
        df: quantified matrix whose index holds the compound piRNA IDs.
        verbose: when True, display the first rows of the result.

    Returns:
        DataFrame with columns chrom, start, end, feature and type ('piRNA'),
        one row per row of df, in the same order, on a default integer index.
        start and end are kept as the strings parsed out of the ID.
    """
    # index=False yields a plain positional index, so the corrections below can use a
    # boolean mask rather than label lookups on a MultiIndex that may contain NaN
    feature_parts = df.index.str.split('|', expand=True).to_frame(index=False)
    feature_parts.columns = ['ID', 'GB', 'altID', 'ref']
    # small subset were empty for GB and DQ so ref data in GB col, correct that
    ref_missing = feature_parts['ref'].isna()
    feature_parts.loc[ref_missing, 'ref'] = feature_parts.loc[ref_missing, 'GB']
    feature_parts.loc[ref_missing, 'GB'] = feature_parts.loc[ref_missing, 'altID']
    feature_pos_parts = feature_parts['ref'].str.split(':', expand=True)
    feature_pos_parts.columns = ['species', 'chrom', 'start', 'end', 'strand']
    # build the table in one step; strand and altID are parsed but not carried over
    feature_annots = DataFrame({
        'chrom': feature_pos_parts['chrom'].values,
        'start': feature_pos_parts['start'].values,
        'end': feature_pos_parts['end'].values,
        'feature': feature_parts['ID'].values,
    })
    feature_annots['type'] = 'piRNA'
    if verbose:
        display(feature_annots.head())
    return feature_annots

def load_frmt_mirna(in_file: str, verbose: bool=False) -> DataFrame:
    """Load a miRBase GFF3 file and reduce it to a feature annotation table.

    Args:
        in_file: path to a tab-delimited GFF3 file; lines starting with '#'
            are skipped.
        verbose: when True, display the first rows of the result.

    Returns:
        DataFrame with columns chrom, start, end, feature and type ('miRNA').
        feature is the Name attribute of each record; when a name occurs more
        than once only its first record is kept.
    """
    gff = read_csv(in_file, sep='\t', comment='#', header=None)
    print(f'full miRNA features shape {gff.shape}')
    # keep GFF columns 0 (seqid), 3 (start), 4 (end) and 8 (attributes)
    annots = gff.drop(columns=[1, 2, 5, 6, 7])
    annots.columns = ['chrom', 'start', 'end', 'feature']
    # attributes look like ID=...;Alias=...;Name=...;Derives_from=...
    attributes = annots['feature'].str.split(';', expand=True)
    attributes.columns = ['id', 'alias', 'name', 'derives']
    annots['feature'] = attributes['name'].str.replace('Name=', '')
    annots['type'] = 'miRNA'
    annots = annots.drop_duplicates(subset=['feature'], keep='first')
    print(f'formatted miRNA features shape {annots.shape}')
    if verbose:
        display(annots.head())
    return annots
        
def load_frmt_trna(in_file: str, verbose: bool=False) -> DataFrame:
    """Load a headerless tab-delimited BED file of tRNAs as a feature annotation table.

    Args:
        in_file: path to the BED file; only its first four columns are used.
        verbose: when True, display the first rows of the result.

    Returns:
        DataFrame with columns chrom, start, end, feature and type ('tRNA').
    """
    df = read_csv(in_file, sep='\t', header=None)
    print(f'full tRNA feature shape {df.shape}')
    # copy the column subset so adding 'type' does not write into a slice of the full table
    df = df[[0, 1, 2, 3]].copy()
    df.columns = ['chrom', 'start', 'end', 'feature']
    df['type'] = 'tRNA'
    print(f'formatted tRNA feature shape {df.shape}')
    # honour the caller's verbose flag rather than the notebook-level DEBUG
    if verbose:
        display(df.head())
    return df

### load data

#### load the miRNA feature annotations

In [5]:
# annotation tables keyed by analyte name, starting with the miRNA table
analyte_annots = {'miRNA': load_frmt_mirna(mirbase_gff, verbose=DEBUG)}

full miRNA features shape (4801, 9)
formatted miRNA features shape (4565, 5)


#### load the tRNA feature annotations

In [6]:
# parse the GtRNAdb BED file and store the table under the 'tRNA' key
analyte_annots['tRNA'] = load_frmt_trna(gtrnadb_bed, verbose=DEBUG)

full tRNA feature shape (619, 12)
formatted tRNA feature shape (619, 5)


In [7]:
%%time
# per-analyte quant matrices, keyed by analyte name
quants = {}
for analyte in excerpt_analytes:
    print(analyte)
    in_file = f'{quants_dir}/exceRpt_{analyte}_ReadsPerMillion.txt'
    # raw string: '\s' is not a valid Python string escape, so a plain literal
    # raises an invalid-escape warning on newer interpreters
    # presumably the header row has one fewer field than the data rows, so the
    # feature IDs are read in as the index -- TODO confirm against the input files
    analyte_df = read_csv(in_file, sep=r'\s+')
    print(f'shape of {analyte} {analyte_df.shape}')
    # for piRNA need to separate out the feature annots from ID in matrix index
    if analyte == 'piRNA':
        these_annots = pirna_id_to_annot(analyte_df, verbose=DEBUG)
        # replace the compound IDs with the parsed feature IDs
        analyte_df.index = these_annots.feature
        analyte_annots[analyte] = these_annots
    # transpose the quants matrix so the features become the columns
    analyte_df = analyte_df.rename_axis(index='feature')
    analyte_df = analyte_df.transpose()
    print(f'{analyte} transposed shape {analyte_df.shape}')
    # fill any missing values with zeros
    analyte_df = analyte_df.fillna(0)
    quants[analyte] = analyte_df
    if DEBUG:
        display(analyte_df.head())

miRNA
shape of miRNA (2664, 300)
miRNA transposed shape (300, 2664)
piRNA
shape of piRNA (9639, 300)
piRNA transposed shape (300, 9639)
tRNA
shape of tRNA (433, 300)
tRNA transposed shape (300, 433)
CPU times: user 422 ms, sys: 60.3 ms, total: 483 ms
Wall time: 477 ms


#### verify features are present in annots

In [8]:
# report, per analyte, how many quantified features have no row in the annotation table
# loop-local names are used so the notebook-level quants_df / features_df names
# are not bound to per-analyte temporaries
for analyte in excerpt_analytes:
    quantified_features = set(quants[analyte].columns)
    annotated_features = set(analyte_annots[analyte].feature)
    missing = quantified_features - annotated_features
    print(f'in {analyte} {len(missing)} features are missing from annotation table')
print('missing will be dropped')

in miRNA 22 features are missing from annotation table
in piRNA 0 features are missing from annotation table
in tRNA 1 features are missing from annotation table
missing will be dropped


#### flatten feature annotations into single table

In [9]:
# stack the per-analyte annotation tables row-wise, in the order they were added
annot_tables = [analyte_annots[name] for name in analyte_annots]
features_df = concat(annot_tables)
print(f'shape of combine features is {features_df.shape}')
if DEBUG:
    display(features_df.sample(10))

shape of combine features is (14823, 5)


In [10]:
# number of annotated features per analyte type
display(features_df['type'].value_counts())

piRNA    9639
miRNA    4565
tRNA      619
Name: type, dtype: int64

### verify that sample IDs are consistent between analytes

In [11]:
# sample IDs seen in each analyte's quant table
sample_sets = {analyte: set(quants[analyte].index) for analyte in excerpt_analytes}
# samples shared by every analyte
samples = set.intersection(*sample_sets.values())
for analyte, analyte_samples in sample_sets.items():
    missing = analyte_samples - samples
    print(f'{analyte} has {len(missing)} not present in others')

miRNA has 0 not present in others
piRNA has 0 not present in others
tRNA has 0 not present in others


### flatten the quantified data into single table

In [12]:
# place the per-analyte matrices side by side, aligned on the sample index
analyte_tables = [quants[name] for name in quants]
quants_df = concat(analyte_tables, axis=1)
print(f'shape of combined quants {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

shape of combined quants (300, 12736)


### exclude any features not present in feature annotations table

In [13]:
# keep only the quantified features that also appear in the annotation table
annotated_columns = quants_df.columns.intersection(features_df.feature)
quants_df = quants_df[annotated_columns]
print(f'shape of modified quants {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

shape of modified quants (300, 12713)


### simplify the sample IDs
to be consistent with other modalities

#### split name index to find info

In [14]:
def split_id_parts(df: DataFrame) -> DataFrame:
    """Split the '_'-delimited sample names in df's index into their parts.

    Uses the notebook-level replace_id_dict and DEBUG settings.

    Args:
        df: table whose index labels each split into seven '_'-separated fields.

    Returns:
        DataFrame with columns assay, sampleid, cdi, day, version, R, num and
        assayid, where assayid is '<assay>_<sampleid>_<day>'. Values listed in
        replace_id_dict are substituted first; any sampleid still starting with
        'PPMI3966B' then has its version appended.
    """
    parts = df.index.str.split('_', expand=True).to_frame()
    parts.columns = ['assay', 'sampleid', 'cdi', 'day', 'version', 'R', 'num']
    # fix the duplicate control sample ID
    parts = parts.replace(replace_id_dict)
    # for the other duplicates add version
    is_repeat = parts['sampleid'].str.startswith('PPMI3966B')
    versioned_ids = parts['sampleid'] + parts['version']
    parts['sampleid'] = parts['sampleid'].where(~is_repeat, versioned_ids)
    parts['assayid'] = parts['assay'] + '_' + parts['sampleid'] + '_' + parts['day']
    print(parts.shape)
    if DEBUG:
        display(parts.sample(5))
    return parts

In [15]:
# break the sample names in the quants index into their parts
id_parts = split_id_parts(quants_df)
if DEBUG:
    # show the rows whose sampleid starts with PPMI3966
    display(id_parts.loc[id_parts.sampleid.str.startswith('PPMI3966')])

(300, 8)


In [16]:
# count how many times each assayid value occurs
id_parts.assayid.value_counts()

RNAS_PPMI18567_da0     1
RNAS_PPMI51971_da25    1
RNAS_PPMI51867_da65    1
RNAS_PPMI51867_da25    1
RNAS_PPMI51867_da0     1
                      ..
RNAS_PPMI3960_da65     1
RNAS_PPMI3960_da25     1
RNAS_PPMI3960_da0      1
RNAS_PPMI3957_da65     1
RNAS_PPMI90456_da65    1
Name: assayid, Length: 300, dtype: int64

#### get counts by day

In [17]:
# number of quantified samples per value of the day field
days = id_parts['day'].value_counts()
display(days)

da25    105
da0      99
da65     96
Name: day, dtype: int64

In [18]:
# replace the long sample names with the simplified assay IDs
quants_df.index = id_parts['assayid'].values
# name the index explicitly; rename_axis returns a new frame, so the result is assigned
quants_df = quants_df.rename_axis(index='assayid')
if DEBUG:
    display(quants_df.head())

### re-using the RNAB info file as a starting point

Loading the sample info here might seem a little out of place, but it fits with the sample ID handling above.

In [19]:
%%time
# read the RNAB sample info, using its first column as the index
info_df = read_csv(in_info_file, index_col=0)
print(f'info shape {info_df.shape}')
if DEBUG:
    display(info_df.head())

info shape (305, 112)
CPU times: user 11.5 ms, sys: 0 ns, total: 11.5 ms
Wall time: 10.7 ms


#### check the corrected PPMI3966 IDs

In [20]:
# the info index splits into three '_'-separated fields
# note: this rebinds id_parts, which previously held the parts of the quants index
info_index_parts = info_df.index.str.split('_', expand=True)
id_parts = info_index_parts.to_frame()
id_parts.columns = ['assay', 'sampleid', 'day']
if DEBUG:
    display(id_parts.loc[id_parts.sampleid.str.startswith('PPMI3966')])

### make sure that, once modified, all the necessary IDs are present

If some are missing, they are probably just the control repeats, which is acceptable for our purposes here, but this should have been handled more cleanly and consistently.

In [21]:
# info IDs as they would look once renamed from RNAB_ to RNAS_
temp_info_ids = info_df.index.str.replace('RNAB_', 'RNAS_')
# quantified sample IDs that have no counterpart in the info table
ids_not_present = set(quants_df.index).difference(temp_info_ids)
missing_len = len(ids_not_present)
if missing_len == 0:
    print('All RNAS IDs are present, none missing')
else:
    print(f'There are sample IDs missing {missing_len}')
    # only list the IDs when there are few enough to read
    if missing_len < 20:
        print(ids_not_present)

There are sample IDs missing 7
{'RNAS_PPMI3966B5E8v1_da65', 'RNAS_PPMI3966B5E8v1_da0', 'RNAS_PPMI3966B5E6v1_da0', 'RNAS_PPMI3966B5E8v1_da25', 'RNAS_PPMI3966B1v10_da25', 'RNAS_PPMI3966B5E6v1_da65', 'RNAS_PPMI3966B5E6v1_da25'}


#### get counts by day

In [22]:
# id_parts was rebuilt from info_df.index earlier, so these are counts for the info table
days = id_parts['day'].value_counts()
display(days)

da25    107
da0     100
da65     98
Name: day, dtype: int64

### go ahead and do the ID rename and use this info file

if everything matches expectations

In [23]:
# rewrite the assay label in the info IDs from RNAB_ to RNAS_; the shape is unchanged
info_df.index = info_df.index.str.replace('RNAB_', 'RNAS_')
print(f'modified info shape {info_df.shape}')
if DEBUG:
    display(info_df.head())

modified info shape (305, 112)


### remove the RNAB assay covariates and include the RNAS assay covariates

In [24]:
# columns to drop; per the section heading these are the RNAB assay covariates
cols_to_remove = ['Assigned', 'MAssigned', 'Aligned', 'MAligned', 'Aligned.1',
                  'MAligned.1', 'Dups', 'GC', 'MSeqs', 'ori_assayid']
# drop raises if any listed column is absent from info_df
info_df = info_df.drop(columns=cols_to_remove)
print(f'modified info, dropped cols, shape {info_df.shape}')
if DEBUG:
    display(info_df.head())

modified info, dropped cols, shape (305, 102)


In [25]:
# read the tab-delimited exceRpt QC results, using the first column as the index
assay_info = read_csv(in_qc_file, sep='\t', index_col=0)
print(f'shape of assay info {assay_info.shape}')
if DEBUG:
    display(assay_info.head())

shape of assay info (300, 5)


In [26]:
# apply the same ID splitting used for the quants index to the QC table index
id_parts = split_id_parts(assay_info)
if DEBUG:
    display(id_parts.loc[id_parts.sampleid.str.startswith('PPMI3966')])

(300, 8)


In [27]:
# replace the long sample names with the simplified assay IDs
assay_info.index = id_parts['assayid'].values
# name the index explicitly; rename_axis returns a new frame, so the result is assigned
assay_info = assay_info.rename_axis(index='assayid')
if DEBUG:
    display(assay_info.head())

#### add the RNAS assay covariates on

In [28]:
# inner merge on the index keeps only assay IDs present in both tables
# NOTE(review): quants_df is saved later without being subset to these IDs, so the
# quants file can hold samples that are absent from the info file -- confirm this is intended
info_df = info_df.merge(assay_info, how='inner', 
                        left_index=True, right_index=True)
print(f'modified info, added cols, shape {info_df.shape}')
if DEBUG:
    display(info_df.head())

modified info, added cols, shape (293, 107)


#### get counts by day

In [29]:
# samples per day remaining after the merge
display(info_df.day.value_counts())

da25    102
da0      97
da65     94
Name: day, dtype: int64

### save output

#### save formatted quantified data

In [30]:
%%time
# write the combined matrix as CSV, index included
quants_df.to_csv(quants_file)

CPU times: user 2.29 s, sys: 17.7 ms, total: 2.3 s
Wall time: 2.35 s


#### save the info file

In [31]:
# write the merged sample info as CSV, index included
info_df.to_csv(out_info_file)

#### save the feature annotations

In [32]:
# NOTE(review): features_df was concatenated without resetting the index, so the index
# column written here is each source table's own row index and is presumably not unique
features_df.to_csv(features_file)

In [33]:
!date

Fri Feb  3 15:49:27 UTC 2023
