## Notebook to convert METH meffil output to quantified matrix

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame

#### set notebook variables

In [None]:
# --- run-time switches and ID fix-ups ---
# DEBUG turns on the extra print/display output used throughout the notebook
DEBUG = True
# maps a sample ID as it appears in the METH assay IDs to the ID to use instead
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

# --- naming ---
cohort = 'foundin'
modality = 'METH'
set_name = f'{cohort}_daALL_{modality}'

# --- directories ---
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'

# --- input files ---
in_file = f'{quants_dir}/{modality}.FINAL_normalized_FOUNDIN_october2020.txt.gz'
info_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'

# --- output files ---
quants_file = f'{quants_dir}/{set_name}.csv'

# echo the resolved paths, one per line, when debugging
if DEBUG:
    print(
        f'in_file = {in_file}',
        f'info_file = {info_file}',
        f'quants_file = {quants_file}',
        sep='\n',
    )

### load data

In [None]:
%%time
# the input is whitespace delimited; the separator is a regex, so it is written
# as a raw string to keep '\s' from being parsed as an (invalid) string escape
quants_df = read_csv(in_file, sep=r'\s+')
print(f'quants_df shape is {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### transpose the quants matrix

In [None]:
# swap rows and columns so that the assay IDs end up on the index
quants_df = quants_df.T
print(f'new quants shape {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### fill any missing values with zeros

In [None]:
# every missing entry in the matrix is replaced by 0
quants_df = quants_df.fillna(value=0)

### simplify the modality's IDs
to be consistent with other modalities

#### split name index to find info

In [None]:
def split_id_parts(df: DataFrame) -> DataFrame:
    """Split the '_'-delimited assay IDs on ``df.index`` into named parts.

    Every ID is expected to consist of exactly five '_'-separated fields,
    in the order: assay, sampleid, cdi, day, version. Sample IDs listed in
    the notebook-level ``replace_id_dict`` are substituted, and any remaining
    sample ID starting with 'PPMI3966B' gets its version appended so that
    the duplicates stay distinguishable. A simplified ``assayid`` column of
    the form ``<assay>_<sampleid>_<day>`` is then added.

    Relies on the notebook-level ``replace_id_dict`` and ``DEBUG`` variables.

    Args:
        df: frame whose index holds the full assay IDs.

    Returns:
        A frame with one row per ID in ``df.index`` (same order) and the
        columns assay, sampleid, cdi, day, version and assayid. Its index is
        built from the original split ID parts.

    Raises:
        ValueError: if the IDs do not split into exactly five parts.
    """
    part_names = ['assay', 'sampleid', 'cdi', 'day', 'version']
    id_parts = df.index.str.split('_', expand=True).to_frame()
    if id_parts.shape[1] != len(part_names):
        raise ValueError(
            f'expected assay IDs with {len(part_names)} "_"-separated parts '
            f'({", ".join(part_names)}) but found {id_parts.shape[1]}'
        )
    id_parts.columns = part_names
    # fix the duplicate control sample ID
    id_parts.replace(replace_id_dict, inplace=True)
    # for the other duplicates add version; a positional boolean mask is used
    # so the update does not depend on the index labels being unique, and
    # na=False keeps missing parts out of the mask
    is_dup = id_parts['sampleid'].str.startswith('PPMI3966B', na=False)
    versioned_ids = id_parts.loc[is_dup, 'sampleid'] + id_parts.loc[is_dup, 'version']
    id_parts.loc[is_dup, 'sampleid'] = versioned_ids.to_numpy()

    id_parts['assayid'] = id_parts['assay'] + '_' + id_parts['sampleid'] + '_' + id_parts['day']
    print(id_parts.shape)
    if DEBUG:
        # sample() raises when asked for more rows than exist
        display(id_parts.sample(min(5, len(id_parts))))
    return id_parts

In [None]:
# break the quants matrix's assay IDs into their parts
id_parts = split_id_parts(quants_df)
if DEBUG:
    # show the rows whose sample ID starts with PPMI3966
    ppmi3966_rows = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[ppmi3966_rows])

In [None]:
# tally how often each simplified assay ID occurs (a count above 1 means a collision)
id_parts['assayid'].value_counts()

In [None]:
# use the simplified assay IDs as the row labels of the quants matrix
quants_df.index = id_parts['assayid']
# set_names() returns a new Index instead of renaming in place, so the
# result has to be assigned back for the call to have any effect
quants_df.index = quants_df.index.set_names('assayid')
if DEBUG:
    display(quants_df.head())

### save formatted data

In [None]:
%%time
# write the re-indexed quants matrix out as CSV
quants_df.to_csv(path_or_buf=quants_file)

### fix the METH assay IDs in the info file as well
doing this here may be a little out of place, but it fits

In [None]:
%%time
# the first column of the info file becomes the index; it is split as
# assay IDs further down
info_df = read_csv(info_file, index_col=0)
print(f'info shape {info_df.shape}')
if DEBUG:
    display(info_df.head(n=5))

In [None]:
# break the info table's assay IDs into their parts, same as for the quants
id_parts = split_id_parts(info_df)
if DEBUG:
    # show the rows whose sample ID starts with PPMI3966
    info_ppmi3966_rows = id_parts['sampleid'].str.startswith('PPMI3966')
    display(id_parts.loc[info_ppmi3966_rows])

In [None]:
# keep the original full assay IDs in their own column before the index is
# replaced, so each row can still be traced back to its source ID
info_df['ori_assayid'] = info_df.index.values
# switch the row labels to the simplified assay IDs produced by split_id_parts
info_df.index = id_parts['assayid']
if DEBUG:
    display(info_df.head())

#### check the corrected PPMI3966 IDs

In [None]:
if DEBUG:
    # list the info rows whose new assay ID belongs to a PPMI3966 sample
    ppmi3966_prefix = f'{modality}_PPMI3966'
    display(info_df.loc[info_df.index.str.startswith(ppmi3966_prefix)])

### save the info file

In [None]:
# NOTE: info_file is the same path the info table was read from, so this
# overwrites the input in place with the re-indexed table
# NOTE(review): a rerun would then pass the already-simplified IDs to
# split_id_parts, which expects the full five-part IDs -- confirm this is OK
info_df.to_csv(path_or_buf=info_file)

In [None]:
!date