In [1]:
import pandas as pd
import glob
import os
import numpy as np
from tqdm import tqdm

## MethPipe input
col 1: chromosome  
col 2: position of Cytosine  
col 3: strand  
col 4: sequence context  
col 5: proportion of reads reporting C at that site  
col 6: coverage  

In [None]:
# MethylDackel cytosine reports (generated with its cytosine-report argument),
# one directory per cross. The character class is written [AB] rather than
# [A,B]: inside a glob class a comma is matched literally, so the old pattern
# would also have accepted directory names containing ','.
methylkit_files = glob.glob('../[AB]x[AB]/methylDackel_cytosine_report/F*_1*cytosine_report.txt')

# NOTE(review): none of the visible cells reads `shared_tuple`; it is kept
# only in case a later cell depends on it. It no longer borrows the name
# `shared`, which is reserved for the key tuple built below.
shared_tuple = tuple(np.loadtxt('../SHARED_CpGs.Brahman_coords.txt'))

# Tab-separated, header-less interval table; columns are addressed by position
# (0 = chromosome, 1 = start, 2 = end).
shared_cpgs = pd.read_csv('../../between_ref_SNPs/Angus.CpGs.to.Brahman_coords.SHARED.bed',
                          header=None,
                          index_col=None,
                          sep='\t')

# Every row contributes two 'chromosome.position' keys, in row order:
# start + 1 followed by end.
# NOTE(review): presumably these are the two cytosines of a CpG expressed as
# 1-based positions from a 0-based BED interval - confirm against the BED file.
shared = tuple(
    f'{chrom}.{pos}'
    for chrom, start, end in zip(shared_cpgs[0], shared_cpgs[1], shared_cpgs[2])
    for pos in (start + 1, end)
)

In [None]:
# Convert every cytosine report into the six-column MethPipe input layout
# (chromosome, position, strand, context, proportion of C, coverage) and write
# two tab-separated, header-less files per sample:
#   <name>.Consensus.CpGs.meth - only sites whose 'chromosome.position' key
#                                is present in `shared`
#   <name>.CpGs.meth           - every site in the report

# Column layout of the input report; loop-invariant, so defined once.
report_columns = ['chromosome', 'base', 'strand', 'meth_C_count',
                  'unmeth_C_count', 'C_context', 'trinucleotide_context']
report_dtypes = {'chromosome': str,
                 'base': int,
                 'strand': str,
                 'meth_C_count': int,
                 'unmeth_C_count': int,
                 'C_context': str,
                 'trinucleotide_context': str}

for file in tqdm(methylkit_files):
    # The cross directory is the grandparent of the report file. Deriving it
    # with os.path (instead of file.split('/')[1]) keeps this correct if the
    # glob prefix changes depth or the OS uses a different separator.
    group = os.path.basename(os.path.dirname(os.path.dirname(file)))
    # Sample name = file name up to its first '.'.
    name = os.path.basename(file).split('.')[0]

    df = pd.read_csv(file,
                     header=None,
                     sep='\t',
                     names=report_columns,
                     dtype=report_dtypes)

    # Total reads at each site.
    coverage = df['meth_C_count'] + df['unmeth_C_count']
    # Sites with zero coverage divide 0 by 0 and yield NaN; report them as 0.
    proportion_c = (df['meth_C_count'] / coverage).fillna(0.)

    meth_df = pd.DataFrame({'chromosome': df['chromosome'],
                            'base': df['base'],
                            'strand': df['strand'],
                            'context': 'CpG',  # constant, broadcast to all rows
                            'proportion_C': proportion_c,
                            'coverage': coverage})

    # Vectorised 'chromosome.position' key, used only to select shared sites;
    # it is never written to disk.
    chr_base = df['chromosome'] + '.' + df['base'].astype(str)
    consensus_df = meth_df[chr_base.isin(shared)]

    # to_csv does not create missing directories.
    out_dir = f'./aligned2Brahman/{group}'
    os.makedirs(out_dir, exist_ok=True)

    consensus_df.to_csv(f'{out_dir}/{name}.Consensus.CpGs.meth',
                        header=False,
                        index=False,
                        sep='\t')
    meth_df.to_csv(f'{out_dir}/{name}.CpGs.meth',
                   header=False,
                   index=False,
                   sep='\t')