In [2]:
#!/usr/bin/env python3
"""
Scores include:
mCH CH mCH_rate mCH_fmr mCH_fumr mCH_mhl mCH_umhl 
mCG CG mCG_rate mCG_fmr mCG_fumr mCG_mhl mCG_umhl

fmr: fully methylated rate (read level)
fumr: fully unmethylated rate (read level)
mhl: methylation haplotype load (Guo et al. 2017)
umhl: unmethylation haplotype load (symmetric to mhl)

mcconc: fraction of unmethylated sites present in partially methylated reads
"""

import numpy as np
from scipy import linalg
import pandas as pd
import argparse
import re

# input_mcinfo = '/cndd2/fangming/projects/cfdna/all_data/bam/test_guo2017_mhb.mc_info.txt'
# output_mcscores = '/cndd2/fangming/projects/cfdna/all_data/bam/test_guo2017_mhb.mc_scores.txt'
# df = pd.read_csv(input_mcinfo, sep='\t', header=None, nrows=1000,
#                  names=['chr', 'start', 'end', 'seq'], 
#                 )
# df = df[~df['seq'].isnull()]
# print(df.shape)
# df.head()

In [None]:
def process_strings(strings):
    """Split a comma-delimited methylation call string into CH and CG reads.

    Symbols (uppercase = methylated, lowercase = unmethylated):
    z Z - mCG
    x X - mCHG, h H - mCHH (CHG is folded into CHH, so both become h/H)
    u U - mCN (unknown context, discarded)
    .   - not C (discarded)
    ,   - read delimiter

    Args:
        strings: one string holding the calls of all reads of a region,
            reads separated by ','.

    Returns:
        (string_list_mch, string_list_mcg): two lists of per-read strings,
        the first made only of h/H, the second only of z/Z. Reads that have
        no site left in a given context are dropped from that list.
    """
    # Drop '.' and BOTH cases of the unknown-context call. Leaving 'U' in
    # place would add it to the CH and the CG strings alike and inflate
    # every downstream site count.
    cleanup = str.maketrans({'.': None, 'u': None, 'U': None,
                             'x': 'h', 'X': 'H'})
    strings = strings.translate(cleanup)

    # Only z Z h H (and the ',' delimiter) remain at this point.
    drop_cg = str.maketrans('', '', 'zZ')
    drop_ch = str.maketrans('', '', 'hH')
    string_list_mch = strings.translate(drop_cg).split(',')
    string_list_mcg = strings.translate(drop_ch).split(',')

    # remove empty entries
    string_list_mch = [string for string in string_list_mch if string]
    string_list_mcg = [string for string in string_list_mcg if string]
    return string_list_mch, string_list_mcg

def calc_mc(string_list, letter):
    """Count methylated and total sites over all reads.

    Args:
        string_list: list of per-read call strings.
        letter: context letter; its uppercase form marks a methylated site.

    Returns:
        (mc, c): occurrences of the uppercase letter, and the total number
        of characters across all reads.
    """
    pooled_calls = ''.join(string_list)
    methylated_symbol = letter.upper()
    return pooled_calls.count(methylated_symbol), len(pooled_calls)

def calc_fmr(string_list, letter):
    """Count fully methylated and fully unmethylated reads.

    A read is fully methylated when it consists only of the uppercase
    letter, and fully unmethylated when it consists only of the lowercase
    letter.

    Args:
        string_list: list of per-read call strings.
        letter: context letter (either case).

    Returns:
        (num_fm, num_fum, num): fully methylated reads, fully unmethylated
        reads and total reads, as built-in ints.
    """
    letter_upper = letter.upper()
    letter_lower = letter.lower()

    # Compare each read's symbol set directly in Python rather than
    # broadcasting an object-dtype array of sets against a set.
    symbol_sets = [set(string) for string in string_list]
    num_fm = sum(symbols == {letter_upper} for symbols in symbol_sets)
    num_fum = sum(symbols == {letter_lower} for symbols in symbol_sets)
    num = len(symbol_sets)

    return num_fm, num_fum, num

def _run_length_counts(string, separator, size):
    """Histogram (length `size`) of the run lengths left after splitting on `separator` runs."""
    # Strip first so leading/trailing separators do not create extra empty
    # pieces; any empty piece that remains lands in bin 0, which the caller
    # discards.
    pattern = '{}+'.format(re.escape(separator))
    runs = re.split(pattern, string.strip(separator))
    return np.bincount([len(run) for run in runs], minlength=size)


def calc_mhl(strings, letter):
    """Methylation haplotype load (MHL) and its unmethylated counterpart (uMHL).

    For every substring length k from 1 to the longest read, the fraction of
    length-k substrings that are fully methylated (resp. fully unmethylated)
    is computed, and the fractions are averaged with weights proportional
    to k.

    Args:
        strings: list of per-read call strings made of the lowercase
            (unmethylated) and uppercase (methylated) form of `letter`.
        letter: context letter (either case).

    Returns:
        numpy array of two floats, [mhl, umhl]. Both are NaN when there is
        no site at all (empty list or only empty reads), because the loads
        are undefined in that case.
    """
    letter_lower = letter.lower()
    letter_upper = letter.upper()

    read_lengths = [len(string) for string in strings]
    dim = max(read_lengths, default=0)
    if dim == 0:
        # No sites: report "undefined" instead of a misleading 0.
        return np.full(2, np.nan)

    size = dim + 1
    length_counts = np.bincount(read_lengths, minlength=size)
    length_counts_meth = np.zeros_like(length_counts)
    length_counts_unmeth = np.zeros_like(length_counts)

    for string in strings:
        # methylated runs are what is left between unmethylated runs
        length_counts_meth += _run_length_counts(string, letter_lower, size)
        # unmethylated runs are what is left between methylated runs
        length_counts_unmeth += _run_length_counts(string, letter_upper, size)

    # Rows: all reads, methylated runs, unmethylated runs.
    # Column 0 (length 0) carries no information and is dropped.
    length_counts_all = np.vstack([
        length_counts,
        length_counts_meth,
        length_counts_unmeth,
        ])[:, 1:]

    # Lower-triangular matrix with entry (i, j) = i - j + 1 for i >= j: a
    # stretch of length i+1 contains i-j+1 substrings of length j+1.
    offsets = np.arange(dim)
    trans_mat = np.clip(offsets[:, None] - offsets[None, :] + 1, 0, None)
    substring_counts = np.dot(length_counts_all, trans_mat)

    # The longest read guarantees every denominator is at least 1.
    fracs = substring_counts[1:]/substring_counts[0]
    weights = (offsets + 1)/(dim*(dim + 1)/2)
    mhls = np.dot(fracs, weights)

    return mhls

def calc_mc_concurance(strings, letter):
    """Shi et al. 2021
    mcconc: fraction of unmethylated sites present in partially methylated reads

    Args:
        strings: list of per-read call strings.
        letter: context letter (either case); lowercase marks an
            unmethylated site, uppercase a methylated one.

    Returns:
        Number of unmethylated sites found in reads that carry at least one
        methylated site, divided by the total number of sites in all reads.
        NaN when there are no sites at all.
    """
    letter_lower = letter.lower()
    letter_upper = letter.upper()

    total_sites = 0
    conc_sites = 0
    for string in strings:
        total_sites += len(string)
        if letter_upper in string:  # read carries at least one methylated site
            conc_sites += string.count(letter_lower)

    if total_sites == 0:
        # No sites at all: the fraction is undefined, so avoid dividing by zero.
        return float('nan')
    return conc_sites/total_sites

In [13]:
# NOTE: `df` (columns chr/start/end/seq) must already be loaded; the loader
# in the first cell of this notebook is commented out.

# Regions without a methylation string are skipped. The index has to come
# from the filtered series: pairing the filtered values with the unfiltered
# index fails with a length mismatch as soon as one `seq` is null.
seq_valid = df['seq'].dropna()
df_proc = pd.DataFrame(seq_valid.apply(process_strings).tolist(),
                       index=seq_valid.index,
                       columns=['mch_string', 'mcg_string'])

# Methylated / total site counts per context.
df_mch = pd.DataFrame(df_proc['mch_string'].apply(lambda x: calc_mc(x, 'h')).tolist(),
                     index=df_proc['mch_string'].index,
                     columns=['ch_mc', 'ch_c'],
                     )
df_mcg = pd.DataFrame(df_proc['mcg_string'].apply(lambda x: calc_mc(x, 'z')).tolist(),
                     index=df_proc['mcg_string'].index,
                     columns=['cg_mc', 'cg_c'],
                     )

# Read-level counts: fully methylated, fully unmethylated, total.
df_mch_fmr = pd.DataFrame(df_proc['mch_string'].apply(lambda x: calc_fmr(x, 'h')).tolist(),
                     index=df_proc['mch_string'].index,
                     columns=['ch_fully_meth_reads', 'ch_fully_unmeth_reads', 'ch_total_reads'],
                    )

df_mcg_fmr = pd.DataFrame(df_proc['mcg_string'].apply(lambda x: calc_fmr(x, 'z')).tolist(),
                     index=df_proc['mcg_string'].index,
                     columns=['cg_fully_meth_reads', 'cg_fully_unmeth_reads', 'cg_total_reads'],
                    )

# Haplotype loads (MHL first, uMHL second).
df_mch_mhl = pd.DataFrame(df_proc['mch_string'].apply(lambda x: calc_mhl(x, 'h')).tolist(),
                     index=df_proc['mch_string'].index,
                     columns=['ch_mhl', 'ch_umhl'],
                    )
df_mcg_mhl = pd.DataFrame(df_proc['mcg_string'].apply(lambda x: calc_mhl(x, 'z')).tolist(),
                     index=df_proc['mcg_string'].index,
                     columns=['cg_mhl', 'cg_umhl'],
                    )

# Column-wise join on the original index; rows whose `seq` was null keep
# their coordinates and get NaN scores.
df_res = pd.concat([ 
                   df.drop('seq', axis=1),
                   df_mch, df_mcg, 
                   df_mch_fmr, df_mcg_fmr, 
                   df_mch_mhl, df_mcg_mhl,
#                    df_proc,
                  ], axis=1)
print(df_res.shape)
df_res.head()

(22, 17)


Unnamed: 0,chr,start,end,ch_mc,ch_c,cg_mc,cg_c,ch_fully_meth_reads,ch_fully_unmeth_reads,ch_total_reads,cg_fully_meth_reads,cg_fully_unmeth_reads,cg_total_reads,ch_mhl,ch_umhl,cg_mhl,cg_umhl
1,chr1,564470,564533,0,48,0,11,0,3,3,0,3,3,0.0,1.0,0.0,1.0
2,chr1,565785,565901,0,27,0,5,0,6,6,0,3,3,0.0,1.0,0.0,1.0
3,chr1,566722,566812,2,45,0,8,0,4,6,0,3,3,0.00057,0.726999,0.0,1.0
4,chr1,566994,567112,0,40,0,4,0,3,3,0,3,3,0.0,1.0,0.0,1.0
5,chr1,567548,567562,0,3,0,4,0,1,1,0,2,2,0.0,1.0,0.0,1.0


In [19]:
# Write the score table as tab-separated text, keeping the header row and
# the index column; missing values are written as 'NA'.
# `output_mcscores` has to be defined beforehand (its assignment in the
# first cell is commented out).
df_res.to_csv(
    output_mcscores,
    sep='\t',
    header=True,
    index=True,
    na_rep='NA',
)

## test section

In [15]:
# Toy input covering every symbol handled by process_strings except 'u'/'U'
# and '.', including empty reads (consecutive and trailing commas).
a = 'hHHHhhzZH,,ZxHH,ZHZ,hHhx,,,'
string_mch, string_mcg = process_strings(a)

print(string_mch)
print(string_mcg)

['hHHHhhH', 'hHH', 'H', 'hHhh']
['zZ', 'Z', 'ZZ']


In [17]:
# calc_mhl returns two values: the methylated load first, then the
# unmethylated load.
mhl, umhl = calc_mhl(string_mch, 'h')
mhl, umhl

(0.051920995670995666, 0.029653679653679654)

In [20]:
# Display the CH read strings used as input for the MHL checks.
string_mch

['hHHHhhH', 'hHH', 'H', 'hHhh']

In [27]:
def calc_mhl(strings, letter):
    """MHL and uMHL (debug variant that prints its intermediate arrays).

    This cell reuses the name `calc_mhl`, so once it has run it replaces the
    definition given earlier in the notebook for every cell executed
    afterwards.

    Prints, in order: the raw length histograms (reads / methylated runs /
    unmethylated runs), the dimension with the transformation matrix, the
    transformed counts, and finally the fractions, weights and result.

    Returns:
        numpy array [mhl, umhl].
    """
    unmeth_symbol = letter.lower()
    meth_symbol = letter.upper()

    def run_length_hist(read, gap_symbol):
        # Pieces left between runs of `gap_symbol`; empty pieces are kept
        # and end up in bin 0.
        pieces = re.split(r'{}+'.format(gap_symbol), read.strip(gap_symbol))
        return np.bincount([len(piece) for piece in pieces])

    read_hist = np.bincount([len(read) for read in strings])
    meth_hist = np.zeros_like(read_hist)
    unmeth_hist = np.zeros_like(read_hist)

    for read in strings:
        hist = run_length_hist(read, unmeth_symbol)  # methylated runs
        meth_hist[:len(hist)] += hist

        hist = run_length_hist(read, meth_symbol)  # unmethylated runs
        unmeth_hist[:len(hist)] += hist

    # Stack the three histograms and drop the length-0 column.
    stacked = np.vstack([read_hist, meth_hist, unmeth_hist])[:, 1:]
    print(stacked)

    dim = stacked.shape[1]
    # lower triangular matrix
    trans_mat = np.flip(linalg.hankel(np.arange(dim, 0, -1)), axis=0)
    stacked = np.dot(stacked, trans_mat)
    print(dim, trans_mat)
    print(stacked)

    fracs = stacked[1:]/stacked[0]
    weights = np.arange(1, dim + 1)/(dim*(dim+1)/2)
    mhls = np.dot(fracs, weights)
    print(fracs, weights, mhls)

    return mhls

In [30]:
# Small hand-checkable inputs for calc_mhl; only the last case is active.
# string_mch = ['hhhh', 'hhhh']
# print(calc_mhl(string_mch, 'h'))

# string_mch = ['HHHH', 'HHHH']
# print(calc_mhl(string_mch, 'h'))

# string_mch = ['HHHH', 'hhhh']
# print(calc_mhl(string_mch, 'h'))

# Two identical half-methylated reads; the expected value is
# (1/10)*(2/4) + (2/10)*(1/3), evaluated in a later cell.
string_mch = ['HHhh', 'HHhh',]
print(calc_mhl(string_mch, 'h'))



[[0 0 0 2]
 [0 2 0 0]
 [0 2 0 0]]
4 [[1 0 0 0]
 [2 1 0 0]
 [3 2 1 0]
 [4 3 2 1]]
[[8 6 4 2]
 [4 2 0 0]
 [4 2 0 0]]
[[ 0.5         0.33333333  0.          0.        ]
 [ 0.5         0.33333333  0.          0.        ]] [ 0.1  0.2  0.3  0.4] [ 0.11666667  0.11666667]
[ 0.11666667  0.11666667]


In [33]:

# Every one of the 16 methylation patterns of a 4-site read, once each.
# The set is unchanged when upper and lower case are swapped, so the two
# returned loads are expected to be equal.
string_mch = [
              'hhhh', 
              'hhhH', 'hhHh', 'hHhh', 'Hhhh', 
              'hhHH', 'hHhH', 'HhhH', 
              'hHHh', 'HhHh', 'HHhh', 
              'hHHH', 'HhHH', 'HHhH', 'HHHh', 
              'HHHH', 
             ]
print(calc_mhl(string_mch, 'h'))

[[ 0  0  0 16]
 [12  5  2  1]
 [12  5  2  1]]
4 [[1 0 0 0]
 [2 1 0 0]
 [3 2 1 0]
 [4 3 2 1]]
[[64 48 32 16]
 [32 12  4  1]
 [32 12  4  1]]
[[ 0.5     0.25    0.125   0.0625]
 [ 0.5     0.25    0.125   0.0625]] [ 0.1  0.2  0.3  0.4] [ 0.1625  0.1625]
[ 0.1625  0.1625]


In [32]:
# Hand check of the ['HHhh', 'HHhh'] case: weights 1/10 and 2/10 applied to
# the length-1 and length-2 fractions 2/4 and 1/3 (longer lengths contribute 0).
(1/10)*(2/4)+(2/10)*(1/3)

0.11666666666666667