In [1]:
import pandas as pd
import truvari
import numpy as np
import math
import json

In [2]:
# Translation table mapping each base to its complement (N stays N).
compliment = str.maketrans("ATCGN", "TAGCN")


def revcmp(x):
    """
    Return the reverse complement of the sequence string `x`.

    Only the upper-case characters A, T, C, G and N are complemented; any
    other character is carried through unchanged (but still reversed).
    """
    return x[::-1].translate(compliment)

In [18]:
# Input tables, both tab-separated with a header row.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere without editing them.
catalog_fn = "/Users/english/code/adotto/regions/adotto_TRregions_v1.2.bed"
patho_fn = "/Users/english/code/adotto/pathogenic/Patho.tsv"

catalog, patho = (pd.read_csv(table_fn, sep='\t') for table_fn in (catalog_fn, patho_fn))

def smallest_roll(seq):
    """
    Return the lexicographically smallest rotation ("roll") of a motif.

    Parameters
    ----------
    seq : str or float
        A motif string. A float is answered with the fixed motif "GCN"
        (presumably a NaN from a missing table cell - see the inline note).

    Returns
    -------
    str
        - "GCN" when `seq` is a float;
        - `seq` unchanged when it contains a comma (a multi-motif entry) or
          is empty;
        - otherwise the smallest of all rotations of `seq`.
    """
    if isinstance(seq, float):
        return "GCN" # HOXA13 and ARX cheat
    if ',' in seq or not seq:
        return seq
    # Compare every rotation. Rotating to the first occurrence of the smallest
    # character is not enough when that character repeats
    # (e.g. "CAGCAA" -> "AACAGC", not "AGCAAC").
    return min(seq[i:] + seq[:i] for i in range(len(seq)))

# Catalog rows whose 'patho' value is anything other than '.'.
cat_pathos = catalog.loc[catalog['patho'].ne('.')].copy()
# Independent copy of the locus name plus its two motif columns.
known_motifs = patho.loc[:, ["Locus", "Motifs 1", "Motifs 2"]].copy()

def motif_picker(x):
    """
    Merge the "Motifs 1" and "Motifs 2" entries of a row into one motif set.

    Each entry that is a string is split on commas; non-string entries
    contribute nothing. The returned set holds every rotation of every
    motif, together with the reverse complement of each rotation.
    """
    candidates = []
    for column in ("Motifs 1", "Motifs 2"):
        entry = x[column]
        if isinstance(entry, str):
            candidates.extend(entry.split(','))
    rolls = {
        motif[shift:] + motif[:shift]
        for motif in candidates
        for shift in range(len(motif))
    }
    return rolls | {revcmp(rolled) for rolled in rolls}
# Row-wise: store the set returned by motif_picker in a new 'motifs' column.
known_motifs['motifs'] = known_motifs.apply(motif_picker, axis=1)


def anno_puller(x):
    """
    Collect the motif set described by a JSON annotation string.

    `x` is decoded with json.loads and iterated; the 'motif' value of each
    element is read. The returned set holds every rotation of every such
    motif, together with the reverse complement of each rotation.
    """
    rolls = {
        motif[shift:] + motif[:shift]
        for motif in (anno['motif'] for anno in json.loads(x))
        for shift in range(len(motif))
    }
    return rolls.union(map(revcmp, rolls))
# Decode each row's 'annos' JSON into its expanded motif set (see anno_puller).
cat_pathos['motifs'] = cat_pathos['annos'].apply(anno_puller)

In [20]:
# Loci whose name contains an underscore (the split HOXA13_n / ARX_n entries).
to_consolidate = known_motifs.loc[known_motifs['Locus'].str.contains('_')]
to_consolidate

Unnamed: 0,Locus,Motifs 1,Motifs 2,motifs
154,HOXA13_3,GCN,,"{GCN, CNG, NGC}"
155,HOXA13_2,GCN,,"{GCN, CNG, NGC}"
156,HOXA13_1,GCN,NGC,"{GCN, CNG, NGC}"
177,ARX_2,GCN,,"{GCN, CNG, NGC}"
178,ARX_1,GCN,GCG,"{GCN, CGG, CGC, NGC, GCC, GCG, GGC, CNG, CCG}"


In [21]:
# Fix the split ARX/HOXA13: union the motif sets of the numbered sub-entries and
# append one consolidated row per gene (its "Motifs 1"/"Motifs 2" are left as None).
arx_m = set.union(*to_consolidate.loc[to_consolidate['Locus'].str.startswith('ARX_'), 'motifs'])
hoxa13_m = set.union(*to_consolidate.loc[to_consolidate['Locus'].str.contains('HOXA13_'), 'motifs'])
known_motifs2 = pd.concat(
    [known_motifs]
    + [
        pd.DataFrame([[locus, None, None, merged]], columns=known_motifs.columns)
        for locus, merged in (("ARX", arx_m), ("HOXA13", hoxa13_m))
    ]
)

In [25]:
# Pair every pathogenic catalog region with the known-motif record of the same
# locus and count how many motif strings the two sets share.
rows = []
for _, m_patho in cat_pathos[["patho", "motifs"]].iterrows():
    partner = known_motifs2[known_motifs2['Locus'] == m_patho["patho"]]
    if len(partner) != 1:
        # Zero or several known-motif rows for this locus: report the locus name
        # (not the whole `patho` table) and skip it.
        print('messed up', m_patho["patho"])
        continue
    partner = partner.iloc[0]
    rows.append([m_patho['patho'],
                 m_patho['motifs'],
                 partner['motifs'],
                 len(m_patho['motifs'].intersection(partner['motifs']))
                ])
# One row per matched locus, indexed by locus name.
results = pd.DataFrame(
    rows, columns=["Patho", "CatMotif", "KnoMotif", "IntersectCnt"]
).set_index('Patho')

In [30]:
# Flag loci whose "Repeat type" in the patho table is exactly 'VNTR'.
# The assignment aligns on the index (locus name), so any locus in `results`
# that has no row in `patho` receives NaN rather than False.
results['isVNTR'] = patho.rename(columns={'Locus': "Patho"}).set_index("Patho")["Repeat type"] == 'VNTR'

In [31]:
# Number of loci that were paired with exactly one known-motif record.
len(results)

177

In [32]:
# Accuracy before dealing with Ns and non-parsimonious representations.
# A locus counts as a hit when the catalog and known motif sets share at least one string.
results['plain_compare'] = results['IntersectCnt'] != 0
# Prints the number of hits and the hit fraction.
print(results['plain_compare'].sum(), results['plain_compare'].mean())

78 0.4406779661016949


In [33]:
# Accuracy before dealing with Ns and non-parsimonious representations - by patho/vntr.
# Prints the hit count, then the hit fraction, for each isVNTR group.
print(results.groupby('isVNTR')['plain_compare'].sum(), results.groupby('isVNTR')['plain_compare'].mean())

isVNTR
False    47
True     31
Name: plain_compare, dtype: int64 isVNTR
False    0.758065
True     0.274336
Name: plain_compare, dtype: float64


In [9]:
# Loci with no exact motif string shared between catalog and known sets.
results[~results['plain_compare']]

Unnamed: 0_level_0,CatMotif,KnoMotif,IntersectCnt,plain_compare
Patho,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RERE,"{CTGCGGGAG, CCGCAGCTC, AGCTCCCGC, TGCGGGAGC, G...","{CTCTCGGATCTCCCG, TCGGATCTCCCGCTC, GGGAGATCCGA...",0,False
MACF1,{GGCACTGCAGCTGCTGGGGAGGTGGGTTCCTCTAGGGTGGGCACT...,"{GGCTCCTCTGGGGTGGGCACTGCAGCAGCTGGGGAGGTG, GGGT...",0,False
L1TD1,{CTTCCTCCTCCTCCTCTAGCCTCTTCTTCTTCTTCCTCCTCCAGC...,"{CCTGAGGTCTCTTCTTCCTCCTCCTCCTCCAGC, AGGAAGAAGA...",0,False
OVGP1,"{TGAGTCATCAGTCTG, GTCTGTGAGTCATCA, TCAGTCTGTGA...","{GTGGGTCATCAGTCTGTGACCCCT, TCATCAGTCTGTGACCCCT...",0,False
TCHH,"{TCCTGCTGCTGCCGGCGCTGCTCCTCT, CCTGCT, GGAGGGAG...","{CTGCTGCTCGCGCCTCAG, AGCAGCAGCTGAGGCGCG, CGCCT...",0,False
...,...,...,...,...
FXN,"{TTTTTTGTAT, TTTTTTTGTA, AAATACAAAA, CAAAAAAAT...","{TTC, CTT, T, AAG, TCT, A, GAA, AGA}",0,False
PRDM12,"{TGCACCGCCCCGCACGCGCACGCGCCCGCGC, TGCGGGGCGGTG...","{GCG, CGG, CGC, GCC, GGC, CCG}",0,False
ARX,"{CGCGGC, CCGG, CTG, CGGG, CCGC, TGC, GCCG, GGC...","{GCN, CGG, NGC, CGC, GCC, GCG, GGC, CNG, CCG}",0,False
SOX3,"{TGGGCG, CCGCTG, CGTGGG, GTGGGC, TGCCGC, CACGC...","{GCN, CNG, NGC}",0,False


In [34]:
# Some of the known have Ns. Need a comparison
def n_compare(x):
    """
    Wildcard-aware comparison of a row's known and catalog motif sets.

    Returns False straight away unless at least one motif in x['KnoMotif']
    contains an 'N'. Otherwise returns True when some known motif matches a
    catalog motif of the same length position by position, where an 'N' in
    the known motif matches any character. An 'N' in the catalog motif gets
    no special treatment.
    """
    known, cataloged = x['KnoMotif'], x['CatMotif']
    if not any('N' in motif for motif in known):
        return False
    return any(
        all(k_base == 'N' or k_base == c_base for k_base, c_base in zip(kno, cat))
        for kno in known
        for cat in cataloged
        if len(cat) == len(kno)
    )

In [35]:
# Row-wise wildcard comparison; True where an N-containing known set matches the catalog.
results['n_compare'] = results.apply(n_compare, axis=1)

In [39]:
# Accuracy after dealing with Ns.
# A locus is correct when either the exact or the wildcard comparison succeeded.
is_correct = results['plain_compare'] | results['n_compare']
# Prints the number of wildcard matches and the overall correct fraction.
print(results['n_compare'].sum(), is_correct.mean())

6 0.4745762711864407


In [40]:
# Accuracy after dealing with Ns - by patho/vntr.
# Rows whose isVNTR is NaN equal neither True nor False, so they are left out of both groups.
for i in [True, False]:
    print('VNTR' if i else "Patho")
    is_correct2 = results[results['isVNTR'] == i]['plain_compare'] | results[results['isVNTR'] == i]['n_compare']
    print(results[results['isVNTR'] == i]['n_compare'].sum(), is_correct2.mean())

VNTR
0 0.2743362831858407
Patho
4 0.8225806451612904


In [46]:
# Tally of isVNTR values, keeping missing (NaN) entries visible.
results['isVNTR'].value_counts(dropna=False)

True     113
False     62
NaN        2
Name: isVNTR, dtype: int64

In [48]:
# VNTR loci only, explicitly excluding rows where isVNTR is missing.
view = results[results['isVNTR'] & ~results['isVNTR'].isna()]
# Correct vs. incorrect counts among those loci.
(view['plain_compare'] | view['n_compare']).value_counts()

False    82
True     31
dtype: int64

In [42]:
# Write the VNTR loci that neither comparison matched to a tab-separated file.
results[~is_correct & results['isVNTR']].to_csv('hold.txt', sep='\t')

In [None]:
# 9 remaining that don't match.
# Let's manually inspect them

In [154]:
# Manual inspection of one locus: print its catalog motifs, a blank line,
# then its known motifs (each list sorted).
n = "ZNF713"
print('\n'.join(sorted(list(results.loc[n]['CatMotif']))))
print()
print('\n'.join(sorted(list(results.loc[n]['KnoMotif']))))

ACAGCGGGG
AGCGGGGAC
CAGCGGGGA
CCCCGCTGT
CCCGCCGCCGCCG
CCCGCTGTC
CCGCCCGCCGCCG
CCGCCGCCCGCCG
CCGCCGCCGCCCG
CCGCCGCCGCCGC
CCGCTGTCC
CGCCCGCCGCCGC
CGCCGCCCGCCGC
CGCCGCCGCCCGC
CGCCGCCGCCGCC
CGCTGTCCC
CGGCGGCGGCGGG
CGGCGGCGGGCGG
CGGCGGGCGGCGG
CGGGCGGCGGCGG
CGGGGACAG
CTGTCCCCG
GACAGCGGG
GCCCGCCGCCGCC
GCCGCCCGCCGCC
GCCGCCGCCCGCC
GCCGCCGCCGCCC
GCGGCGGCGGCGG
GCGGCGGCGGGCG
GCGGCGGGCGGCG
GCGGGCGGCGGCG
GCGGGGACA
GCTGTCCCC
GGACAGCGG
GGCGGCGGCGGCG
GGCGGCGGCGGGC
GGCGGCGGGCGGC
GGCGGGCGGCGGC
GGGACAGCG
GGGCGGCGGCGGC
GGGGACAGC
GTCCCCGCT
TCCCCGCTG
TGTCCCCGC

CCG
CGC
CGG
GCC
GCG
GGC


- ZIC2 matches because GCn matches with GCaGCgGCg
- FOXL2 matches because GCn pairs with GCAGCCGCAGCGGCT
- SOX3 matches : known GCN is equivalent to catalog GCAGCG
- TMEM185A matches : known CCG is equivalent to CCGCCG

- AFF3 failed
- FXN failed
- PRDM12 failed - but I also think that it's a mix of repeats.. like they go back and forth between two motifs..
- CSTB was close - cat has CCCCGCG, known has CCCCGCCCCGCG
- ZNF713 was close - kno has GCC, cat has GCCGCCGCCGCCC - just one extra base

So, 2 hard failures, 1 maybe explainable failure, 2 close but no cigars, and 4 that I could probably programmatically find.

- Total Patho in catalog: 62
- Total Fails: 5
- Final Accuracy: 91.9%
- If I can excuse 3 maybe/close - 96.8%

In [50]:
# Treat loci with a missing isVNTR value as non-VNTR from here on.
results['isVNTR'] = results['isVNTR'].fillna(False)

In [65]:
# Same length
def glens(x):
    """
    Compare the motif lengths found in a row's catalog and known motif sets.

    Returns a tuple (any_len_inter, any_1bp_inter):
      - any_len_inter: True when some catalog motif has exactly the same
        length as some known motif.
      - any_1bp_inter: True when some catalog motif length is equal to, or
        one more or one less than, some known motif length.
    """
    cat_lens = {len(motif) for motif in x['CatMotif']}
    kno_lens = {len(motif) for motif in x['KnoMotif']}

    any_len_inter = bool(cat_lens & kno_lens)
    any_1bp_inter = any(
        not kno_lens.isdisjoint((size - 1, size, size + 1)) for size in cat_lens
    )
    return any_len_inter, any_1bp_inter
# Length comparison for VNTR loci only; each element is the 2-tuple returned by glens.
lcomps = results[results['isVNTR']].apply(glens, axis=1)

In [66]:
# VNTR loci with / without an exact motif-length match (first tuple element).
lcomps.apply(lambda x: x[0]).value_counts()

False    61
True     52
dtype: int64

In [67]:
# VNTR loci with / without a motif-length match within 1bp (second tuple element).
lcomps.apply(lambda x: x[1]).value_counts()

True     63
False    50
dtype: int64