In [38]:
import json
import math
import os

import numpy as np
import pandas as pd
import truvari

In [104]:
# Base-complement lookup table. Lowercase bases are complemented as well and
# keep their case; any character not listed here passes through unchanged.
compliment = str.maketrans("ATCGNatcgn", "TAGCNtagcn")


def revcmp(x):
    """
    Return the reverse complement of the sequence string `x`
    """
    return x.translate(compliment)[::-1]

In [106]:
# Root of the adotto checkout. The default reproduces the original hard-coded
# paths; set the ADOTTO_DIR environment variable to run the notebook elsewhere.
ADOTTO_DIR = os.environ.get("ADOTTO_DIR", "/Users/english/code/adotto")
catalog_fn = os.path.join(ADOTTO_DIR, "regions", "adotto_TRregions_v1.1.bed")
patho_fn = os.path.join(ADOTTO_DIR, "pathogenic", "Patho.tsv")

# Both files are read as tab-separated tables
catalog = pd.read_csv(catalog_fn, sep='\t')
patho = pd.read_csv(patho_fn, sep='\t')

def smallest_roll(seq):
    """
    Return the lexicographically smallest rotation ("roll") of a motif

    Special cases:
    - a float input returns "GCN" (the HOXA13 and ARX cheat)
    - a string containing a comma is returned unchanged
    - an empty string is returned unchanged
    """
    if isinstance(seq, float):
        return "GCN" # HOXA13 and ARX cheat
    if ',' in seq:
        return seq
    if not seq:
        return seq
    # Compare every rotation. Rotating to the first occurrence of the smallest
    # character is not enough when that character repeats (e.g. "CAGA" -> "ACAG").
    return min(seq[i:] + seq[:i] for i in range(len(seq)))

# Keep only catalog rows whose 'patho' value is something other than '.'
cat_pathos = catalog.loc[catalog["patho"].ne(".")].copy()
# Locus name plus the two motif columns from the pathogenic table
known_motifs = patho.loc[:, ["Locus", "Motifs 1", "Motifs 2"]].copy()

def motif_picker(x):
    """
    Merge a row's "Motifs 1" and "Motifs 2" values into one set of motifs

    A column is split on commas when it holds a string and is skipped
    otherwise. Every motif contributes all of its rotations together with the
    reverse complement of each rotation.
    """
    candidates = []
    for column in ("Motifs 1", "Motifs 2"):
        value = x[column]
        if isinstance(value, str):
            candidates.extend(value.split(','))

    collected = set()
    for motif in candidates:
        for shift in range(len(motif)):
            rolled = motif[shift:] + motif[:shift]
            collected.update((rolled, revcmp(rolled)))
    return collected
# One set of accepted motif forms per known locus
known_motifs = known_motifs.assign(motifs=known_motifs.apply(motif_picker, axis=1))


def anno_puller(x):
    """
    Collect motif forms from a JSON-encoded list of annotations

    `x` is parsed with json.loads. The 'motif' value of each parsed entry is
    expanded into all of its rotations plus the reverse complement of each
    rotation, and everything is returned as one set.
    """
    forms = set()
    for entry in json.loads(x):
        unit = entry['motif']
        rotations = [unit[k:] + unit[:k] for k in range(len(unit))]
        forms.update(rotations)
        forms.update(revcmp(rolled) for rolled in rotations)
    return forms
# One set of motif forms per pathogenic catalog region, parsed from 'annos'
cat_pathos = cat_pathos.assign(motifs=cat_pathos["annos"].apply(anno_puller))

In [107]:
# Known loci whose name contains an underscore
to_consolidate = known_motifs.loc[known_motifs["Locus"].str.contains("_")]
to_consolidate

Unnamed: 0,Locus,Motifs 1,Motifs 2,motifs
3,ARX_1,GCN,GCG,"{CCG, GCC, GCG, GGC, NGC, GCN, CGC, CGG, CNG}"
4,ARX_2,GCN,,"{NGC, GCN, CNG}"
30,HOXA13_1,GCN,NGC,"{NGC, GCN, CNG}"
31,HOXA13_2,GCN,,"{NGC, GCN, CNG}"
32,HOXA13_3,GCN,,"{NGC, GCN, CNG}"


In [110]:
# Fix the split ARX/HOXA13
# Union the motif sets of each gene's "_N" rows, then append one combined row
# per gene (as separate single-row frames) to the known-motif table.
arx_m = set.union(
    *to_consolidate.loc[to_consolidate["Locus"].str.startswith("ARX_"), "motifs"]
)
hoxa13_m = set.union(
    *to_consolidate.loc[to_consolidate["Locus"].str.contains("HOXA13_"), "motifs"]
)
known_motifs2 = pd.concat([
    known_motifs,
    *(
        pd.DataFrame([[locus, None, None, merged]], columns=known_motifs.columns)
        for locus, merged in (("ARX", arx_m), ("HOXA13", hoxa13_m))
    ),
])

In [127]:
# Pair every pathogenic catalog region with its known-motif row and count how
# many motif forms the two sets share.
# The loop variable is deliberately not named `patho`: that name holds the
# pathogenic table loaded earlier, and rebinding it to a row would break
# re-running the cells that read from that table.
rows = []
for _, cat_row in cat_pathos[["patho", "motifs"]].iterrows():
    partner = known_motifs2[known_motifs2['Locus'] == cat_row["patho"]]
    # Exactly one known row is expected per locus; report and skip otherwise
    if len(partner) != 1:
        print('messed up', cat_row)
        continue
    partner = partner.iloc[0]
    rows.append([cat_row['patho'],
                 cat_row['motifs'],
                 partner['motifs'],
                 len(cat_row['motifs'].intersection(partner['motifs']))
                ])
# Build the frame in one expression instead of mutating it with inplace=True
results = pd.DataFrame(
    rows, columns=["Patho", "CatMotif", "KnoMotif", "IntersectCnt"]
).set_index('Patho')

In [157]:
# Number of catalog regions that were paired with a known locus
results.shape[0]

62

In [156]:
# Accuracy before dealing with Ns and non-parsimonious representations
# A region counts as a plain match when at least one motif form is shared.
results["plain_compare"] = results["IntersectCnt"].ne(0)
print(
    results["plain_compare"].sum(),
    results["plain_compare"].mean(),
)

47 0.7580645161290323


In [129]:
# Regions with no shared motif form
results.loc[~results["plain_compare"]]

Unnamed: 0_level_0,CatMotif,KnoMotif,IntersectCnt,plain_compare
Patho,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZIC2,"{CCGCTGCCG, GCTGCCGCC, CGGCAGCGG, AGCGGCGGC, G...","{NGC, GCN, CNG}",0,False
AFF3,"{CGCGGGGCG, CGCGCCCCG, AGGGCCGCCCC, CCCCGCGCCC...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False
HOXD13,"{AGGCCGG, GCCCTGGCCCG, GGGAGAGA, GTC, CGGCCCTG...","{NGC, GCN, CNG}",0,False
CSTB,"{CCCCGCG, CGGGGCG, GCGCCCC, GCGGGGC, CCGCGCC, ...","{CGCCCCGCGCCC, GGGGCGGGGCGC, CCCGCCCCGCGC, GCG...",0,False
TBX1,"{GCC, AGCCCC, CCAGCC, CTGGGG, GCCCCA, CCCCAG, ...","{NGC, GCN, CNG}",0,False
FOXL2,"{GGCTGCAGCCGCAGC, CAGCCGCAGCGGCTG, CAGCGGCTGCA...","{NGC, GCN, CNG}",0,False
PHOX2B,"{GCCGCTGCGCGCT, GCGCTGCCGCTGC, GCC, GGGCCCGG, ...","{NGC, GCN, CNG}",0,False
RUNX2,"{GCCGCCGCAGCCGCCGCC, CGGCGGCTGCGGCGGCGG, GCT, ...","{NGC, GCN, CNG}",0,False
HOXA13,"{GCCGGG, CGCGATCCCGCCTGG, CGGGATCGCGCCAGG, AAG...","{GCN, NGC, CNG}",0,False
ZNF713,"{GCGGGGACA, TGTCCCCGC, CAGCGGGGA, GGGCGGCGGCGG...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False


In [130]:
# Some of the known have Ns. Need a comparison
def n_compare(x):
    """
    Compare known and catalog motifs, treating 'N' in a known motif as a wildcard

    Returns False straight away unless at least one motif in x['KnoMotif']
    contains an 'N'. Otherwise returns True as soon as any known motif matches
    a catalog motif of the same length position by position, where a known 'N'
    matches any character. Returns False if no pair matches.
    """
    known = x['KnoMotif']
    if not any('N' in motif for motif in known):
        return False

    catalog_motifs = x['CatMotif']
    for kno in known:
        for cat in catalog_motifs:
            # Only motifs of equal length are compared
            if len(cat) != len(kno):
                continue
            if all(k == 'N' or k == c for k, c in zip(kno, cat)):
                return True
    return False

In [131]:
# Row-wise wildcard comparison of known vs catalog motifs
results = results.assign(n_compare=results.apply(n_compare, axis=1))

In [158]:
# Accuracy after dealing with Ns
# A region is correct when either the plain or the wildcard comparison matched.
is_correct = results["n_compare"] | results["plain_compare"]
print(
    results["n_compare"].sum(),
    is_correct.mean(),
)

6 0.8548387096774194


In [135]:
# Regions that matched under neither comparison
results.loc[~is_correct]

Unnamed: 0_level_0,CatMotif,KnoMotif,IntersectCnt,plain_compare,n_compare
Patho,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ZIC2,"{CCGCTGCCG, GCTGCCGCC, CGGCAGCGG, AGCGGCGGC, G...","{NGC, GCN, CNG}",0,False,False
AFF3,"{CGCGGGGCG, CGCGCCCCG, AGGGCCGCCCC, CCCCGCGCCC...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False,False
CSTB,"{CCCCGCG, CGGGGCG, GCGCCCC, GCGGGGC, CCGCGCC, ...","{CGCCCCGCGCCC, GGGGCGGGGCGC, CCCGCCCCGCGC, GCG...",0,False,False
FOXL2,"{GGCTGCAGCCGCAGC, CAGCCGCAGCGGCTG, CAGCGGCTGCA...","{NGC, GCN, CNG}",0,False,False
ZNF713,"{GCGGGGACA, TGTCCCCGC, CAGCGGGGA, GGGCGGCGGCGG...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False,False
FXN,"{TGTATTTTTT, TTTTGTATTT, TTTGTATTTT, TTTTTGTAT...","{GAA, CTT, T, A, AAG, TTC, AGA, TCT}",0,False,False
PRDM12,"{CCCCGCGC, CGCGGGGG, CCGCACGCGCACGCGCCCGCGCTGC...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False,False
SOX3,"{CAGCGG, TGGGCG, GTCTTG, AGACCA, GCGTGG, TGCCG...","{NGC, GCN, CNG}",0,False,False
TMEM185A,"{GAG, GCCAGC, CGCCGC, CCGCCG, CCT, CTC, GCTGGC...","{CCG, GCC, GCG, GGC, CGC, CGG}",0,False,False


In [None]:
# 9 remaining that don't match.
# Let's manually inspect them

In [154]:
# Print the sorted catalog motifs, a blank line, then the sorted known motifs
n = "ZNF713"
print('\n\n'.join(
    '\n'.join(sorted(results.loc[n][column]))
    for column in ("CatMotif", "KnoMotif")
))

ACAGCGGGG
AGCGGGGAC
CAGCGGGGA
CCCCGCTGT
CCCGCCGCCGCCG
CCCGCTGTC
CCGCCCGCCGCCG
CCGCCGCCCGCCG
CCGCCGCCGCCCG
CCGCCGCCGCCGC
CCGCTGTCC
CGCCCGCCGCCGC
CGCCGCCCGCCGC
CGCCGCCGCCCGC
CGCCGCCGCCGCC
CGCTGTCCC
CGGCGGCGGCGGG
CGGCGGCGGGCGG
CGGCGGGCGGCGG
CGGGCGGCGGCGG
CGGGGACAG
CTGTCCCCG
GACAGCGGG
GCCCGCCGCCGCC
GCCGCCCGCCGCC
GCCGCCGCCCGCC
GCCGCCGCCGCCC
GCGGCGGCGGCGG
GCGGCGGCGGGCG
GCGGCGGGCGGCG
GCGGGCGGCGGCG
GCGGGGACA
GCTGTCCCC
GGACAGCGG
GGCGGCGGCGGCG
GGCGGCGGCGGGC
GGCGGCGGGCGGC
GGCGGGCGGCGGC
GGGACAGCG
GGGCGGCGGCGGC
GGGGACAGC
GTCCCCGCT
TCCCCGCTG
TGTCCCCGC

CCG
CGC
CGG
GCC
GCG
GGC


- ZIC2 matches because GCn matches with GCaGCgGCg
- FOXL2 matches because GCn pairs with GCAGCCGCAGCGGCT
- SOX3 matches : known GCN is equivalent to catalog GCAGCG
- TMEM185A matches : known CCG is equivalent to catalog CCGCCG

- AFF3 failed
- FXN failed
- PRDM12 failed - but I think the locus is a mix of repeats, i.e. it alternates back and forth between two motifs.
- CSTB was close - cat has CCCCGCG, known has CCCCGCCCCGCG
- ZNF713 was close - kno has GCC, cat has GCCGCCGCCGCCC - just one extra base

So, 2 hard failures, 1 maybe explainable failure, 2 close but no cigars, and 4 that I could probably programmatically find.

- Total Patho in catalog: 62
- Total Fails: 5
- Final Accuracy: 91.9%
- If I can excuse 3 maybe/close - 96.8%