# Manual correction for claim

### Import data

In [1]:
import sys, os
import json
from tqdm import tqdm

path_root = '../../'
path_workd = path_root + '/analysis/4_check_misidentified'

# dir_churu = path_workd + '/1697001520/output/'
path_match = path_workd + '/1697001520.compare_result.before_verification.txt'
path_meta = path_workd + '/1697001520.parse.filt.table'
path_guess = path_workd + '/cellname_guesse_full.txt'
path_cello = path_workd + '/cellosaurus.txt'
path_cello_parsed = path_workd + '/cellosaurus.table.reform.acc.cvcl'
# path_institute = path_workd + '/institute_selected.parsed.corrected.txt'

In [2]:
# Import cellosaurus data
def get_lineage(child, child2parent):
    # this dict contain information about "originate from same individual"
    parents = child2parent.get(child,'').split('; ')
    if [child] == parents:
        return parents
    else:
        lst = [pp for p in parents for pp in get_lineage(p, child2parent)]
        return parents + lst

format_lineage = lambda x: list(set([x] + get_lineage(x, child2parent)) - set(['']))

with open(path_cello_parsed) as f:
    lines = f.readlines()
    items = map(lambda x: x.rstrip('\n').split('\t'), lines)
    child2parent = {x[0]:x[2] for x in items}
    # child: [parent1, parent2, ... ]
    child2lineage = {x:list(set(get_lineage(x, child2parent)) - set(['','-'])) for x in child2parent}
    # child: [lineage1, lineage2, lineage3, ...]

with open(path_cello) as f:
    lines = f.readlines()
    cell_id = ''
    name2problems = {} # problematic
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('//'):
            cell_id = ''
        elif line.startswith('ID'):
            cell_id = line.split('   ')[-1]
            name2problems.setdefault(cell_id,[])
        elif cell_id == '':
            continue
        elif line.startswith('CC'):
            if 'problematic' in line.lower():
                name2problems[cell_id].append(line.split('   ')[-1])

In [3]:
# Import authentication, guess data, institute data
with open(path_match) as f:
    lines = f.readlines()
    items = map(lambda x: x.rstrip('\n').split('\t'), lines)
    sam2match = {x[0]: x[1:] for x in items}
    sam_mismatched = [k for k,v in sam2match.items() if not v[2].startswith('skip')]
    # sam: [call, claim, result]

with open(path_guess) as f:
    get_doc = lambda x: x['prompt1'].split('\n    ')[2].lstrip('Given context to you: "').rstrip('"')
    json_concat = json.load(f)
    sam2guess = {k:v for json in json_concat for k, v in json.items()}
    sam2doc = {k:get_doc(v) for k,v in sam2guess.items()}
    # sam: {id:?, prompt1:?, gpt-4o-mini-response-1:?, cellosaurus-candidates:?, prompt2:? gpt-4o-mini-response-2:?, final-choice:?}

with open(path_meta) as f:
    lines = f.readlines()
    items = map(lambda x: x.rstrip('\n').split('\t'), lines)
    sam2srr = {x[10]:x[0] for x in items}
    # sam: srr

### Verification

In [4]:
import re
from thefuzz import fuzz, process

format_red = "\x1b[1;30;41m%s\x1b[m"
format_orange = "\x1b[1;30;43m%s\x1b[m"
format_blue = "\x1b[1;30;44m%s\x1b[m"
format_sky = "\x1b[1;30;46m%s\x1b[m"
format_grey = "\x1b[1;31;47m%s\x1b[m"

clean = lambda name: re.sub('\W','',name).upper()

def add_color(text, call, claim):
    claim = re.sub(r'([\[\]])', r'\\\1', claim)
    call_clean, claim_clean = clean(call), clean(claim)
    text = re.sub(call_clean, format_sky % call_clean, text, flags=re.IGNORECASE)
    text = re.sub(claim_clean, format_orange % claim_clean, text, flags=re.IGNORECASE)
    text = re.sub(call, format_blue % call, text, flags=re.IGNORECASE)
    text = re.sub(claim, format_red % claim, text, flags=re.IGNORECASE)
    # if nothing found in text, mark grey
    if "\x1b" not in text:
        text = format_grey % text
    return text

def find_cell_from_churu(name, churu):
    name2churu = {line.split('\t')[1]:line for line in churu}
    names_match = process.extract(name, list(name2churu.keys()), limit=2)
    return [name2churu[n[0]] for n in names_match]

def format_churu(churu, n_line=5):
    return '\n'.join(churu[:n_line])

def format_problem(name, name2problem):
    problems = name2problem[name]
    if problems:
        return f"{name}: \n\t" + '\n\t'.join(name2problem[name])
    else:
        return f"{name}: " + '-'

In [5]:
def pick_likely_claim(claims, doc):
    # return first mamtch
    for claim in claims:
        if re.findall(claim, doc, flags=re.IGNORECASE):
            return claim
    else: # if nothing found on claims, just return first one
        return claims[0]

def show_info(mysam):
    text = ''
    call, claim, curation = sam2match[mysam][:3]
    doc = sam2doc[mysam]
    if '; ' in claim: # multiple guesses
        text += '>>> Multiple guess case <<<' + '\n'
        text += format_red % claim + '\n'
        claims = claim.split('; ')
        claim = pick_likely_claim(claims, doc)
    # churu = sam2churu[mysam]
    # churu_call = find_cell_from_churu(clean(call), churu)
    # churu_claim = find_cell_from_churu(clean(claim), churu)
    text += "="*10 + " NAME  " + "="*10 + '\n'
    text += f"call  : {format_blue % call}" + '\n'
    text += f"claim : {format_red % claim}" + '\n'
    text += f"curation: {curation}" + '\n'
    text += "="*10 + " LINE  " + "="*10 + '\n'
    text += add_color(' --> '.join(format_lineage(call)), call, claim) + '\n'
    text += add_color(' --> '.join(format_lineage(claim)), call, claim) + '\n'
    text += "="*10 + " PROB  " + "="*10 + '\n'
    text += add_color(format_problem(call, name2problems), call, claim) + '\n'
    text += add_color(format_problem(claim, name2problems), call, claim) + '\n'
    text += "="*10 + " DOC   " + "="*10 + '\n'
    text += add_color(doc, call, claim) + '\n'
    # text += "="*10 + " TOP 5 " + "="*10 + '\n'
    # text += add_color(format_churu(churu,5), call, claim) + '\n'
    # text += "="*10 + " CALL  " + "="*10 + '\n'
    # text += add_color(format_churu(churu_call, 2), call, claim) + '\n'
    # text += "="*10 + " CLAIM " + "="*10 + '\n'
    # text += add_color(format_churu(churu_claim, 2), call, claim) + '\n'
    text += "="*27
    return text

In [6]:
# Let's check Known contams from the "match"

sam_match = [sam for sam in sam_mismatched if sam not in sam_done if sam2match[sam][2].startswith('match')]
sam_interest = []
print(len(sam_target))
counter = {}
for sam in sam_match:
    call, claim, curation = sam2match[sam]
    doc = sam2doc[sam]
    if '; ' in claim: # multiple guesses
        text += '>>> Multiple guess case <<<' + '\n'
        text += format_red % claim + '\n'
        claims = claim.split('; ')
        claim = pick_likely_claim(claims, doc)
    # print(format_problem(call, name2problems))
    prob_call = name2problems[call]
    prob_claim = name2problems[claim]
    # print(sam, call, claim, prob_call + prob_claim)
    if prob_call:
        key = f"{curation} / call"
        counter.setdefault(key,0)
        counter[key] += 1
    if prob_claim:
        key = f"{curation} / claim"
        counter.setdefault(key,0)
        counter[key] += 1
        if key == 'match_lineage / claim':
            sam_interest.append(sam)
            
for k in sorted(counter.keys()):
    print(f"{k}: {counter[k]}")

NameError: name 'sam_done' is not defined

In [92]:
# unique claim > call sets
len(set([f"{' > '.join(sam2match[sam][:2])}" for sam in sam_interest]))
pattern2sams = {}
for sam in sam_interest:
    pattern = ' > '.join(sam2match[sam][:2])
    pattern2sams.setdefault(pattern,[]).append(sam)

In [94]:
# show one representative cases, and apply to all
pattern2response = {}
for pattern in tqdm(pattern2sams.keys()):
    mysam = pattern2sams[pattern][0]
    tic = time.time()
    text = show_info(mysam)
    print(mysam)
    print(text)
    response = input('Your response: ') or ""
    pattern2response[pattern] = response
    clear_output(wait=False)

100%|██████████████████████████████████████████████████████████████| 61/61 [10:17<00:00, 10.12s/it]


In [101]:
# now fill the sam with the responses
sam2response = {}
for pattern in pattern2sams:
    sams = pattern2sams[pattern]
    response = pattern2response[pattern]
    for sam in sams:
        sam2response[sam] = response

In [107]:
output_txt = path_workd + '/manual_check_known_contam_from_match_lineage.txt'

lines = []
for sam in sorted(sam2response.keys()):
    call, claim, curation = sam2match[sam]
    lines.append(f"{sam}\t{call}\t{claim}\t{sam2response[sam]}\n")

with open(output_txt, 'w') as f:
    f.writelines(lines)

In [19]:
mysam = 'SAMN15086646'
print(show_info(mysam))

call  : [1;30;44mU-251MG[m
claim : [1;30;41mU251-TR3[m
curation: match_lineage
[1;30;44mU-251MG[m
[1;30;44mU-251MG[m --> [1;30;41mU251-TR3[m
[1;30;44mU-251MG[m: -
[1;30;41mU251-TR3[m: 
	Problematic cell line: Misidentified. Originally thought (PubMed=19584161) to be a A-172 (Cellosaurus=CVCL_0131) derivative but shown to be a [1;30;44mU-251MG[m derivative (DOI=10.1158/1078-0432.CCR-13-1821).
[1;31;47msample id: SAMN15086646
title: U251TR rep2
sample name: -
source_name: glioblastoma cells
tissue: glioblastoma
cell line: U251TR
phenotype: temozolomide resistant cells[m
