# Manual verification of misidentified data

### Import data

In [1]:
import sys, os
import json
from tqdm import tqdm

# Relative project root; every other location is derived from it.
path_root = '../../'
# Working directory of this analysis step.
path_workd = f"{path_root}/analysis/4_check_misidentified"

# Directory with one sub-directory per run, each holding a `churu.out` file.
dir_churu = f"{path_workd}/1697001520/output/"
# Tab-separated comparison result (sample, call, claim, result).
path_match = f"{path_workd}/1697001520.compare_result.before_verification.txt"
# Tab-separated metadata table (run and sample identifiers are read from it).
path_meta = f"{path_workd}/1697001520.parse.filt.table"
# JSON file with the cell-name guesses and the prompts they came from.
path_guess = f"{path_workd}/cellname_guesse_full.txt"
# Cellosaurus flat file (read line by line for ID / CC records).
path_cello = f"{path_workd}/cellosaurus.txt"
# Tab-separated, pre-parsed Cellosaurus table (child / parent columns).
path_cello_parsed = f"{path_workd}/cellosaurus.table.reform.acc.cvcl"
# Tab-separated institute annotation per sample.
path_institute = f"{path_workd}/institute_selected.parsed.corrected.txt"

In [2]:
# Import cellosaurus data
def get_lineage(child, child2parent):
    """Collect the ancestors of ``child`` by walking ``child2parent`` upwards.

    ``child2parent`` maps a name to a ``'; '``-separated string of parent
    names (per the original author it encodes the "originate from same
    individual" relation). A name missing from the mapping is treated as
    having the single parent ``''``.

    Returns the direct parents first, followed by the lineage of each parent
    in turn. The list is not de-duplicated and may contain placeholder
    entries such as ``''``; callers filter those out.
    """
    direct = child2parent.get(child, '').split('; ')
    # A name that is its own sole parent terminates the walk.
    if direct == [child]:
        return direct
    inherited = []
    for parent in direct:
        inherited.extend(get_lineage(parent, child2parent))
    return direct + inherited

def format_lineage(x):
    """Return ``x`` followed by its distinct ancestors, without empty names.

    The names come from ``get_lineage(x, child2parent)`` (module-level
    mapping). Callers join the result with ``' --> '``, so the order matters:
    ``x`` comes first and each ancestor appears once, in the order
    ``get_lineage`` first reports it. The previous set-based de-duplication
    produced an arbitrary order that could differ from run to run.
    """
    names = [x] + get_lineage(x, child2parent)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [name for name in dict.fromkeys(names) if name != '']

# Build the parent lookup and the full lineage of every entry from the
# pre-parsed Cellosaurus table.
with open(path_cello_parsed) as f:
    # child -> raw parent field (column 2); get_lineage splits it on '; '
    child2parent = {}
    for raw in f:
        fields = raw.rstrip('\n').split('\t')
        child2parent[fields[0]] = fields[2]

    # child -> distinct ancestors, with the placeholders '' and '-' removed
    placeholders = set(['', '-'])
    child2lineage = {
        child: list(set(get_lineage(child, child2parent)) - placeholders)
        for child in child2parent
    }

# Scan the Cellosaurus flat file and collect, per entry, every CC line that
# mentions "problematic" (case-insensitive).
with open(path_cello) as f:
    name2problems = {} # problematic
    cell_id = ''  # '' means "not inside an entry"
    for raw in f:
        line = raw.rstrip('\n')
        if line.startswith('//'):
            # entry terminator
            cell_id = ''
            continue
        if line.startswith('ID'):
            # the identifier is the text after the last three-space gap
            cell_id = line.split('   ')[-1]
            name2problems.setdefault(cell_id, [])
            continue
        if cell_id and line.startswith('CC') and 'problematic' in line.lower():
            name2problems[cell_id].append(line.split('   ')[-1])

In [4]:
# Import authentication, guess data, institute data
# Comparison result: first column is the sample, the rest is kept as a list.
with open(path_match) as f:
    rows = (raw.rstrip('\n').split('\t') for raw in f)
    # sam: [call, claim, result]
    sam2match = {row[0]: row[1:] for row in rows}
    # samples whose result column says 'mismatch'
    sam_mismatched = [
        sam for sam, fields in sam2match.items() if fields[2] == 'mismatch'
    ]

# Literal text that precedes the sample document inside 'prompt1'.
_DOC_PREFIX = 'Given context to you: "'


def get_doc(x):
    """Extract the quoted sample document from a guess record.

    The document is the third ``'\\n    '``-separated piece of
    ``x['prompt1']``. The leading ``Given context to you: "`` and a single
    closing ``"`` are removed when present.

    ``str.lstrip``/``str.rstrip`` are deliberately not used: they strip a
    *set of characters*, so a document starting with any letter of the
    prefix (e.g. ``t``, ``i``, ``c``) would lose its first characters, and
    every trailing quote would be removed rather than just the closing one.
    """
    doc = x['prompt1'].split('\n    ')[2]
    if doc.startswith(_DOC_PREFIX):
        doc = doc[len(_DOC_PREFIX):]
    if doc.endswith('"'):
        doc = doc[:-1]
    return doc


with open(path_guess) as f:
    # The file holds a JSON list of {sample: record} dictionaries.
    json_concat = json.load(f)
    # `record` (not `json`) so the json module is not shadowed in the loop.
    sam2guess = {k: v for record in json_concat for k, v in record.items()}
    sam2doc = {k: get_doc(v) for k, v in sam2guess.items()}
    # sam: {id:?, prompt1:?, gpt-4o-mini-response-1:?, cellosaurus-candidates:?, prompt2:? gpt-4o-mini-response-2:?, final-choice:?}

# Metadata table: column 10 is used as the sample key, column 0 as its value.
with open(path_meta) as f:
    sam2srr = {}
    for raw in f:
        fields = raw.rstrip('\n').split('\t')
        # sam: srr
        sam2srr[fields[10]] = fields[0]

# Institute annotation: first column is the sample, the rest is kept as a list.
with open(path_institute) as f:
    rows = (raw.rstrip('\n').split('\t') for raw in f)
    # sam: [owner, ror_id, institute, country, relation]
    sam2institute = {row[0]: row[1:] for row in rows}

In [5]:
# Load the raw authentication output (churu.out) of every mismatched sample.
# This takes a while (2-3 minutes).
sam2churu = {}
for sam in tqdm(sam_mismatched): # only mismatched samples
    path_churu = f"{dir_churu}{sam2srr[sam]}/churu.out"
    with open(path_churu) as f:
        next(f, None)  # drop the first line of the file
        churu = [row.rstrip('\n') for row in f]
    sam2churu[sam] = churu

100%|██████████████████████████████████████████████████████████| 5127/5127 [03:49<00:00, 22.36it/s]


### Verification

In [6]:
import re
from thefuzz import fuzz, process

def _sgr_template(fg, bg):
    """Return a ``%s`` template that wraps text in a bold ANSI SGR colour pair."""
    return "\x1b[1;{};{}m%s\x1b[m".format(fg, bg)


# ANSI SGR codes: 30/31 = black/red foreground, 41/43/44/46/47 =
# red/yellow/blue/cyan/white background.
format_red = _sgr_template(30, 41)
format_orange = _sgr_template(30, 43)
format_blue = _sgr_template(30, 44)
format_sky = _sgr_template(30, 46)
format_grey = _sgr_template(31, 47)

def clean(name):
    """Return ``name`` upper-cased with every non-word character removed.

    "Word characters" are those matched by the regex class ``\\w`` (letters,
    digits and underscore), so ``'MCF-7 TH'`` becomes ``'MCF7TH'``.
    """
    # Raw string: a plain '\W' literal is an invalid escape sequence.
    return re.sub(r'\W', '', name).upper()

def add_color(text, call, claim):
    """Highlight occurrences of the called and claimed cell names in ``text``.

    Four case-insensitive passes are applied in this order, each replacing a
    match with the coloured canonical spelling of the name:

    1. cleaned call  (``clean(call)``)  -> sky
    2. cleaned claim (``clean(claim)``) -> orange
    3. exact call                       -> blue
    4. exact claim                      -> red

    If no pass matched anything, the whole text is wrapped in the grey
    template instead.

    Names are matched literally. Previously they were used as raw regular
    expressions (only ``[``/``]`` of the claim were escaped), so names such
    as ``SK-N-BE(2)`` or ``C3.A`` matched the wrong text or raised, and the
    backslash-escaped claim leaked into the displayed output.
    """
    passes = (
        (clean(call), format_sky),
        (clean(claim), format_orange),
        (call, format_blue),
        (claim, format_red),
    )
    for name, template in passes:
        if not name:
            # An empty pattern matches at every position; nothing to mark.
            continue
        colored = template % name
        # A function replacement keeps `colored` from being parsed as a
        # regex replacement template (backslashes, group references).
        text = re.sub(re.escape(name), lambda _match: colored, text,
                      flags=re.IGNORECASE)
    # if nothing found in text, mark grey
    if "\x1b" not in text:
        text = format_grey % text
    return text

def find_cell_from_churu(name, churu):
    """Return the churu lines whose cell name best fuzzy-matches ``name``.

    ``churu`` is a list of tab-separated lines; the second column is used as
    the cell name. When several lines share a name, the last one wins. The
    two best candidates from ``process.extract`` are returned as their
    original lines, in the order ``process.extract`` reports them.
    """
    line_by_name = {}
    for row in churu:
        line_by_name[row.split('\t')[1]] = row
    best = process.extract(name, list(line_by_name), limit=2)
    # each hit starts with the matched choice (presumably followed by a score)
    return [line_by_name[hit[0]] for hit in best]

def format_churu(churu, n_line=5):
    """Join the first ``n_line`` entries of ``churu`` with newlines."""
    shown = churu[:n_line]
    return '\n'.join(shown)

def format_problem(name, name2problem):
    """Render the problem notes recorded for ``name``.

    Returns ``"<name>: -"`` when the list in ``name2problem[name]`` is empty,
    otherwise the name followed by one tab-indented note per line. Raises
    ``KeyError`` if ``name`` is not in ``name2problem``.
    """
    notes = name2problem[name]
    if not notes:
        return f"{name}: -"
    return f"{name}: \n\t" + '\n\t'.join(notes)

In [7]:
def pick_likely_claim(claims, doc):
    """Return the first claim that occurs in ``doc``, ignoring case.

    Each claim is matched as literal text; cell-line names often contain
    regex metacharacters (``(``, ``.``, ``[``, ``+``), which previously
    matched the wrong text or raised ``re.error``.

    If no claim occurs in ``doc``, the first claim is returned. Raises
    ``IndexError`` if ``claims`` is empty.
    """
    # return first match
    for claim in claims:
        if re.search(re.escape(claim), doc, flags=re.IGNORECASE):
            return claim
    # nothing found in doc: fall back to the first claim
    return claims[0]

def show_info(mysam):
    """Build the colourised inspection report for one sample.

    Reads the call/claim pair, the sample document and the raw churu lines
    of ``mysam`` from the module-level lookups and returns a single string
    with these sections: names, lineages, problem notes, document, first
    five churu lines, and the best churu matches for call and claim,
    followed by the curation checklist. When the claim holds several
    ``'; '``-separated guesses, a notice is prepended and the guess picked by
    ``pick_likely_claim`` is used for the rest of the report.
    """
    call, claim = sam2match[mysam][:2]
    doc = sam2doc[mysam]

    report = []
    if '; ' in claim: # multiple guesses
        report.append('>>> Multiple guess case <<<')
        report.append(format_red % claim)
        claim = pick_likely_claim(claim.split('; '), doc)

    churu = sam2churu[mysam]
    churu_call = find_cell_from_churu(clean(call), churu)
    churu_claim = find_cell_from_churu(clean(claim), churu)

    bar = "=" * 10
    rule = "=" * 27

    def paint(body):
        # highlight call/claim inside one section body
        return add_color(body, call, claim)

    report += [
        f"{bar} NAME  {bar}",
        f"call  : {format_blue % call}",
        f"claim : {format_red % claim}",
        f"{bar} LINE  {bar}",
        paint(' --> '.join(format_lineage(call))),
        paint(' --> '.join(format_lineage(claim))),
        f"{bar} PROB  {bar}",
        paint(format_problem(call, name2problems)),
        paint(format_problem(claim, name2problems)),
        f"{bar} DOC   {bar}",
        paint(doc),
        f"{bar} TOP 5 {bar}",
        paint(format_churu(churu, 5)),
        f"{bar} CALL  {bar}",
        paint(format_churu(churu_call, 2)),
        f"{bar} CLAIM {bar}",
        paint(format_churu(churu_claim, 2)),
        rule,
        "Checkpoint 1: is the claim in document?",
        "Checkpoint 2: is the contamination supported by sufficient SNPs?",
        "pass: '.', similar: 's', problematic: 'x', questionable: '?', not in CCLE: 'c'",
        rule,
    ]
    return '\n'.join(report)

In [87]:
import random
# Spot-check a single sample: either a random mismatched one (commented out)
# or a fixed sample ID.
# mysam = random.choice(sam_mismatched)
mysam = 'SAMN15688113'

call, claim = sam2match[mysam][0:2]
# NOTE(review): the value of the next expression is discarded (it is not the
# last expression of the cell), so nothing is displayed for it.
' --> '.join(format_lineage(call))

# add_color(' --> '.join(format_lineage(call)), call, claim)
# Print the full inspection report for the chosen sample.
print(show_info(mysam))
# input('Your opinion: ')

call  : [1;30;44mHeLa[m
claim : [1;30;41mMCF-7 TH[m
[1;30;46m[1;30;44mHeLa[m[m
[1;30;41mMCF-7 TH[m --> OVCAR-8
[1;30;46m[1;30;44mHeLa[m[m: -
[1;30;41mMCF-7 TH[m: 
	Problematic cell line: Contaminated. Shown to be a OVCAR-8 derivative (PubMed=10995814). Originally thought to be a derivative of MCF-7.
[1;31;47msample id: SAMN15688113
title: MCF-7T-2-IP
sample name: -
source_name: Breast cancer cells
cell type: Breast cancer drug-resistant cells
cell line: MCF-7/T
treatment: treated with Taxol for 15 days[m
ACH-001086	[1;30;46m[1;30;44mHeLa[m[m	PT-c34xau	11	23.91%	1.00	-
ACH-000794	BICR22	PT-zkbKhd	4	8.70%	2.7e-21	-
ACH-000089	NCIH684	PT-0joF2E	4	8.70%	2.7e-21	-
ACH-000271	SUDHL10	PT-i02i7U	3	6.52%	6.5e-22	-
ACH-000914	HT	PT-zrYGap	3	6.52%	6.5e-22	-
ACH-001086	[1;30;46m[1;30;44mHeLa[m[m	PT-c34xau	11	23.91%	1.00	-
ACH-000004	HEL	PT-q4K2cp	1	2.17%	2.6e-35	-
[1;31;47mACH-000019	MCF7	PT-viJKnw	1	2.17%	5.4e-31	-
ACH-000914	HT	PT-zrYGap	3	6.52%	6.5e-22	-[m
Checkpoint

In [423]:
from IPython.display import clear_output
import time

# Result table (one line per curated sample) and the verbose log.
output_txt = f"{path_workd}/manual_inspection_result.txt"
output_log = f"{path_workd}/manual_inspection_result.log"

sam_mismatched.sort()
size = len(sam_mismatched)

# Samples already present in the result table are skipped, so the curation
# can be resumed after an interruption.
sam_done = set()
if os.path.exists(output_txt):
    with open(output_txt) as f:
        sam_done = {row.split('\t')[0] for row in f}

sam_left = [sam for sam in sam_mismatched if sam not in sam_done]
for label, count in (("total", size), ("done", len(sam_done)), ("left", len(sam_left))):
    print(f"{label}: {count}")

total: 5127
done: 5127
left: 0


In [422]:
# Interactive curation: show the report for each remaining sample, read the
# curator's response and append it to the result table and the log right
# away, so nothing is lost if the loop is interrupted.
for mysam in tqdm(sam_left):
    print(mysam)
    started = time.time()
    text = show_info(mysam)
    print(text)
    response = input('Your response: ') or ""
    # elapsed time covers building the report and waiting for the answer
    time_took = "{:.2f}sec".format(time.time() - started)
    record = f"{mysam}\t{response}\t{time_took}\n"
    log_entry = f">{mysam}\n>{text}\n>response:{response}\ntime_took:{time_took}\n"
    with open(output_txt, 'a') as f:
        f.write(record)
    with open(output_log, 'a') as f:
        f.write(log_entry)
    clear_output(wait=False)

100%|████████████████████████████████████████████████████████████| 266/266 [14:10<00:00,  3.20s/it]


### Second verification

In [20]:
# Sample <-> project lookups, read from columns 17 and 18 of the meta table.
path_proj = f"{path_workd}/SRA_churu_output_compare.1697001520.meta"
with open(path_proj) as f:
    sam2proj = {}
    for raw in f:
        item = raw.rstrip('\n').split('\t')
        sam2proj[item[17]] = item[18]

    # invert: project -> list of its samples
    proj2sam = {}
    for sam, proj in sam2proj.items():
        if proj not in proj2sam:
            proj2sam[proj] = []
        proj2sam[proj].append(sam)

In [21]:
input_txt = f"{path_workd}/manual_inspection_result.txt"
output_txt = f"{path_workd}/manual_inspection_result.2.txt"

# Load the first-pass curation (three columns per row) and count how often
# each curation value was given.
counter = {}
with open(input_txt) as f:
    items_curation = [raw.rstrip('\n').split('\t') for raw in f]
    for _, curation, _ in items_curation:
        counter[curation] = counter.get(curation, 0) + 1

In [79]:
# Check one by one: print the full inspection report for a single,
# hand-picked sample.
mysam = 'SAMN08929693'
print(show_info(mysam))

call  : [1;30;44mLNCaP clone FGC[m
claim : [1;30;41mLB42[m
[1;30;44mLNCaP clone FGC[m --> LNCaP
[1;30;43m[1;30;41mLB42[m[m
[1;30;44mLNCaP clone FGC[m: -
[1;30;43m[1;30;41mLB42[m[m: -
[1;31;47msample id: SAMN08929693
title: Sample_42Dplus
sample name: -
source_name: 42D
rnase r treatment: yes
treatment: N/A
cell line: 42D
cell type: prostate cancer cell line[m
ACH-000977	[1;30;46mLNCAPCLONEFGC[m	PT-tY34fU	32	94.12%	1.00	-
ACH-001369	OCIC5X	PT-0GisSs	1	2.94%	8.4e-26	-
ACH-002109	ES8	PT-XwatVo	1	2.94%	3.2e-26	-
ACH-000571	T98G	PT-dw7sni	1	2.94%	1.3e-26	-
ACH-001610	NP5	PT-WoVaBP	1	2.94%	9.1e-28	-
ACH-000977	[1;30;46mLNCAPCLONEFGC[m	PT-tY34fU	32	94.12%	1.00	-
ACH-000835	GCT	PT-0RBdNq	0	0.00%	6.9e-30	-
[1;31;47mACH-002151	LB2241RCC	PT-hcnphT	0	0.00%	6.9e-30	-
ACH-002152	LB2518MEL	PT-g7qDcx	0	0.00%	6.9e-30	-[m
Checkpoint 1: is the claim in document?
Checkpoint 2: is the contamination supported by sufficient SNPs?
pass: '.', similar: 's', problematic: 'x', questionable

In [None]:
# Check one by one: print the full inspection report for a single,
# hand-picked sample.
mysam = 'SAMN15582920'
print(show_info(mysam))

In [451]:
# Same project, different curation: collect the set of curation values seen
# in each project, and remember the curation of every sample.
proj2curation = {}
sam2curation = {}
for sam, curation, _ in items_curation:
    proj = sam2proj[sam]
    if proj not in proj2curation:
        proj2curation[proj] = set()
    proj2curation[proj].add(curation)
    sam2curation[sam] = curation

In [452]:
# Count the projects whose samples did not all receive the same curation.
tmp = 0
for proj in proj2curation:
    if len(proj2curation[proj]) == 1:
        continue
    tmp += 1
print(tmp)

77


In [456]:
# State for the project-by-project review.
sam_input = [] # [ (sam, input), ... ]
idx = 0
# projects whose samples carry more than one distinct curation value
project_with_multiple_curations = [
    proj for proj, curations in proj2curation.items() if len(curations) != 1
]

In [639]:
# Show the project at position `idx`, then advance so that re-running this
# cell moves on to the next project. `idx` starts at 0; incrementing before
# the lookup (as was done previously) skipped the first project.
proj = project_with_multiple_curations[idx]
# only the samples of this project that have been curated
sams = sorted(sam for sam in proj2sam[proj] if sam in sam2curation)
print(idx)
print(proj)
print(sams)
print([sam2curation[sam] for sam in sams])

idx += 1

76
PRJNA1004843
['SAMN36955233', 'SAMN36955234', 'SAMN36955235', 'SAMN36955236', 'SAMN36955237', 'SAMN36955238']
['?', '??', '?', '?', '?', '?']


In [640]:
# Step through the samples in `sams`: show the report and the recorded
# curation of each one, then wait for input before clearing the output.
for sam in sams:
    print(show_info(sam))
    print(sam2curation[sam])
    input()  # pause; the entered text is discarded
    clear_output(wait=False)

### HUVEC mark

In [676]:
input_txt = f"{path_workd}/manual_inspection_result.txt"
output_txt = f"{path_workd}/manual_inspection_result.2.txt"

counter = {}
# Reload the first-pass curation rows (tab-separated fields per line).
with open(input_txt) as f:
    items_curation = [raw.rstrip('\n').split('\t') for raw in f]

In [680]:
# Samples whose claimed cell name mentions HUVEC (case-insensitive).
sams_huvec = set()
for sam, (call, claim, _) in sam2match.items():
    if 'huvec' in claim.lower():
        sams_huvec.add(sam)

In [683]:
# Overwrite the curation of every HUVEC sample with 'huvec' ...
for item in items_curation:
    if item[0] in sams_huvec:
        item[1] = 'huvec'

# ... and write each curation row followed by the sample's match fields.
with open(output_txt, 'w') as f:
    f.writelines(
        '\t'.join(item) + '\t' + '\t'.join(sam2match[item[0]]) + '\n'
        for item in items_curation
    )

In [None]:
for i in range(len(items_curation)):
    sam, curation, _ = items_curation[i]