Data input
==========

In [1]:
import pandas as pd

In [29]:
%ls ~/results/ | grep BRCA-EU

[0m[01;32mall.BRCA-EU.mutations-context.tsv[0m*
[01;34meach_gene_in_BRCA-EU[0m/
[01;34mmain-genes_BRCA-EU[0m/


In [3]:
help(pd.read_table)

Help on function read_table in module pandas.io.parsers:

read_table(filepath_or_buffer, sep='\t', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None, low_memory=True, buffer_lines=None, memory_map=False, float_precision=N

In [82]:
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 iterator=True)  # Lazily load lines since file may be large

In [83]:
mutations = mutations_reader.get_chunk(50)
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,T>-,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


Data mangling
=============

In [6]:
# Helper function to test
from itertools import islice

def head(iterable, items=10):
    'Return the first items of an iterator.'
    iterator = iter(iterable)
    return islice(iterable, items)
# ---

[item for item in head(range(100), items=5)]

[0, 1, 2, 3, 4]

In [7]:
## Exploring how to iterate over the data
i = mutations.itertuples()

print(next(i))

row = next(i)
print(row.POSITION_GRCh37)

for mutation in head(i):
    print(mutation.MUTATION_ID)

Pandas(Index=0, MUTATION_ID='MU64868974', MUTATION='C>G', POSITION_GRCh37='chr2:169922536', POSITION_GRCh38='chr2:169066026', RELATIVE_POSITION='INTRONIC', OVERLAPPED_GENES='ENSG00000073737(DHRS9)', _7='ENSG00000073737(DHRS9):intron_variant,ENSG00000073737(DHRS9):upstream_gene_variant', _8='BRCA-EU')
chr2:1699231
MU64619292
MU66012277
MU66538864
MU63433685
MU64418202
MU66019785
MU63584668
MU66013896
MU67196340
MU66105214


In [8]:
# Test on the file
for mutation in head(mutations.itertuples()):
    
    # < --- Parse the position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    print(mutation.MUTATION_ID, chrom, pos)

MU64868974 2 169922536
MU67221130 2 1699231
MU64619292 2 16992349
MU66012277 2 169923756
MU66538864 2 169925859
MU63433685 2 169926262
MU64418202 2 169926829
MU66019785 2 169927702
MU63584668 2 16992866
MU66013896 2 169928818


Exploring how to validate
===============

The validation is done by requests to the [Ensembl REST API](http://rest.ensembl.org/documentation/)

In [9]:
import requests, sys
 
grch38_server = "http://rest.ensembl.org"
server = "http://grch37.rest.ensembl.org" # We are using the GRCh37 server


ext = "/sequence/region/human"
headers={ "Content-Type" : "application/json", 
         "Accept" : "application/json"}

In [10]:
r = requests.post(server+ext, 
                  headers=headers, 
                  data='''{"regions":["X:1000000..1000100:1", 
                                    "ABBA01004489.1:1..100"] }''')
 
if not r.ok:
  r.raise_for_status()
  sys.exit()
 
decoded = r.json()
print(repr(decoded))

[{'query': 'X:1000000..1000100:1', 'id': 'chromosome:GRCh37:X:1000000:1000100:1', 'seq': 'GAAACAGCTACTTGGAAGGCTGAAGCAGGAGGATTGTTTGAGTCTAGGAGTTTGAGGCTGCAGTGAGTTATGAGCACACCACGGCACTCCAGCCTGGGAGA', 'molecule': 'dna'}, {'query': 'ABBA01004489.1:1..100', 'id': 'contig::ABBA01004489.1:1:100:1', 'seq': 'CTGTACTTTCCTTGGGATGGAGTAGTTTCGAAACACACTTTCTGTAGAATCTGCAAGTGGATATTTGGACCTGTCTGAGGAATTCGTTGGAAACGGGATA', 'molecule': 'dna'}]


In [11]:
#### Test for region string encoding

for mutation in head(mutations.itertuples()):
    
    # < --- Parse the mutation's start position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    region_str = '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                                 start=pos_str, 
                                                 end=end)
    print(region_str)

2:169922536..169922536
2:1699231..1699231
2:16992349..16992349
2:169923756..169923756
2:169925859..169925859
2:169926262..169926262
2:169926829..169926829
2:169927702..169927702
2:16992866..16992866
2:169928818..169928818


In [12]:
####  Assemble that into a function

def to_region_str(mutation):
    'Get a region string suitable for requests to Ensembl'
    
    # < --- Parse the mutation's start position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    return '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                           start=pos-1, # Just in case, extra space
                                           end=end+1) # Also, in case
# ---

for mutation in head(mutations.itertuples()):
    print(to_region_str(mutation))

2:169922535..169922537
2:1699230..1699232
2:16992348..16992350
2:169923755..169923757
2:169925858..169925860
2:169926261..169926263
2:169926828..169926830
2:169927701..169927703
2:16992865..16992867
2:169928817..169928819


In [13]:
####  Test for JSON string encoding

import json
from collections import defaultdict

data = defaultdict(list)
for mutation in head(mutations.itertuples()):
    # Add region string to the data dictionary
    # at the automatically created 'region' key
    data['regions'].append( to_region_str(mutation) )

json.dumps(data)

'{"regions": ["2:169922535..169922537", "2:1699230..1699232", "2:16992348..16992350", "2:169923755..169923757", "2:169925858..169925860", "2:169926261..169926263", "2:169926828..169926830", "2:169927701..169927703", "2:16992865..16992867", "2:169928817..169928819"]}'

In [14]:
#### First actual requests

import json
from collections import defaultdict


# < --- Assemble the request data

request_data = defaultdict(list)
for mutation in head(mutations.itertuples()):
    # Add region string to the data dictionary
    # at the automatically created 'region' key
    request_data['regions'].append( to_region_str(mutation) )

# < --- Make the request

# Request the sequences at the position
r = requests.post(server+ext, 
                  headers=headers, 
                  data=json.dumps(data)) # Here goes the data

# < --- Check the response

if not r.ok:
    r.raise_for_status()
 
r.json()

[{'id': 'chromosome:GRCh37:2:169922535:169922537:1',
  'molecule': 'dna',
  'query': '2:169922535..169922537',
  'seq': 'TCC'},
 {'id': 'chromosome:GRCh37:2:1699230:1699232:1',
  'molecule': 'dna',
  'query': '2:1699230..1699232',
  'seq': 'TCC'},
 {'id': 'chromosome:GRCh37:2:16992348:16992350:1',
  'molecule': 'dna',
  'query': '2:16992348..16992350',
  'seq': 'TAT'},
 {'id': 'chromosome:GRCh37:2:169923755:169923757:1',
  'molecule': 'dna',
  'query': '2:169923755..169923757',
  'seq': 'CCC'},
 {'id': 'chromosome:GRCh37:2:169925858:169925860:1',
  'molecule': 'dna',
  'query': '2:169925858..169925860',
  'seq': 'GAG'},
 {'id': 'chromosome:GRCh37:2:169926261:169926263:1',
  'molecule': 'dna',
  'query': '2:169926261..169926263',
  'seq': 'ATT'},
 {'id': 'chromosome:GRCh37:2:169926828:169926830:1',
  'molecule': 'dna',
  'query': '2:169926828..169926830',
  'seq': 'ACT'},
 {'id': 'chromosome:GRCh37:2:169927701:169927703:1',
  'molecule': 'dna',
  'query': '2:169927701..169927703',
  'se

In [15]:
#### Inspect the response
for item in r.json():
    print(item['seq'])

TCC
TCC
TAT
CCC
GAG
ATT
ACT
TTC
TGT
TTC


In [16]:
#### Verify first positions
for r_item, mut in zip(r.json(), mutations.itertuples()):
    print(r_item['seq'], mut.MUTATION.split('>')[0])

TCC C
TCC C
TAT T
CCC C
GAG A
ATT T
ACT C
TTC T
TGT G
TTC T


Yay! It worked! Now let's automate the validation process.

First, the `to_region_str` fetched extra positions on purpose. So, let's remove that. Besides, it wolu be good to be able to specify whether we are validating GRCh37 or GRCh38 positions.

In [119]:
def to_region_str(mutation, assembly='GRCh37'):
    'Get a region string suitable for requests to Ensembl'
    
    # < --- Parse the mutation's start position
    if assembly == 'GRCh37':
        chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    else:
        chrom_str, pos_str = mutation.POSITION_GRCh38.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    return '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                           start=(pos-1), 
                                           end=(end+1)) # Just the right chars
# ---

for mutation in head(mutations.itertuples()):
    print(to_region_str(mutation))

2:169922535..169922537
2:1699230..1699232
2:16992348..16992350
2:169923755..169923757
2:169925858..169925860
2:169926261..169926263
2:169926828..169926830
2:169927701..169927703
2:16992865..16992867
2:169928817..169928819


Now, let's define the complete algorithm:

In [120]:
import json
from collections import defaultdict
import requests, sys


# < --- Request server and headers

grch38_server = "http://rest.ensembl.org"
grch37_server = "http://grch37.rest.ensembl.org" # We are using the GRCh37 server

ext = "/sequence/region/human"
headers={ "Content-Type" : "application/json", 
          "Accept" : "application/json"}


# < --- Validation

def validate_mutations(mutations, verbose=False, assembly='GRCh37'):
    "Return the mutation IDs that doesn't match the reference."
    invalid = []  # Here we will accumulate those that didn't match
    
    # < --- Assemble the request data
    request_data = defaultdict(list)
    for mutation in mutations.itertuples():
        # Add region string to the data dictionary
        # at the automatically created 'region' key
        request_data['regions'].append( to_region_str(mutation, assembly) )

    # < --- Make the request
    server = grch37_server if assembly=='GRCh37' else grch38_server
    
    # Request the sequences at the position
    r = requests.post(server+ext, 
                      headers=headers, 
                      data=json.dumps(request_data)) # Here goes the data

    # < --- Check the response
    if not r.ok:
        r.raise_for_status()

    # < --- Validate the mutations
    for r_item, mut in zip(r.json(), mutations.itertuples()):
        # The sequence from the file
        file_seq = mut.MUTATION.split('>')[0]
        # The sequence from the request
        req_seq = r_item['seq']
        
        # < --- Parse the mutation's start position
        if assembly == 'GRCh37':
            chrom_str, pos_str = mut.POSITION_GRCh37.split(':')
        else:
            chrom_str, pos_str = mut.POSITION_GRCh38.split(':')
        
        if file_seq != req_seq[1:-1]:
            invalid.append({'ID':mut.MUTATION_ID,
                            'expected':file_seq,
                            'got':req_seq,
                            'pos':int(pos_str)})
        if verbose:
            print('ID: ', mut.MUTATION_ID,
                  'Expected: ', file_seq,
                  'Got: ', req_seq)
        
    return invalid
# ---


# Test

from pprint import pprint

pprint(validate_mutations(mutations, 
                          verbose=True, 
                          assembly='GRCh37'))
pprint(validate_mutations(mutations, 
                          verbose=True, 
                          assembly='GRCh38'))

ID:  MU64868974 Expected:  C Got:  TCC
ID:  MU67221130 Expected:  C Got:  TCC
ID:  MU64619292 Expected:  T Got:  TAT
ID:  MU66012277 Expected:  C Got:  CCC
ID:  MU66538864 Expected:  A Got:  GAG
ID:  MU63433685 Expected:  T Got:  ATT
ID:  MU64418202 Expected:  C Got:  ACT
ID:  MU66019785 Expected:  T Got:  TTC
ID:  MU63584668 Expected:  G Got:  TGT
ID:  MU66013896 Expected:  T Got:  TTC
ID:  MU67196340 Expected:  C Got:  TCA
ID:  MU66105214 Expected:  C Got:  TCT
ID:  MU67026903 Expected:  T Got:  GTT
ID:  MU65212871 Expected:  C Got:  TCT
ID:  MU66439690 Expected:  G Got:  AGG
ID:  MU63790286 Expected:  G Got:  AGA
ID:  MU65263602 Expected:  A Got:  CAG
ID:  MU64627748 Expected:  C Got:  ACA
ID:  MU66502389 Expected:  C Got:  TCA
ID:  MU63506559 Expected:  C Got:  GCG
ID:  MU65823313 Expected:  C Got:  TCC
ID:  MU65742206 Expected:  G Got:  AGG
ID:  MU66303273 Expected:  G Got:  GGC
ID:  MU65168306 Expected:  A Got:  AAG
ID:  MU66279019 Expected:  T Got:  GTG
ID:  MU38161712 Expected:

Processing all mutations
==============

Now that every part of the algorithm is OK, let's parse the complete file!

In [121]:
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 chunksize=50)  # Changed a little to ease iteration [1].
                                                # chunksize=50 because that's the maximum
                                                # allowed per request by Ensembl [2].
# [1]. See: https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking
# [2]. See: http://rest.ensembl.org/documentation/info/sequence_region_post

In [122]:
from pprint import pprint

invalid_grch37 = []
invalid_grch38 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch37 = validate_mutations(mutation_chunk, assembly='GRCh37')
    new_invalid_grch38 = validate_mutations(mutation_chunk, assembly='GRCh38')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch37)
    print('\n')
    pprint(new_invalid_grch38)
    
    invalid_grch37 += new_invalid_grch37
    invalid_grch38 += new_invalid_grch38

pprint(invalid_grch37)
pprint(invalid_grch38)

Chunk:  0 --------------------
[{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349},
 {'ID': 'MU38161712', 'expected': 'TC', 'got': 'CATC', 'pos': 169942588}]


[{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16811082},
 {'ID': 'MU38161712', 'expected': 'TC', 'got': 'CATC', 'pos': 169086078}]
Chunk:  1 --------------------
[{'ID': 'MU63854834', 'expected': 'A', 'got': 'GGA', 'pos': 169967250},
 {'ID': 'MU64350164', 'expected': 'T', 'got': 'TAT', 'pos': 16996995},
 {'ID': 'MU63418111', 'expected': '-', 'got': 'AGA', 'pos': 169982032},
 {'ID': 'MU64994227', 'expected': '-', 'got': 'TTA', 'pos': 169984697},
 {'ID': 'MU63952827', 'expected': 'AAGT', 'got': 'AAAAGT', 'pos': 169987319},
 {'ID': 'MU64982903', 'expected': 'TT', 'got': 'TCTT', 'pos': 169989944}]


[{'ID': 'MU63854834', 'expected': 'A', 'got': 'GGA', 'pos': 169110740},
 {'ID': 'MU64350164', 'expected': 'T', 'got': 'TAT', 'pos': 16815728},
 {'ID': 'MU63418111', 'expected': '-', 'got': 'AGA', 'pos': 16

Chunk:  13 --------------------
[{'ID': 'MU64367249', 'expected': 'ATT', 'got': 'AAATT', 'pos': 170420944},
 {'ID': 'MU64367252', 'expected': 'T', 'got': 'CAT', 'pos': 170421411},
 {'ID': 'MU63501914', 'expected': 'T', 'got': 'AGT', 'pos': 170434087},
 {'ID': 'MU64631133', 'expected': 'A', 'got': 'ACA', 'pos': 170434765}]


[{'ID': 'MU64367249', 'expected': 'ATT', 'got': 'AAATT', 'pos': 169564434},
 {'ID': 'MU64367252', 'expected': 'T', 'got': 'CAT', 'pos': 169564901},
 {'ID': 'MU63501914', 'expected': 'T', 'got': 'AGT', 'pos': 169577577},
 {'ID': 'MU64631133', 'expected': 'A', 'got': 'ACA', 'pos': 169578255}]
Chunk:  14 --------------------
[{'ID': 'MU64631137', 'expected': 'T', 'got': 'AAT', 'pos': 170448688},
 {'ID': 'MU63913109', 'expected': '-', 'got': 'TCT', 'pos': 170449121},
 {'ID': 'MU64367256', 'expected': 'A', 'got': 'TTA', 'pos': 170455975},
 {'ID': 'MU64367259', 'expected': 'T', 'got': 'TGT', 'pos': 170456137},
 {'ID': 'MU64808483', 'expected': '-', 'got': 'TGT', 'pos': 17

HTTPError: 429 Client Error: Too Many Requests for url: http://grch37.rest.ensembl.org/sequence/region/human

Ok, looks like we saturated GRCh37 server, so let's ask for only the GRCh38 mutations...

In [None]:
from pprint import pprint

invalid_grch38 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch38 = validate_mutations(mutation_chunk, assembly='GRCh38')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch38)
    invalid_grch38 += new_invalid_grch38

pprint(invalid_grch38)

Validating the original file
================

As the mutations are wrong, no matter what assembly we are talking about, the next step is to validate things directly from source. The ICGC's Simple Somatic Mutations file.

In [123]:
# Get one of the invalidated mutations
m_not_valid = invalid_grch37[0]
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [32]:
## Searching the data file
! ls ../data/data-release-25/

ssm_alt.sqlite	ssm.sqlite  ssm.vcf


The file `ssm.sqlite` is a pre-made database file containing the mutations. Let's inspect it...

In [75]:
import dataset

# Connect to the database
database = dataset.connect('sqlite:///../data/data-release-25/ssm.sqlite')

# Connect to the mutations table
mut_table = database['Mutation']

In [42]:
database.tables

['Consequence',
 'Consequence_Mutation',
 'Mutation',
 'Mutation_OccurrenceByProject',
 'OccurrenceByProject',
 'OccurrenceGlobal',
 'sqlite_sequence']

In [76]:
mut_table.columns

['id',
 'mutation_id',
 'chromosome',
 'GRCh37_pos',
 'reference_allele',
 'mutated_allele',
 'quality',
 'filter',
 'occurrence_global']

In [124]:
mut_table.find_one(mutation_id='MU64619292')

OrderedDict([('id', 27323401),
             ('mutation_id', 'MU64619292'),
             ('chromosome', '2'),
             ('GRCh37_pos', 16992349),
             ('reference_allele', 'AT'),
             ('mutated_allele', 'A'),
             ('quality', '.'),
             ('filter', '.'),
             ('occurrence_global', 27323401)])

In [125]:
# To compare with the db result
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [126]:
## Compare the positions

# To accumulate the not matching ones
not_matching = list()

for invalid in invalid_grch37:
    # Get the original mutation record
    original = mut_table.find_one(mutation_id=invalid['ID'])
    # Fetch the position from there
    original_position = original['GRCh37_pos']
    original_mutation = original['reference_allele']
    # Check if the positions match
    if original_position != invalid['pos']:
        not_matching.append({'ID': invalid['ID'],
                             'pos':invalid['pos'],
                             'original_pos':original_position,
                             'expected':invalid['expected'],
                             'original_expected':original_mutation,
                             'got':invalid['got']})
    
not_matching

[]

So, the positions correspond well, let's verify the 'expected' nucleotides

In [129]:
## Compare the reference allele

# To accumulate the not matching ones
not_matching = list()

for invalid in invalid_grch37:
    # Get the original mutation record
    db_mut = mut_table.find_one(mutation_id=invalid['ID'])
    # Fetch the position from there
    db_position = db_mut['GRCh37_pos']
    db_ref_allele = db_mut['reference_allele']
    # Check if the positions match
    if db_ref_allele != invalid['expected'][1:-1]:
        not_matching.append({'ID': invalid['ID'],
                             'pos':invalid['pos'],
                             'db_pos':db_position,
                             'expected':invalid['expected'],
                             'db_expected':db_ref_allele,
                             'got':invalid['got']})
    
not_matching

[{'ID': 'MU64619292',
  'db_expected': 'AT',
  'db_pos': 16992349,
  'expected': 'T',
  'got': 'TAT',
  'pos': 16992349},
 {'ID': 'MU38161712',
  'db_expected': 'ATC',
  'db_pos': 169942588,
  'expected': 'TC',
  'got': 'CATC',
  'pos': 169942588},
 {'ID': 'MU63854834',
  'db_expected': 'GA',
  'db_pos': 169967250,
  'expected': 'A',
  'got': 'GGA',
  'pos': 169967250},
 {'ID': 'MU64350164',
  'db_expected': 'AT',
  'db_pos': 16996995,
  'expected': 'T',
  'got': 'TAT',
  'pos': 16996995},
 {'ID': 'MU63418111',
  'db_expected': 'G',
  'db_pos': 169982032,
  'expected': '-',
  'got': 'AGA',
  'pos': 169982032},
 {'ID': 'MU64994227',
  'db_expected': 'T',
  'db_pos': 169984697,
  'expected': '-',
  'got': 'TTA',
  'pos': 169984697},
 {'ID': 'MU63952827',
  'db_expected': 'AAAGT',
  'db_pos': 169987319,
  'expected': 'AAGT',
  'got': 'AAAAGT',
  'pos': 169987319},
 {'ID': 'MU64982903',
  'db_expected': 'CTT',
  'db_pos': 169989944,
  'expected': 'TT',
  'got': 'TCTT',
  'pos': 169989944},

In [130]:
print(len(not_matching), len(invalid_grch37))
len(not_matching) / len(invalid_grch37)

86 86


1.0

We can see that every mutation that failed to match with the Ensembl reference also failed to match with the database reference. This may mean that the `reference_allele` field was corrupted during the analysis, or, taking into account that the database was setup with the SSM file from the 25th ICGC's Data Release and the analysis was done with the equivalent file from the 22th Data Release, maybe the reference changed from a data release to another.

To test this, we compare the raw files from which the analysis was made, from which the database was deployed and the final mutations file.

In [132]:
## The mutation once again??
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [131]:
## Print the header line
! grep -P "^#[^#]" ../data/data-release-25/ssm.vcf

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO


In [142]:
## From data release 22
_ = ! grep MU64619292 ../data/data-release-22/ssm.vcf
print(_[0])

## From data release 25
_ = ! grep MU64619292 ../data/data-release-25/ssm.vcf
print(_[0])

## From the analysis file
_ = ! grep MU64619292 ~/results/all.BRCA-EU.mutations-context.tsv
print(_[0])

2	16992349	MU64619292	AT	A	.	.	CONSEQUENCE=||||||intergenic_region||;OCCURRENCE=BRCA-EU|1|560|0.00179;affected_donors=1;mutation=T>-;project_count=1;tested_donors=10638
2	16992349	MU64619292	AT	A	.	.	CONSEQUENCE=||||||intergenic_region||;OCCURRENCE=BRCA-EU|1|569|0.00176;affected_donors=1;mutation=T>-;project_count=1;studies=.;tested_donors=12198
MU64619292	T>-	chr2:16992349	chr2:16811082	INTERGENIC			BRCA-EU	


We can now clearly see that the analysis results file has the mutation string wrong! Just in case, we may try with another problematic mutation

In [136]:
## Convert the invalidated mutations to a pandas dataset for convenience
invalid_grch37_df = pd.DataFrame(invalid_grch37)
invalid_grch37_df

Unnamed: 0,ID,expected,got,pos
0,MU64619292,T,TAT,16992349
1,MU38161712,TC,CATC,169942588
2,MU63854834,A,GGA,169967250
3,MU64350164,T,TAT,16996995
4,MU63418111,-,AGA,169982032
5,MU64994227,-,TTA,169984697
6,MU63952827,AAGT,AAAAGT,169987319
7,MU64982903,TT,TCTT,169989944
8,MU130883,G,ATG,169997024
9,MU64367222,A,GGA,170005227


In [138]:
## Get an entry that is not already in the invalidated
m_not_valid_2 = invalid_grch37_df.iloc[1]
m_not_valid_2

ID          MU38161712
expected            TC
got               CATC
pos          169942588
Name: 1, dtype: object

In [141]:
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO

## From data release 22
_ = ! grep MU38161712 ../data/data-release-22/ssm.vcf
print(_[0])

## From data release 25
_ = ! grep MU38161712 ../data/data-release-25/ssm.vcf
print(_[0])

## From the analysis file
_ = ! grep MU38161712 ~/results/all.BRCA-EU.mutations-context.tsv
print(_[0])

2	169942588	MU38161712	ATC	A	.	.	CONSEQUENCE=DHRS9|ENSG00000073737|+|DHRS9-201|ENST00000327239||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-001|ENST00000357546||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-202|ENST00000412271||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-008|ENST00000421653||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-005|ENST00000428522||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-203|ENST00000432060||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-004|ENST00000436483||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-006|ENST00000450153||downstream_gene_variant||,DHRS9|ENSG00000073737|+|DHRS9-002|ENST00000602501||intron_variant||;OCCURRENCE=ESAD-UK|1|203|0.00493,BRCA-EU|2|560|0.00357;affected_donors=3;mutation=TC>-;project_count=2;tested_donors=10638
2	169942588	MU38161712	ATC	A	.	.	CONSEQUENCE=DHRS9|ENSG00000073737|+|DHRS9-201|ENST00000327239||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-001|ENST00000357546||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-