Data input
==========

In [2]:
import pandas as pd

In [29]:
%ls ~/results/ | grep BRCA-EU

[0m[01;32mall.BRCA-EU.mutations-context.tsv[0m*
[01;34meach_gene_in_BRCA-EU[0m/
[01;34mmain-genes_BRCA-EU[0m/


In [3]:
help(pd.read_table)

Help on function read_table in module pandas.io.parsers:

read_table(filepath_or_buffer, sep='\t', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None, low_memory=True, buffer_lines=None, memory_map=False, float_precision=N

In [19]:
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 iterator=True)  # Lazily load lines since file may be large

In [20]:
mutations = mutations_reader.get_chunk(50)
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,T>-,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


Data mangling
=============

In [21]:
# Helper function to test
from itertools import islice

def head(iterable, items=10):
    'Return the first items of an iterator.'
    iterator = iter(iterable)
    return islice(iterable, items)
# ---

[item for item in head(range(100), items=5)]

[0, 1, 2, 3, 4]

In [22]:
## Exploring how to iterate over the data
i = mutations.itertuples()

print(next(i))

row = next(i)
print(row.POSITION_GRCh37)

for mutation in head(i):
    print(mutation.MUTATION_ID)

Pandas(Index=0, MUTATION_ID='MU64868974', MUTATION='C>G', POSITION_GRCh37='chr2:169922536', POSITION_GRCh38='chr2:169066026', RELATIVE_POSITION='INTRONIC', OVERLAPPED_GENES='ENSG00000073737(DHRS9)', _7='ENSG00000073737(DHRS9):intron_variant,ENSG00000073737(DHRS9):upstream_gene_variant', _8='BRCA-EU')
chr2:1699231
MU64619292
MU66012277
MU66538864
MU63433685
MU64418202
MU66019785
MU63584668
MU66013896
MU67196340
MU66105214


In [23]:
# Test on the file
for mutation in head(mutations.itertuples()):
    
    # < --- Parse the position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    print(mutation.MUTATION_ID, chrom, pos)

MU64868974 2 169922536
MU67221130 2 1699231
MU64619292 2 16992349
MU66012277 2 169923756
MU66538864 2 169925859
MU63433685 2 169926262
MU64418202 2 169926829
MU66019785 2 169927702
MU63584668 2 16992866
MU66013896 2 169928818


Exploring how to validate
===============

The validation is done by requests to the [Ensembl REST API](http://rest.ensembl.org/documentation/)

In [24]:
import requests, sys
 
grch38_server = "http://rest.ensembl.org"
server = "http://grch37.rest.ensembl.org" # We are using the GRCh37 server


ext = "/sequence/region/human"
headers={ "Content-Type" : "application/json", 
         "Accept" : "application/json"}

In [25]:
r = requests.post(server+ext, 
                  headers=headers, 
                  data='''{"regions":["X:1000000..1000100:1", 
                                    "ABBA01004489.1:1..100"] }''')
 
if not r.ok:
  r.raise_for_status()
  sys.exit()
 
decoded = r.json()
print(repr(decoded))

[{'query': 'X:1000000..1000100:1', 'id': 'chromosome:GRCh37:X:1000000:1000100:1', 'seq': 'GAAACAGCTACTTGGAAGGCTGAAGCAGGAGGATTGTTTGAGTCTAGGAGTTTGAGGCTGCAGTGAGTTATGAGCACACCACGGCACTCCAGCCTGGGAGA', 'molecule': 'dna'}, {'query': 'ABBA01004489.1:1..100', 'id': 'contig::ABBA01004489.1:1:100:1', 'seq': 'CTGTACTTTCCTTGGGATGGAGTAGTTTCGAAACACACTTTCTGTAGAATCTGCAAGTGGATATTTGGACCTGTCTGAGGAATTCGTTGGAAACGGGATA', 'molecule': 'dna'}]


In [26]:
#### Test for region string encoding

for mutation in head(mutations.itertuples()):
    
    # < --- Parse the mutation's start position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    region_str = '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                                 start=pos_str, 
                                                 end=end)
    print(region_str)

2:169922536..169922536
2:1699231..1699231
2:16992349..16992349
2:169923756..169923756
2:169925859..169925859
2:169926262..169926262
2:169926829..169926829
2:169927702..169927702
2:16992866..16992866
2:169928818..169928818


In [27]:
####  Assemble that into a function

def to_region_str(mutation):
    'Get a region string suitable for requests to Ensembl'
    
    # < --- Parse the mutation's start position 
    chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    return '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                           start=pos-1, # Just in case, extra space
                                           end=end+1) # Also, in case
# ---

for mutation in head(mutations.itertuples()):
    print(to_region_str(mutation))

2:169922535..169922537
2:1699230..1699232
2:16992348..16992350
2:169923755..169923757
2:169925858..169925860
2:169926261..169926263
2:169926828..169926830
2:169927701..169927703
2:16992865..16992867
2:169928817..169928819


In [28]:
####  Test for JSON string encoding

import json
from collections import defaultdict

data = defaultdict(list)
for mutation in head(mutations.itertuples()):
    # Add region string to the data dictionary
    # at the automatically created 'region' key
    data['regions'].append( to_region_str(mutation) )

json.dumps(data)

'{"regions": ["2:169922535..169922537", "2:1699230..1699232", "2:16992348..16992350", "2:169923755..169923757", "2:169925858..169925860", "2:169926261..169926263", "2:169926828..169926830", "2:169927701..169927703", "2:16992865..16992867", "2:169928817..169928819"]}'

In [29]:
#### First actual requests

import json
from collections import defaultdict


# < --- Assemble the request data

request_data = defaultdict(list)
for mutation in head(mutations.itertuples()):
    # Add region string to the data dictionary
    # at the automatically created 'region' key
    request_data['regions'].append( to_region_str(mutation) )

# < --- Make the request

# Request the sequences at the position
r = requests.post(server+ext, 
                  headers=headers, 
                  data=json.dumps(data)) # Here goes the data

# < --- Check the response

if not r.ok:
    r.raise_for_status()
 
r.json()

[{'id': 'chromosome:GRCh37:2:169922535:169922537:1',
  'molecule': 'dna',
  'query': '2:169922535..169922537',
  'seq': 'TCC'},
 {'id': 'chromosome:GRCh37:2:1699230:1699232:1',
  'molecule': 'dna',
  'query': '2:1699230..1699232',
  'seq': 'TCC'},
 {'id': 'chromosome:GRCh37:2:16992348:16992350:1',
  'molecule': 'dna',
  'query': '2:16992348..16992350',
  'seq': 'TAT'},
 {'id': 'chromosome:GRCh37:2:169923755:169923757:1',
  'molecule': 'dna',
  'query': '2:169923755..169923757',
  'seq': 'CCC'},
 {'id': 'chromosome:GRCh37:2:169925858:169925860:1',
  'molecule': 'dna',
  'query': '2:169925858..169925860',
  'seq': 'GAG'},
 {'id': 'chromosome:GRCh37:2:169926261:169926263:1',
  'molecule': 'dna',
  'query': '2:169926261..169926263',
  'seq': 'ATT'},
 {'id': 'chromosome:GRCh37:2:169926828:169926830:1',
  'molecule': 'dna',
  'query': '2:169926828..169926830',
  'seq': 'ACT'},
 {'id': 'chromosome:GRCh37:2:169927701:169927703:1',
  'molecule': 'dna',
  'query': '2:169927701..169927703',
  'se

In [30]:
#### Inspect the response
for item in r.json():
    print(item['seq'])

TCC
TCC
TAT
CCC
GAG
ATT
ACT
TTC
TGT
TTC


In [31]:
#### Verify first positions
for r_item, mut in zip(r.json(), mutations.itertuples()):
    print(r_item['seq'], mut.MUTATION.split('>')[0])

TCC C
TCC C
TAT T
CCC C
GAG A
ATT T
ACT C
TTC T
TGT G
TTC T


Yay! It worked! Now let's automate the validation process.

First, the `to_region_str` fetched extra positions on purpose. So, let's remove that. Besides, it wolu be good to be able to specify whether we are validating GRCh37 or GRCh38 positions.

In [32]:
def to_region_str(mutation, assembly='GRCh37'):
    'Get a region string suitable for requests to Ensembl'
    
    # < --- Parse the mutation's start position
    if assembly == 'GRCh37':
        chrom_str, pos_str = mutation.POSITION_GRCh37.split(':')
    else:
        chrom_str, pos_str = mutation.POSITION_GRCh38.split(':')
    # For the chromosome, eliminate the trailing "chr"
    chrom_str = chrom_str.replace('chr', '')
    # The position is an integer
    pos = int(pos_str)
    
    # < --- Parse the mutation's end position
    # Get the mutation's reference allele
    ref, _ = mutation.MUTATION.split('>')
    end = pos + (len(ref)-1)
    
    # < --- Assemble the region string
    return '{chrom}:{start}..{end}'.format(chrom=chrom_str, 
                                           start=(pos-1), 
                                           end=(end+1)) # Just the right chars
# ---

for mutation in head(mutations.itertuples()):
    print(to_region_str(mutation))

2:169922535..169922537
2:1699230..1699232
2:16992348..16992350
2:169923755..169923757
2:169925858..169925860
2:169926261..169926263
2:169926828..169926830
2:169927701..169927703
2:16992865..16992867
2:169928817..169928819


Now, let's define the complete algorithm:

In [33]:
import json
from collections import defaultdict
import requests, sys


# < --- Request server and headers

grch38_server = "http://rest.ensembl.org"
grch37_server = "http://grch37.rest.ensembl.org" # We are using the GRCh37 server

ext = "/sequence/region/human"
headers={ "Content-Type" : "application/json", 
          "Accept" : "application/json"}


# < --- Validation

def validate_mutations(mutations, verbose=False, assembly='GRCh37'):
    "Return the mutation IDs that doesn't match the reference."
    invalid = []  # Here we will accumulate those that didn't match
    
    # < --- Assemble the request data
    request_data = defaultdict(list)
    for mutation in mutations.itertuples():
        # Add region string to the data dictionary
        # at the automatically created 'region' key
        request_data['regions'].append( to_region_str(mutation, assembly) )

    # < --- Make the request
    server = grch37_server if assembly=='GRCh37' else grch38_server
    
    # Request the sequences at the position
    r = requests.post(server+ext, 
                      headers=headers, 
                      data=json.dumps(request_data)) # Here goes the data

    # < --- Check the response
    if not r.ok:
        r.raise_for_status()

    # < --- Validate the mutations
    for r_item, mut in zip(r.json(), mutations.itertuples()):
        # The sequence from the file
        file_seq = mut.MUTATION.split('>')[0]
        # The sequence from the request
        req_seq = r_item['seq']
        
        # < --- Parse the mutation's start position
        if assembly == 'GRCh37':
            chrom_str, pos_str = mut.POSITION_GRCh37.split(':')
        else:
            chrom_str, pos_str = mut.POSITION_GRCh38.split(':')
        
        if file_seq != req_seq[1:-1]:
            invalid.append({'ID':mut.MUTATION_ID,
                            'expected':file_seq,
                            'got':req_seq,
                            'pos':int(pos_str)})
        if verbose:
            print('ID: ', mut.MUTATION_ID,
                  'Expected: ', file_seq,
                  'Got: ', req_seq)
        
    return invalid
# ---


# Test

from pprint import pprint

pprint(validate_mutations(mutations, 
                          verbose=True, 
                          assembly='GRCh37'))
pprint(validate_mutations(mutations, 
                          verbose=True, 
                          assembly='GRCh38'))

ID:  MU64868974 Expected:  C Got:  TCC
ID:  MU67221130 Expected:  C Got:  TCC
ID:  MU64619292 Expected:  T Got:  TAT
ID:  MU66012277 Expected:  C Got:  CCC
ID:  MU66538864 Expected:  A Got:  GAG
ID:  MU63433685 Expected:  T Got:  ATT
ID:  MU64418202 Expected:  C Got:  ACT
ID:  MU66019785 Expected:  T Got:  TTC
ID:  MU63584668 Expected:  G Got:  TGT
ID:  MU66013896 Expected:  T Got:  TTC
ID:  MU67196340 Expected:  C Got:  TCA
ID:  MU66105214 Expected:  C Got:  TCT
ID:  MU67026903 Expected:  T Got:  GTT
ID:  MU65212871 Expected:  C Got:  TCT
ID:  MU66439690 Expected:  G Got:  AGG
ID:  MU63790286 Expected:  G Got:  AGA
ID:  MU65263602 Expected:  A Got:  CAG
ID:  MU64627748 Expected:  C Got:  ACA
ID:  MU66502389 Expected:  C Got:  TCA
ID:  MU63506559 Expected:  C Got:  GCG
ID:  MU65823313 Expected:  C Got:  TCC
ID:  MU65742206 Expected:  G Got:  AGG
ID:  MU66303273 Expected:  G Got:  GGC
ID:  MU65168306 Expected:  A Got:  AAG
ID:  MU66279019 Expected:  T Got:  GTG
ID:  MU38161712 Expected:

KeyboardInterrupt: 

Processing all mutations
==============

Now that every part of the algorithm is OK, let's parse the complete file!

In [121]:
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 chunksize=50)  # Changed a little to ease iteration [1].
                                                # chunksize=50 because that's the maximum
                                                # allowed per request by Ensembl [2].
# [1]. See: https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking
# [2]. See: http://rest.ensembl.org/documentation/info/sequence_region_post

In [122]:
from pprint import pprint

invalid_grch37 = []
invalid_grch38 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch37 = validate_mutations(mutation_chunk, assembly='GRCh37')
    new_invalid_grch38 = validate_mutations(mutation_chunk, assembly='GRCh38')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch37)
    print('\n')
    pprint(new_invalid_grch38)
    
    invalid_grch37 += new_invalid_grch37
    invalid_grch38 += new_invalid_grch38

pprint(invalid_grch37)
pprint(invalid_grch38)

Chunk:  0 --------------------
[{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349},
 {'ID': 'MU38161712', 'expected': 'TC', 'got': 'CATC', 'pos': 169942588}]


[{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16811082},
 {'ID': 'MU38161712', 'expected': 'TC', 'got': 'CATC', 'pos': 169086078}]
Chunk:  1 --------------------
[{'ID': 'MU63854834', 'expected': 'A', 'got': 'GGA', 'pos': 169967250},
 {'ID': 'MU64350164', 'expected': 'T', 'got': 'TAT', 'pos': 16996995},
 {'ID': 'MU63418111', 'expected': '-', 'got': 'AGA', 'pos': 169982032},
 {'ID': 'MU64994227', 'expected': '-', 'got': 'TTA', 'pos': 169984697},
 {'ID': 'MU63952827', 'expected': 'AAGT', 'got': 'AAAAGT', 'pos': 169987319},
 {'ID': 'MU64982903', 'expected': 'TT', 'got': 'TCTT', 'pos': 169989944}]


[{'ID': 'MU63854834', 'expected': 'A', 'got': 'GGA', 'pos': 169110740},
 {'ID': 'MU64350164', 'expected': 'T', 'got': 'TAT', 'pos': 16815728},
 {'ID': 'MU63418111', 'expected': '-', 'got': 'AGA', 'pos': 16

Chunk:  13 --------------------
[{'ID': 'MU64367249', 'expected': 'ATT', 'got': 'AAATT', 'pos': 170420944},
 {'ID': 'MU64367252', 'expected': 'T', 'got': 'CAT', 'pos': 170421411},
 {'ID': 'MU63501914', 'expected': 'T', 'got': 'AGT', 'pos': 170434087},
 {'ID': 'MU64631133', 'expected': 'A', 'got': 'ACA', 'pos': 170434765}]


[{'ID': 'MU64367249', 'expected': 'ATT', 'got': 'AAATT', 'pos': 169564434},
 {'ID': 'MU64367252', 'expected': 'T', 'got': 'CAT', 'pos': 169564901},
 {'ID': 'MU63501914', 'expected': 'T', 'got': 'AGT', 'pos': 169577577},
 {'ID': 'MU64631133', 'expected': 'A', 'got': 'ACA', 'pos': 169578255}]
Chunk:  14 --------------------
[{'ID': 'MU64631137', 'expected': 'T', 'got': 'AAT', 'pos': 170448688},
 {'ID': 'MU63913109', 'expected': '-', 'got': 'TCT', 'pos': 170449121},
 {'ID': 'MU64367256', 'expected': 'A', 'got': 'TTA', 'pos': 170455975},
 {'ID': 'MU64367259', 'expected': 'T', 'got': 'TGT', 'pos': 170456137},
 {'ID': 'MU64808483', 'expected': '-', 'got': 'TGT', 'pos': 17

HTTPError: 429 Client Error: Too Many Requests for url: http://grch37.rest.ensembl.org/sequence/region/human

Ok, looks like we saturated GRCh37 server, so let's ask for only the GRCh38 mutations...

In [None]:
from pprint import pprint

invalid_grch38 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch38 = validate_mutations(mutation_chunk, assembly='GRCh38')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch38)
    invalid_grch38 += new_invalid_grch38

pprint(invalid_grch38)

Validating the original file
================

As the mutations are wrong, no matter what assembly we are talking about, the next step is to validate things directly from source. The ICGC's Simple Somatic Mutations file.

In [123]:
# Get one of the invalidated mutations
m_not_valid = invalid_grch37[0]
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [32]:
## Searching the data file
! ls ../data/data-release-25/

ssm_alt.sqlite	ssm.sqlite  ssm.vcf


The file `ssm.sqlite` is a pre-made database file containing the mutations. Let's inspect it...

In [46]:
import dataset

# Connect to the database
database = dataset.connect('sqlite:///../data/data-release-25/ssm.sqlite')

# Connect to the mutations table
mut_table = database['Mutation']

In [42]:
database.tables

['Consequence',
 'Consequence_Mutation',
 'Mutation',
 'Mutation_OccurrenceByProject',
 'OccurrenceByProject',
 'OccurrenceGlobal',
 'sqlite_sequence']

In [76]:
mut_table.columns

['id',
 'mutation_id',
 'chromosome',
 'GRCh37_pos',
 'reference_allele',
 'mutated_allele',
 'quality',
 'filter',
 'occurrence_global']

In [124]:
mut_table.find_one(mutation_id='MU64619292')

OrderedDict([('id', 27323401),
             ('mutation_id', 'MU64619292'),
             ('chromosome', '2'),
             ('GRCh37_pos', 16992349),
             ('reference_allele', 'AT'),
             ('mutated_allele', 'A'),
             ('quality', '.'),
             ('filter', '.'),
             ('occurrence_global', 27323401)])

In [125]:
# To compare with the db result
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [126]:
## Compare the positions

# To accumulate the not matching ones
not_matching = list()

for invalid in invalid_grch37:
    # Get the original mutation record
    original = mut_table.find_one(mutation_id=invalid['ID'])
    # Fetch the position from there
    original_position = original['GRCh37_pos']
    original_mutation = original['reference_allele']
    # Check if the positions match
    if original_position != invalid['pos']:
        not_matching.append({'ID': invalid['ID'],
                             'pos':invalid['pos'],
                             'original_pos':original_position,
                             'expected':invalid['expected'],
                             'original_expected':original_mutation,
                             'got':invalid['got']})
    
not_matching

[]

So, the positions correspond well, let's verify the 'expected' nucleotides

In [129]:
## Compare the reference allele

# To accumulate the not matching ones
not_matching = list()

for invalid in invalid_grch37:
    # Get the original mutation record
    db_mut = mut_table.find_one(mutation_id=invalid['ID'])
    # Fetch the position from there
    db_position = db_mut['GRCh37_pos']
    db_ref_allele = db_mut['reference_allele']
    # Check if the positions match
    if db_ref_allele != invalid['expected'][1:-1]:
        not_matching.append({'ID': invalid['ID'],
                             'pos':invalid['pos'],
                             'db_pos':db_position,
                             'expected':invalid['expected'],
                             'db_expected':db_ref_allele,
                             'got':invalid['got']})
    
not_matching

[{'ID': 'MU64619292',
  'db_expected': 'AT',
  'db_pos': 16992349,
  'expected': 'T',
  'got': 'TAT',
  'pos': 16992349},
 {'ID': 'MU38161712',
  'db_expected': 'ATC',
  'db_pos': 169942588,
  'expected': 'TC',
  'got': 'CATC',
  'pos': 169942588},
 {'ID': 'MU63854834',
  'db_expected': 'GA',
  'db_pos': 169967250,
  'expected': 'A',
  'got': 'GGA',
  'pos': 169967250},
 {'ID': 'MU64350164',
  'db_expected': 'AT',
  'db_pos': 16996995,
  'expected': 'T',
  'got': 'TAT',
  'pos': 16996995},
 {'ID': 'MU63418111',
  'db_expected': 'G',
  'db_pos': 169982032,
  'expected': '-',
  'got': 'AGA',
  'pos': 169982032},
 {'ID': 'MU64994227',
  'db_expected': 'T',
  'db_pos': 169984697,
  'expected': '-',
  'got': 'TTA',
  'pos': 169984697},
 {'ID': 'MU63952827',
  'db_expected': 'AAAGT',
  'db_pos': 169987319,
  'expected': 'AAGT',
  'got': 'AAAAGT',
  'pos': 169987319},
 {'ID': 'MU64982903',
  'db_expected': 'CTT',
  'db_pos': 169989944,
  'expected': 'TT',
  'got': 'TCTT',
  'pos': 169989944},

In [130]:
print(len(not_matching), len(invalid_grch37))
len(not_matching) / len(invalid_grch37)

86 86


1.0

We can see that every mutation that failed to match with the Ensembl reference also failed to match with the database reference. This may mean that the `reference_allele` field was corrupted during the analysis, or, taking into account that the database was setup with the SSM file from the 25th ICGC's Data Release and the analysis was done with the equivalent file from the 22th Data Release, maybe the reference changed from a data release to another.

To test this, we compare the raw files from which the analysis was made, from which the database was deployed and the final mutations file.

In [132]:
## The mutation once again??
m_not_valid

{'ID': 'MU64619292', 'expected': 'T', 'got': 'TAT', 'pos': 16992349}

In [131]:
## Print the header line
! grep -P "^#[^#]" ../data/data-release-25/ssm.vcf

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO


In [142]:
## From data release 22
_ = ! grep MU64619292 ../data/data-release-22/ssm.vcf
print(_[0])

## From data release 25
_ = ! grep MU64619292 ../data/data-release-25/ssm.vcf
print(_[0])

## From the analysis file
_ = ! grep MU64619292 ~/results/all.BRCA-EU.mutations-context.tsv
print(_[0])

2	16992349	MU64619292	AT	A	.	.	CONSEQUENCE=||||||intergenic_region||;OCCURRENCE=BRCA-EU|1|560|0.00179;affected_donors=1;mutation=T>-;project_count=1;tested_donors=10638
2	16992349	MU64619292	AT	A	.	.	CONSEQUENCE=||||||intergenic_region||;OCCURRENCE=BRCA-EU|1|569|0.00176;affected_donors=1;mutation=T>-;project_count=1;studies=.;tested_donors=12198
MU64619292	T>-	chr2:16992349	chr2:16811082	INTERGENIC			BRCA-EU	


We can now clearly see that the analysis results file has the mutation string wrong! Just in case, we may try with another problematic mutation

In [136]:
## Convert the invalidated mutations to a pandas dataset for convenience
invalid_grch37_df = pd.DataFrame(invalid_grch37)
invalid_grch37_df

Unnamed: 0,ID,expected,got,pos
0,MU64619292,T,TAT,16992349
1,MU38161712,TC,CATC,169942588
2,MU63854834,A,GGA,169967250
3,MU64350164,T,TAT,16996995
4,MU63418111,-,AGA,169982032
5,MU64994227,-,TTA,169984697
6,MU63952827,AAGT,AAAAGT,169987319
7,MU64982903,TT,TCTT,169989944
8,MU130883,G,ATG,169997024
9,MU64367222,A,GGA,170005227


In [138]:
## Get an entry that is not already in the invalidated
m_not_valid_2 = invalid_grch37_df.iloc[1]
m_not_valid_2

ID          MU38161712
expected            TC
got               CATC
pos          169942588
Name: 1, dtype: object

In [141]:
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO

## From data release 22
_ = ! grep MU38161712 ../data/data-release-22/ssm.vcf
print(_[0])

## From data release 25
_ = ! grep MU38161712 ../data/data-release-25/ssm.vcf
print(_[0])

## From the analysis file
_ = ! grep MU38161712 ~/results/all.BRCA-EU.mutations-context.tsv
print(_[0])

2	169942588	MU38161712	ATC	A	.	.	CONSEQUENCE=DHRS9|ENSG00000073737|+|DHRS9-201|ENST00000327239||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-001|ENST00000357546||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-202|ENST00000412271||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-008|ENST00000421653||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-005|ENST00000428522||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-203|ENST00000432060||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-004|ENST00000436483||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-006|ENST00000450153||downstream_gene_variant||,DHRS9|ENSG00000073737|+|DHRS9-002|ENST00000602501||intron_variant||;OCCURRENCE=ESAD-UK|1|203|0.00493,BRCA-EU|2|560|0.00357;affected_donors=3;mutation=TC>-;project_count=2;tested_donors=10638
2	169942588	MU38161712	ATC	A	.	.	CONSEQUENCE=DHRS9|ENSG00000073737|+|DHRS9-201|ENST00000327239||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-001|ENST00000357546||intron_variant||,DHRS9|ENSG00000073737|+|DHRS9-

Now, we can make a summary of the problem.

Resúmen
====

El problema
-------------

El archivo `all.BRCA-EU.mutations-context.tsv` tiene el siquiente formato:

In [166]:
import pandas as pd

# Open the file
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 iterator=True)  # Lazily load lines since file may be large
# Read a part of the file
mutations = mutations_reader.get_chunk(10)
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,T>-,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


El archivo provee para cada mutación su posición en el ensamble genómico GRCh37 o GRCh38 y un campo `MUTATION` que especifica el alelo de referencia y el alelo mutado. El alelo de referencia es lo que se debería encontrar en la posición especificada en el ensamble genómico correspondiente.

Después de validar por dos métodos diferentes, comprobamos que en varios casos el alelo de referencia especificado en el archivo no concordaba con lo que se encuentra en el ensamble genómico en ~10% de las mutaciones, sino que se encontró que todas estas estaban desfasadas.

El problema real
----------------

Después de un análisis se llegó a la raiz del problema.

El archivo `all.BRCA-EU.mutations-context.tsv` es el resultado de analizar el archivo de mutaciones de ICGC, el cual tiene el siguiente formato: 

In [145]:
# Open the file
ssmfile_reader = pd.read_table('../data/data-release-25/ssm.vcf',
                               names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'],
                               comment='#',
                               delim_whitespace=True,
                               iterator=True)  # Lazily load lines since file may be large
# Read a part of the file
ssmfile_mutations = ssmfile_reader.get_chunk(10)
ssmfile_mutations

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,1000000,MU88749506,T,.,.,.,CONSEQUENCE=.;OCCURRENCE=NKTL-SG|23|23|1.00000...
1,1,100000022,MU39532371,C,T,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
2,1,100000049,MU87095619,TA,T,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
3,1,100000110,MU82202760,G,A,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
4,1,100000128,MU85052896,A,C,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
5,1,10000015,MU91785757,A,G,.,.,CONSEQUENCE=NMNAT1|ENSG00000173614|+|NMNAT1-00...
6,1,100000181,MU259333,G,A,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
7,1,100000340,MU64917116,AGGAATAGGGTGGGTCTGTGGCATTTAATCAGCGG,AGTATAGAGTGTAAAGAGT,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
8,1,100000409,MU1214865,G,A,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."
9,1,100000520,MU66254099,T,C,.,.,"CONSEQUENCE=||||||intergenic_region||,RP11-413..."


Los campos a destacar de este archivo son `REF`, `INFO` y el subcampo `mutation` de `INFO`.

In [147]:
import re
mut = ssmfile_mutations.iloc[0]

# REF
print('REF: ', mut['REF'])

# INFO
print('\nINFO:', mut['INFO'])

# INFO.mutation
print('\nINFO.mutation: ',re.findall('(mutation=.*?);', mut['INFO'])[0])

REF:  T

INFO: CONSEQUENCE=.;OCCURRENCE=NKTL-SG|23|23|1.00000;affected_donors=23;mutation=T>T;project_count=1;studies=.;tested_donors=12198

INFO.mutation:  mutation=T>T


En este ejemplo, el campo `mutation` especifica que en la referencia hay una 'T', lo mismo que indica `REF` pero no siempre es así.

In [149]:
## From data release 22
! grep MU64619292 ../data/data-release-22/ssm.vcf

2	16992349	MU64619292	AT	A	.	.	CONSEQUENCE=||||||intergenic_region||;OCCURRENCE=BRCA-EU|1|560|0.00179;affected_donors=1;mutation=T>-;project_count=1;tested_donors=10638


En este ejemplo, `REF` indica que en la posición 16992349 del cromosoma 2 debe haber 'AT', y `mutation` indica un cambio de 'T' a '-'!

Esto se explica notando que `mutation` es la versión condensada de `REF`>`ALT` y sólo especifica lo importante de la mutación, no necesariamente significa que si `mutation` es 'T>-', en esa posición hay una 'T'. `REF` y `ALT` son los que sirven para ubicar las mutaciones en la referencia y ellos si especifican lo que se encuentra en la referencia en la posición especificada.

Regresando al archivo en cuestión, `all.BRCA-EU.mutations-context.tsv` el problema es que a la hora de hacer el análisis no se conocía este detalle y se utilizó lo que venía en el subcampo `mutation` cuando deberían de haberse utilizado los campos `REF` y `ALT`, por lo que las mutaciones pueden estar desfasadas.

Como ejemplo, mostramos la mutación de arriba como se encuentra en `all.BRCA-EU.mutations-context.tsv`:

In [161]:
## From the analysis file
! grep MU64619292 ~/results/all.BRCA-EU.mutations-context.tsv

MU64619292	T>-	chr2:16992349	chr2:16811082	INTERGENIC			BRCA-EU	


Problem solution
==========

Knowing this, we can correct the `MUTATION` field in the file.

In [168]:
# The columns of the file
fields = ['MUTATION_ID',
          'MUTATION',
          'POSITION_GRCh37',
          'POSITION_GRCh38',
          'RELATIVE_POSITION',
          'OVERLAPPED_GENES',
          'CONSEQUENCE(S)',
          'PROJECT(S)']
#The index of the columns
idx = {name:i for i, name in enumerate(fields)}

with open('../../results/all.BRCA-EU.mutations-context.tsv') as file:
    # Print the header
    print('\t'.join(fields))
    #Correct line by line
    for line in file:
        # Split in fields
        fields = line.split('\t')
        # Correct the mutation field
        mut_wrong = fields[idx['MUTATION']]
        print(mut_wrong)

MUTATION_ID	MUTATION	POSITION_GRCh37	POSITION_GRCh38	RELATIVE_POSITION	OVERLAPPED_GENES	CONSEQUENCE(S)	PROJECT(S)
Gene: All

MUTATION
C>G
C>T
T>-
C>T
A>T
T>A
C>G
T>C
G>A
T>G
C>T
C>G
T>C
C>G
G>A
G>A
A>G
C>A
C>T
C>T
C>T
G>A
G>T
A>T
T>A
TC>-
T>C
T>G
G>T
G>T
G>A
A>T
T>C
G>C
C>A
A>C
G>C
C>T
G>C
C>G
G>A
C>G
C>T
T>C
G>A
C>A
A>T
G>A
A>T
G>C
C>G
T>C
T>C
C>G
T>C
C>G
G>A
A>T
C>T
A>-
C>T
G>C
T>-
T>C
G>T
A>T
G>C
T>G
C>T
G>A
G>C
C>T
C>T
A>G
C>T
C>G
C>A
C>G
C>T
A>T
C>T
A>C
G>C
C>T
C>T
->A
C>G
T>A
A>T
->TA
AAGT>-
G>A
C>A
TT>-
C>G
G>A
A>C
G>A
C>T
C>A
G>A
C>G
G>-
G>T
T>C
A>G
C>T
C>A
T>A
C>G
A>T
A>G
A>-
T>C
G>C
G>T
C>T
ATATAACAGAATT>-
A>G
A>G
C>T
C>T
G>A
C>T
A>T
A>G
G>A
A>C
A>G
A>T
G>C
T>C
G>C
->A
G>C
A>G
G>A
C>G
T>A
C>G
G>A
T>C
G>T
A>G
G>T
G>A
G>T
G>A
A>G
C>T
G>A
C>T
G>A
C>T
C>T
G>A
A>G
C>A
G>C
C>T
G>C
G>A
T>G
G>C
C>A
G>C
TTAA>-
A>-
G>A
A>G
T>C
C>T
A>G
A>T
G>C
A>-
C>G
G>C
C>T
C>G
A>G
T>C
A>C
G>C
C>G
G>A
C>G
G>A
T>C
C>T
T>C
A>C
C>G
C>T
T>C
T>-
G>A
T>A
C>A
C>A
T>A
A>G
G>C
G>T
C>T
C>T
C>T
C>A
C>G
C>T
C>A
C

G>C
A>-
G>C
G>C
G>A
C>G
C>T
C>G
C>G
A>T
G>T
C>T
A>G
G>T
->T
G>A
G>C
T>A
C>A
C>T
G>A
G>A
G>A
CAGACAAGA>-
G>T
A>C
G>C
T>A
G>A
C>G
G>C
G>A
C>T
T>C
C>G
A>T
G>T
C>T
C>G
C>G
T>C
G>A
C>T
G>C
G>T
A>C
C>T
T>C
T>C
C>G
G>C
A>T
G>T
G>A
G>C
T>A
G>T
C>G
A>T
G>C
C>G
G>C
A>G
G>A
G>A
G>A
A>-
C>A
T>-
T>A
T>-
A>T
A>-
A>G
T>C
A>G
T>C
C>T
C>T
T>A
G>A
A>-
A>T
C>G
A>C
C>G
G>A
A>-
->A
C>G
A>C
G>A
C>T
A>G
A>-
C>T
G>A
A>T
A>-
C>T
C>A
G>C
C>A
C>T
G>C
C>G
C>G
T>C
C>A
C>G
T>C
T>G
G>C
C>T
T>C
C>A
C>T
C>T
C>G
A>T
T>C
T>C
A>G
G>C
CATTGGAG>-
G>T
G>C
C>A
G>A
T>C
->A
C>G
G>T
->A
T>A
G>C
G>C
C>A
C>T
C>T
C>G
A>-
C>T
G>A
C>G
G>A
T>-
C>T
G>C
G>C
G>C
C>A
T>C
C>G
C>G
A>G
C>G
C>G
->A
C>T
C>T
T>G
A>T
C>T
G>A
->AT
T>C
C>G
G>C
G>A
A>G
G>T
G>T
G>A
T>C
TGGTAACCACTTTGGTATC>-
C>T
G>A
A>G
T>A
C>T
G>A
G>C
C>T
C>G
G>C
G>A
G>A
C>G
C>G
T>C
C>A
G>T
T>C
G>A
C>G
C>T
T>C
G>A
A>G
T>A
C>T
G>A
C>G
G>T
A>T
T>G
C>G
G>C
G>A
G>A
G>T
A>G
CTC>-
G>A
A>-
A>T
C>T
T>-
A>T
G>A
C>T
G>C
G>A
G>A
->A
G>C
C>G
G>A
C>T
G>A
T>-
C>G
T>A
G>C
C>G
A>G
C>A
G>T
C>T
G>C


G>A
G>C
G>T
C>A
G>T
C>G
G>T
C>A
T>G
G>C
T>C
G>C
G>A
C>G
T>C
G>A
G>C
->A
C>G
G>C
G>A
T>C
T>G
G>T
A>T
T>C
T>C
T>G
G>C
G>A
C>T
C>T
C>G
A>C
G>T
G>A
C>A
C>G
C>T
->A
C>A
A>-
G>T
C>G
C>T
->A
G>T
A>T
A>T
G>C
A>C
C>T
T>-
G>T
G>T
G>C
G>C
G>A
T>C
G>C
A>T
C>T
C>T
C>A
T>A
C>T
C>A
T>A
C>T
G>C
T>C
G>A
->A
G>C
A>G
A>G
G>A
A>G
->TAA
C>G
G>C
A>G
C>T
C>T
G>C
G>A
G>A
C>A
T>A
C>A
C>T
C>A
C>T
T>A
A>-
A>-
G>A
T>G
C>T
C>G
A>C
G>C
G>A
AAAGA>-
A>G
G>A
C>A
T>G
A>-
CT>-
T>C
T>A
C>T
C>G
T>-
C>G
G>C
G>T
G>T
C>G
A>T
T>A
C>T
A>G
G>C
C>T
A>G
A>C
G>C
A>C
T>A
G>A
G>-
G>T
C>T
G>A
G>C
A>T
A>-
G>C
G>A
G>A
G>A
->A
G>C
C>A
A>G
G>C
A>G
C>G
T>C
G>A
C>T
G>C
A>G
TC>-
G>C
A>C
G>A
C>A
G>C
C>T
G>A
->A
G>C
C>G
G>C
A>G
G>A
C>T
G>-
A>C
C>T
G>T
A>C
C>T
C>A
C>G
G>C
A>C
T>A
G>C
A>-
C>T
C>G
G>C
A>-
C>T
C>T
C>A
T>-
G>C
G>A
C>G
C>A
T>-
G>C
T>C
A>C
C>T
T>C
A>C
C>G
C>T
G>T
G>A
G>A
A>T
G>A
T>A
CAGTCCAACTT>-
G>C
A>C
C>T
G>C
A>T
AT>-
C>A
C>T
C>T
G>A
C>A
G>T
G>A
C>A
C>T
G>C
A>G
C>T
G>T
C>T
G>C
T>A
G>C
C>T
C>T
C>T
G>C
T>G
G>C
T>G
G>A
C>G
G>T
G>A
T

A>T
G>C
T>A
C>G
C>T
T>C
G>T
C>T
C>A
G>A
T>-
G>C
A>-
G>A
G>A
C>T
C>T
T>C
G>C
C>G
C>G
C>A
C>T
A>G
T>C
C>T
G>T
G>A
G>A
A>G
C>T
G>T
G>A
C>T
T>C
A>G
T>-
C>T
C>T
T>C
G>A
C>T
C>G
T>C
C>A
T>-
A>G
G>C
G>A
G>C
T>A
A>T
C>T
G>T
G>T
G>A
T>G
T>C
C>G
C>G
G>A
C>T
C>A
C>T
A>T
C>T
G>C
G>A
G>C
G>A
G>C
C>T
T>G
T>C
G>T
T>C
C>T
G>A
G>A
A>-
C>G
A>G
G>C
C>T
C>A
G>T
G>A
T>C
C>T
C>G
G>A
C>T
A>T
C>G
T>A
C>T
G>T
A>G
C>A
G>C
A>-
C>G
A>G
C>T
G>C
T>G
A>T
T>G
G>A
G>C
A>T
A>T
G>T
C>G
G>A
T>A
G>A
A>G
A>T
G>T
A>G
G>T
T>C
G>T
->T
T>C
A>G
C>T
A>G
A>T
A>G
G>T
A>-
G>T
G>C
G>A
A>T
C>G
G>-
G>A
C>A
T>A
C>G
T>-
A>T
G>C
G>A
G>T
C>A
C>A
T>A
G>A
G>A
->GA
G>T
C>G
A>T
->TGGGATAAAT
G>T
G>A
G>C
C>A
G>T
C>T
A>G
T>A
A>C
C>A
T>-
G>C
T>C
T>C
C>T
A>G
C>T
A>T
T>A
C>T
G>T
C>G
G>T
C>A
C>T
G>A
->A
C>T
G>T
C>G
C>A
G>T
T>C
C>T
C>T
G>A
C>A
G>A
G>C
G>A
C>T
A>-
A>T
G>C
A>G
G>C
G>C
C>G
G>A
G>C
G>C
T>C
G>T
G>C
G>C
T>A
G>A
C>G
A>-
C>A
G>A
G>A
G>C
A>G
T>C
C>T
T>A
G>A
C>T
C>G
C>A
G>T
C>T
G>A
G>T
C>G
T>A
T>-
C>T
C>T
G>A
A>G
C>G
C>T
G>-
T>A
C>G
C>G
G>A
T>

A>T
C>G
G>A
G>T
G>T
C>T
G>T
C>G
A>-
T>-
T>C
C>T
A>G
G>T
T>C
->T
T>A
->G
G>A
G>C
T>G
G>A
A>C
T>C
A>T
T>C
A>T
C>T
T>G
G>A
G>C
G>C
G>A
G>A
G>T
T>A
A>G
C>G
C>T
C>T
G>C
C>T
C>T
C>T
T>A
A>C
G>C
G>A
C>T
C>A
T>G
T>A
T>A
A>T
A>G
C>A
G>C
G>A
G>A
G>A
T>G
T>-
G>T
C>G
C>A
G>C
C>G
G>A
->A
G>A
T>-
C>G
G>A
C>G
T>A
G>A
C>T
C>A
A>C
T>C
A>G
A>-
C>T
T>C
G>C
A>G
T>G
C>T
A>T
C>G
G>C
T>C
G>C
C>T
T>C
G>A
C>T
G>C
->A
G>A
C>T
G>A
G>T
C>T
G>C
G>A
G>T
T>A
A>-
A>T
T>A
C>G
A>C
T>A
G>A
G>A
G>T
T>C
A>C
A>-
T>C
C>T
A>T
G>T
G>C
A>C
G>T
C>T
C>T
A>-
G>A
C>T
C>G
C>A
G>T
A>T
G>A
C>T
CTACT>-
G>C
A>T
A>G
A>C
A>C
G>A
A>T
T>G
G>A
A>T
T>G
A>G
TA>-
G>A
C>A
G>C
G>C
G>C
C>T
A>G
T>C
G>C
->T
T>C
A>T
G>C
G>C
G>A
G>A
T>-
A>C
A>G
T>-
C>T
G>C
T>A
T>C
A>T
T>C
G>A
T>G
C>A
T>C
G>A
T>G
G>T
T>G
T>C
G>C
->A
G>A
G>A
C>T
A>T
C>G
C>G
G>A
G>A
T>G
G>A
G>C
T>C
C>G
C>G
C>G
C>G
G>-
G>A
A>T
A>G
A>G
C>T
G>C
G>A
T>G
C>T
C>A
A>G
C>T
A>T
G>C
A>G
G>C
G>T
A>C
G>A
G>C
C>A
G>A
G>T
A>G
A>T
A>T
A>T
T>C
G>A
G>T
G>C
T>C
A>T
G>A
T>C
T>G
C>A
G>T
T>G
C>A
T>G
T>-
G>T

C>T
G>T
T>-
G>C
A>T
C>T
T>G
A>G
C>G
G>T
G>C
G>A
C>G
T>C
G>C
G>C
A>-
C>A
C>G
G>A
C>T
C>G
A>C
T>-
C>T
G>A
G>A
G>T
G>C
G>A
A>G
C>G
C>T
C>G
C>T
C>G
TTTTCCA>-
T>A
T>C
A>T
G>C
A>G
A>C
->A
C>T
C>T
C>G
C>A
C>T
A>G
A>-
T>C
T>C
C>T
G>A
A>-
G>A
A>T
T>-
G>C
A>C
A>T
C>T
->T
G>C
C>-
C>T
G>A
T>G
G>A
A>-
C>A
C>G
T>A
G>A
C>G
C>T
->A
G>T
C>T
A>G
C>T
G>C
G>C
A>T
G>C
G>T
A>G
C>T
C>T
G>-
T>C
T>C
G>A
C>T
G>C
T>C
G>T
C>G
->T
G>A
G>C
A>G
C>T
G>A
G>C
C>A
A>G
C>G
A>G
C>T
G>T
G>C
C>A
C>T
G>A
G>A
C>T
A>G
G>C
G>C
A>T
T>-
T>A
C>G
C>A
G>A
C>T
A>G
T>C
T>A
G>A
A>G
T>C
C>T
T>-
A>T
C>T
A>G
C>G
G>C
C>T
G>C
C>G
C>G
->T
G>T
C>A
G>A
G>A
A>-
C>G
C>T
C>T
G>T
G>A
G>A
T>-
G>A
T>A
C>G
ACTGGAGACATCTGTAA>-
C>T
C>G
G>T
G>C
C>T
C>G
T>C
C>G
C>A
C>T
C>G
A>T
G>A
C>G
G>C
T>G
G>C
A>-
G>A
G>C
T>A
C>G
C>G
G>A
C>T
G>C
C>T
C>A
C>T
T>G
T>A
G>T
G>T
T>A
C>T
C>T
C>T
G>A
C>T
G>C
C>G
C>T
->A
G>A
C>A
C>A
C>T
G>A
G>A
G>C
C>A
C>G
C>-
G>T
C>A
C>G
C>T
G>C
G>C
A>G
A>T
T>A
C>T
A>T
T>C
G>C
G>A
C>T
C>T
C>T
C>G
GAATGCTAAGGCACT>-
G>-
C>G
G>A
C>T
A>T
G>-
G>C


G>T
A>G
C>T
C>T
G>T
C>T
C>T
C>T
C>T
C>T
C>T
C>T
C>A
T>C
G>T
T>G
C>G
A>C
G>T
T>C
A>T
G>T
C>G
G>T
C>A
C>G
C>A
A>T
C>T
C>A
T>G
C>T
C>G
C>G
C>T
C>G
C>G
A>C
C>T
T>C
A>G
C>G
C>G
G>C
T>C
G>C
C>T
A>-
C>G
A>G
T>C
T>A
A>G
C>A
G>T
->A
T>C
T>C
C>T
C>G
T>C
A>-
C>T
C>T
T>C
C>G
A>-
C>A
C>G
A>G
G>C
C>G
G>C
A>G
G>A
C>T
A>G
T>A
T>G
G>C
C>G
C>T
A>G
C>G
T>C
G>C
T>C
T>C
T>G
G>C
C>T
C>G
A>T
T>A
G>C
A>-
C>T
C>G
C>A
A>G
G>T
C>T
C>G
A>G
C>G
C>T
T>G
G>A
A>C
A>C
C>T
C>T
A>G
C>T
G>A
G>C
G>C
G>C
G>C
T>C
C>T
A>G
G>C
T>-
C>T
G>C
C>T
C>T
A>C
C>A
C>A
C>T
G>C
C>T
C>T
C>T
A>G
C>T
C>G
G>T
T>A
A>T
C>T
G>A
G>A
C>T
C>A
T>C
C>T
G>A
G>A
C>T
G>C
G>A
C>A
G>T
G>A
G>T
CTTTA>-
->C
C>A
C>T
G>A
C>T
T>A
C>A
G>T
A>G
C>G
C>T
A>G
C>T
C>G
T>G
G>T
C>A
C>T
C>A
T>C
T>-
C>G
G>T
T>C
G>T
C>T
C>T
T>A
T>C
T>A
A>G
C>T
G>A
A>T
G>T
T>C
A>-
A>T
T>A
T>A
->T
T>C
A>T
G>A
T>C
G>T
C>T
A>G
A>T
G>C
A>-
A>C
C>T
T>C
C>G
G>A
T>C
C>T
G>T
A>T
A>C
G>A
C>A
G>C
G>A
G>C
G>A
A>C
C>T
G>A
C>T
C>G
C>G
A>G
G>C
C>T
G>A
A>G
T>-
C>T
T>G
A>G
C>A
G>C
C>T
T>C
G>C
A>T
C>T
G>A


G>A
G>A
G>C
G>T
C>A
T>C
A>G
C>T
A>G
C>T
G>A
T>A
C>A
G>T
C>A
A>-
C>A
C>T
T>A
C>A
A>T
T>-
G>C
C>G
A>-
A>C
->T
A>C
C>-
T>-
TAAGTTACTCAGCA>-
C>T
C>G
G>T
T>C
G>A
C>T
T>G
G>C
G>A
G>C
G>A
C>A
G>C
C>T
A>T
C>G
A>C
G>C
T>A
G>T
C>A
T>G
G>A
G>A
A>C
C>T
G>C
G>A
T>A
T>C
T>-
A>T
C>A
C>T
G>A
G>A
G>C
A>T
T>C
G>T
T>G
C>T
G>A
T>-
G>C
C>A
T>C
C>T
T>-
C>G
A>T
C>A
G>A
->TA
G>A
C>T
A>C
C>A
C>A
G>T
C>G
G>C
T>A
C>T
C>G
T>C
G>A
G>C
C>A
T>A
G>A
C>G
G>T
C>T
A>G
A>T
G>C
A>G
A>-
C>G
A>C
C>T
G>A
T>C
TTTA>-
A>C
G>C
A>G
G>A
A>T
C>A
->T
G>T
T>A
T>C
G>A
T>-
G>C
C>A
G>A
G>C
G>C
G>T
C>-
C>A
A>G
C>G
G>T
G>T
C>G
A>T
->T
A>-
C>T
C>T
G>A
T>C
A>C
G>C
G>C
A>G
T>-
G>A
G>A
G>A
C>A
T>A
G>C
T>C
C>T
C>T
A>G
C>A
G>A
G>C
C>A
C>A
A>C
C>G
G>T
G>A
C>T
C>G
G>A
CTT>-
G>C
C>T
G>A
G>T
G>A
TC>-
->TC
G>A
T>G
A>G
AGG>-
A>G
A>T
A>C
A>-
C>T
G>A
G>C
T>A
->CA
C>T
C>G
G>A
C>A
C>G
G>A
A>T
C>A
C>T
T>C
C>A
C>G
G>T
A>C
G>C
G>C
T>-
G>A
TTTTGA>-
C>G
C>A
G>T
T>C
C>T
C>T
A>G
C>A
T>-
T>-
G>T
A>-
C>G
C>A
C>T
G>C
G>C
C>A
G>A
->AAAG
G>-
C>T
G>A
C>T
C>T
A>G
A>-


A>G
C>T
TATAGCTAAGAGACTT>-
C>T
C>T
T>A
CT>-
G>C
A>G
G>C
->A
G>C
C>G
GA>-
C>G
C>A
C>T
G>A
C>-
C>G
G>T
G>C
T>C
A>-
G>A
C>G
G>A
G>A
G>A
T>C
T>A
G>A
C>T
G>A
G>C
C>G
C>T
T>G
A>G
C>G
G>A
T>A
G>A
T>G
C>G
G>C
G>A
G>T
T>C
C>T
T>C
C>G
C>T
G>A
G>A
C>T
A>T
A>T
G>A
C>G
C>T
A>C
A>G
C>T
C>G
G>C
C>G
G>T
C>A
G>A
G>C
C>A
C>T
T>A
C>T
C>G
A>G
G>A
G>A
C>A
C>T
A>-
C>A
C>T
G>A
G>A
T>A
T>C
T>C
C>T
C>A
C>G
A>G
G>A
G>A
G>A
->G
G>C
T>-
G>A
G>T
G>T
G>A
C>A
T>-
T>C
A>T
A>G
T>C
G>T
A>T
C>T
T>C
T>C
G>A
A>G
G>T
C>T
C>T
G>A
C>T
G>A
T>A
C>T
A>G
C>G
G>C
G>C
C>G
T>C
C>G
A>G
T>C
C>T
C>T
C>T
G>A
A>T
C>T
C>T
C>T
G>C
C>A
A>T
C>A
G>T
C>T
C>T
G>T
A>G
G>T
A>C
G>A
C>G
C>G
G>A
A>C
C>T
T>C
G>A
G>A
T>A
C>T
G>C
G>A
G>T
T>C
C>T
C>T
C>A
C>A
T>A
G>T
G>C
T>A
G>C
G>C
C>T
G>C
G>C
T>C
A>T
T>C
A>C
C>T
C>T
T>G
C>G
G>T
G>-
C>T
A>T
C>T
T>G
C>T
G>C
G>T
G>T
C>T
G>C
T>G
C>T
G>A
G>A
G>C
C>G
A>G
C>G
C>T
C>G
G>T
A>G
C>G
A>C
G>T
G>T
T>C
C>T
G>T
G>C
A>G
G>A
C>T
G>A
C>T
C>A
C>A
C>T
T>C
C>G
C>G
C>G
C>T
G>T
C>A
G>A
G>A
A>T
C>G
G>T
C>T
A>G
C>A
ATAA>-
A>G


C>G
C>G
G>T
T>A
C>A
T>-
T>-
G>A
C>A
A>-
T>C
A>-
G>C
A>C
->T
G>T
C>T
A>G
G>A
A>C
G>C
T>A
A>G
C>T
C>A
C>G
C>T
A>T
G>C
G>A
A>-
T>A
G>A
G>A
G>C
C>T
G>C
T>A
A>C
C>T
C>T
G>A
G>A
G>A
G>C
T>C
C>T
C>T
T>C
G>A
G>T
A>G
C>T
C>G
C>T
A>C
C>T
G>A
->A
A>T
G>T
T>A
C>G
C>G
A>C
G>C
->A
T>C
G>A
C>T
->T
C>T
A>C
A>C
->A
T>C
C>A
G>C
A>G
A>C
C>T
A>C
C>T
T>G
A>G
C>G
T>-
A>-
G>A
G>C
T>-
G>A
T>C
C>G
C>G
A>C
G>A
C>G
A>G
A>G
C>G
C>G
C>T
C>G
G>-
C>G
C>G
G>C
C>G
->T
C>T
G>A
C>A
G>A
T>A
G>A
C>T
GAGGCTCGGG>-
T>G
A>T
C>T
C>T
C>T
CTT>-
T>-
A>G
T>A
G>A
A>C
G>T
G>A
C>G
C>A
G>A
T>-
C>G
CT>-
T>A
T>A
C>G
G>A
C>G
->T
A>C
C>T
T>G
T>A
A>G
C>A
C>T
G>A
G>T
T>G
T>G
T>C
C>T
C>A
T>G
C>T
G>C
C>G
C>G
G>A
A>-
G>A
A>T
G>A
A>G
T>C
->G
G>C
G>T
G>T
A>T
G>C
T>G
A>C
C>T
C>T
C>A
T>-
C>A
T>C
C>T
GCACTCATAAAGATAATCAC>-
C>T
A>G
AC>-
C>G
G>C
G>C
A>G
G>C
G>A
C>T
TGAAAC>TGAAA
A>-
C>T
C>T
C>G
T>C
C>G
A>G
A>-
A>-
G>T
G>A
C>G
G>T
C>G
C>A
G>A
A>-
A>-
G>T
G>A
C>G
A>C
A>G
A>C
T>C
C>A
T>C
G>A
C>T
A>T
G>A
C>A
T>C
C>T
C>G
G>T
C>T
C>T
C>A
G>C
T>C
G>A
->A
A>G

G>A
T>C
T>C
G>A
C>G
C>T
C>T
G>A
A>T
C>G
G>A
G>A
G>A
G>C
C>G
A>C
A>G
T>C
G>A
G>C
C>G
C>G
G>A
A>G
G>A
C>T
T>C
T>A
A>-
C>A
C>T
A>G
A>G
C>G
G>A
T>A
T>C
C>T
C>T
C>T
G>A
G>T
C>T
T>C
C>T
G>C
T>A
A>G
T>C
G>C
A>C
G>A
G>A
G>C
G>C
A>G
A>T
G>A
->A
C>G
C>G
C>T
G>A
G>A
T>C
C>T
C>G
A>T
G>C
G>C
G>C
A>G
G>A
A>T
G>T
G>C
G>A
G>A
G>A
A>G
G>T
G>A
T>C
T>G
C>T
C>G
T>G
C>A
T>A
G>A
A>-
A>T
G>C
T>C
G>A
A>T
G>A
T>C
G>A
TCTGATT>-
C>T
A>G
A>-
A>G
G>T
A>G
G>A
C>T
C>T
C>T
C>G
C>T
G>A
A>T
A>C
T>C
C>T
C>A
T>G
T>C
C>A
A>-
G>C
C>T
G>C
T>A
T>A
G>T
A>T
C>T
T>C
G>A
T>-
T>G
T>C
G>-
C>A
->C
A>G
G>T
A>T
C>A
C>T
G>A
G>C
G>A
TT>-
T>-
C>T
G>A
->A
->A
G>C
C>T
C>A
G>A
A>C
T>-
G>A
T>-
C>T
C>G
A>T
C>T
A>T
G>T
A>C
G>T
T>C
C>G
G>C
G>A
A>G
CT>-
T>-
T>G
C>T
A>C
C>A
C>A
G>-
G>T
C>G
T>A
A>G
C>A
G>C
A>G
C>T
G>A
G>A
A>G
T>C
A>G
T>G
G>C
G>A
A>G
C>T
G>A
G>T
T>C
G>A
G>A
A>G
A>C
G>-
T>C
G>C
G>A
A>C
A>T
A>C
C>T
G>A
G>A
T>C
C>T
A>C
G>A
A>T
G>C
G>-
T>C
G>A
A>C
G>T
A>G
C>T
T>A
G>C
G>A
C>G
A>T
G>A
A>G
T>-
->T
C>G
G>C
A>G
T>C
G>T
A>C
C>T
C>G
G>A
C>T


G>A
T>A
T>C
C>T
G>A
C>T
G>A
A>T
A>G
A>C
A>G
T>A
G>C
T>C
T>C
G>A
C>G
C>T
C>T
T>-
C>A
A>G
G>T
T>A
G>A
A>G
->A
G>C
G>C
T>-
->TG
T>A
T>G
C>T
A>T
T>-
G>A
A>T
C>A
G>A
C>G
T>A
C>G
A>G
G>A
T>C
C>T
C>A
C>T
C>T
G>C
G>A
T>A
T>-
G>A
A>T
C>G
A>G
T>C
G>T
G>A
G>A
C>G
A>-
A>T
A>C
C>T
T>-
T>C
T>A
C>G
A>T
G>A
T>G
A>T
T>-
G>A
G>A
T>C
C>T
C>T
C>T
C>T
G>A
C>G
G>C
C>T
A>G
A>G
C>G
G>T
G>T
A>G
T>A
->A
T>C
G>C
C>T
C>G
G>T
C>T
C>G
C>A
C>T
T>-
T>A
T>G
A>G
G>C
C>T
G>C
C>T
G>C
C>T
G>A
G>T
G>C
G>C
G>C
C>G
C>G
T>-
C>T
TTGTTTATATTCAAAGGTTAA>-
T>A
G>C
C>T
T>G
G>A
T>C
C>A
G>C
C>G
C>A
C>T
C>A
G>A
G>C
G>C
T>-
TGAGCAAAAAGT>-
C>A
A>T
A>G
A>G
T>A
G>T
A>G
T>G
G>-
C>T
G>T
T>C
TG>-
A>G
G>C
G>C
T>A
C>T
A>G
G>C
G>C
A>G
G>A
->A
C>A
C>T
AATATTTCTT>-
T>G
A>-
T>-
A>C
C>T
T>A
G>T
G>A
C>G
G>C
A>C
C>A
T>A
A>G
G>C
T>A
T>C
A>G
A>T
G>A
G>A
G>C
G>A
C>T
T>A
A>G
C>T
A>C
A>G
T>A
G>T
G>C
->A
C>A
T>C
A>T
G>C
G>T
A>G
G>A
T>-
C>T
G>A
G>A
C>G
->CAAGGATTGGTTT
G>A
C>G
G>C
C>G
C>T
T>G
C>G
G>A
C>T
C>A
A>-
A>C
C>T
T>C
C>A
C>T
C>T
G>C
A>T
C>A
A>G
C>T
G>

T>C
CACTTGCTGGCTC>-
C>T
A>G
C>A
->T
C>G
G>A
T>-
C>G
A>T
C>A
A>T
A>G
C>T
C>T
G>A
A>C
C>T
G>A
A>G
T>-
T>A
G>A
A>G
A>G
A>-
C>T
C>G
T>G
A>C
->AA
AA>-
->T
T>-
C>A
G>C
T>G
->T
T>A
T>A
T>C
G>A
TGCTGTTG>-
G>A
C>G
C>T
C>G
C>T
G>A
A>C
G>A
G>T
C>G
G>A
G>A
G>A
G>T
A>T
A>G
G>-
C>T
G>C
A>T
->T
A>G
T>-
T>C
C>G
T>-
->A
T>A
G>C
G>A
C>T
C>A
G>A
G>A
C>T
A>T
A>G
G>C
T>C
T>A
G>C
A>T
A>G
C>G
T>A
C>G
C>G
T>C
G>T
A>T
C>A
T>A
T>C
C>T
A>-
G>A
T>A
A>-
C>A
G>A
G>A
A>T
C>G
A>-
T>G
A>T
T>-
C>A
G>A
A>T
G>A
A>T
C>G
G>A
C>G
C>A
T>A
C>A
T>-
T>G
A>T
C>G
A>C
C>G
G>T
C>G
C>A
C>T
->A
G>A
->T
C>-
C>T
C>G
G>T
A>-
G>-
C>T
T>C
T>A
G>C
C>T
G>A
T>A
T>G
C>A
C>T
A>G
T>C
C>A
C>G
A>G
G>A
A>G
A>G
C>T
C>A
G>T
A>G
T>C
G>A
T>G
A>G
T>A
G>T
T>G
->C
C>-
C>G
A>T
G>T
C>T
G>A
C>G
T>-
C>T
C>T
G>C
C>G
T>C
C>T
->T
C>A
A>T
A>G
A>C
A>T
T>A
C>G
TATAA>-
C>T
C>T
G>A
C>T
C>T
G>A
C>G
G>T
T>C
G>T
A>G
C>T
A>G
G>C
A>G
C>G
C>G
G>A
C>A
A>C
T>-
G>T
G>C
T>G
C>T
C>G
C>T
C>T
T>A
G>A
C>T
C>T
A>T
A>-
G>A
G>C
G>C
G>A
C>T
C>A
C>A
A>C
G>C
A>T
G>A
T>C
A>-
T>-
A>G
C>T

C>A
->T
G>A
C>T
A>T
C>T
A>C
A>G
G>A
G>C
G>C
A>G
A>-
G>C
G>T
C>T
A>C
C>T
A>T
A>T
T>G
C>T
C>T
C>G
->A
C>T
T>-
C>G
T>C
T>C
T>C
A>T
G>C
G>A
G>A
A>G
C>A
G>C
G>C
A>T
G>C
A>-
G>C
G>A
C>T
G>A
C>T
G>A
A>-
C>A
C>A
G>C
C>T
C>A
C>T
AGATACCTCTG>-
A>G
A>C
A>-
G>C
G>C
C>T
C>G
G>C
T>C
C>T
C>T
A>G
G>C
A>T
G>C
A>C
A>T
G>C
C>A
C>G
G>A
T>-
T>C
G>T
T>C
C>T
C>T
T>G
C>G
G>T
G>T
G>T
C>T
C>T
->T
C>T
G>C
C>T
T>-
AAG>-
T>-
G>A
A>T
G>T
T>G
G>A
G>C
A>G
G>A
A>G
G>T
G>A
T>A
C>G
C>T
C>T
G>A
->A
A>C
T>C
T>C
C>T
C>T
T>G
T>G
T>A
G>T
TGT>-
C>T
GTTTCATGCTGTGCACTTATGTTAGAACAAATTATTGTTTTCAA>-
C>A
A>T
T>G
C>A
G>A
G>T
G>A
TTTTG>-
C>T
C>T
G>T
G>C
A>-
G>C
TATAT>-
A>G
C>A
A>G
T>-
A>-
A>-
C>-
C>T
G>C
C>A
T>G
G>A
T>-
G>T
G>A
G>C
T>G
G>A
G>A
T>A
C>T
C>G
C>G
T>C
C>G
C>G
T>C
G>A
T>-
A>T
T>G
T>-
A>C
C>G
T>A
C>T
G>C
A>G
C>T
->A
G>C
A>C
A>T
G>A
C>T
T>C
C>G
A>-
G>A
TTTTATGAATGTTGTGTATTAATAACTTTGACTTCCTTC>-
->TATA
C>T
G>A
T>A
G>C
G>A
C>A
A>G
G>A
G>T
A>G
A>C
A>G
C>T
T>-
G>T
G>A
C>A
C>T
C>A
G>C
C>T
G>C
T>A
A>C
T>-
C>T
A>G
C>T
A>T
A>T
G>C
A>

G>C
G>T
A>T
C>A
G>A
T>C
T>-
C>T
C>A
G>A
C>G
T>C
G>C
C>T
G>A
G>A
A>G
C>A
C>G
C>T
T>A
C>T
T>A
T>A
T>C
C>A
G>T
C>T
T>C
G>T
C>G
A>G
G>A
C>T
G>C
G>C
C>T
G>C
C>T
G>T
->G
G>C
A>-
C>G
C>G
T>-
C>T
T>G
A>G
C>A
G>C
C>T
C>A
T>C
T>C
T>A
C>T
G>C
C>A
C>A
A>G
C>A
T>C
C>G
T>-
T>A
->A
G>A
T>-
A>T
A>G
C>A
A>T
T>A
A>G
A>G
T>C
C>T
G>A
G>A
G>T
T>A
C>G
G>C
G>A
T>A
C>T
G>C
C>T
G>C
C>T
T>-
G>C
G>C
T>C
C>T
G>A
C>A
G>C
A>G
G>A
G>T
T>C
A>T
GAG>-
C>T
C>G
G>A
A>G
G>C
G>A
T>A
G>T
T>C
G>C
C>G
G>C
C>T
G>A
G>T
C>T
A>-
C>T
T>C
G>C
TATGGACTAGTTTCTTCA>-
C>T
G>-
A>G
A>T
G>A
C>T
G>C
C>G
C>T
A>G
C>T
A>-
A>G
C>T
T>A
G>A
C>T
A>T
A>G
C>G
->A
T>A
G>A
T>C
T>A
G>A
A>G
C>G
A>G
C>G
C>A
T>C
C>G
G>C
C>G
G>A
C>G
T>A
C>G
G>A
C>A
C>A
A>-
G>C
A>-
AAAG>-
->T
C>G
G>A
G>A
C>G
G>A
T>A
T>C
C>G
A>T
C>A
G>-
G>A
T>-
C>T
T>A
T>A
C>G
C>T
T>C
T>G
G>T
C>G
A>-
A>T
A>C
C>T
C>G
C>A
C>G
C>T
G>T
C>T
G>C
->A
T>C
T>A
G>C
T>G
A>-
G>A
T>A
ATGTGTCGAC>-
AAACAAGTAAATACATA>-
A>G
GGTGGTGTTAGCAA>-
G>C
C>T
A>G
T>C
C>G
C>T
C>G
G>T
A>G
G>A
C>G
G>T
A>G
T>G
T>-
G>A
A>T


C>T
C>G
C>T
T>A
A>C
A>G
G>C
G>A
G>A
C>G
C>-
G>C
G>A
A>T
C>A
G>A
G>C
G>A
T>G
G>A
C>G
T>-
T>-
C>T
T>-
T>A
G>A
->T
A>G
A>-
C>G
G>T
G>A
C>G
C>G
G>C
G>A
TG>-
C>T
G>A
C>G
G>A
A>T
G>C
G>A
G>A
G>A
C>T
C>G
T>G
T>-
->A
C>G
A>T
C>T
G>C
G>T
G>A
A>C
A>G
G>C
A>G
C>T
G>T
G>A
C>T
C>G
G>A
C>A
G>A
C>T
C>T
C>T
C>A
C>T
G>C
G>A
C>T
G>T
C>G
A>G
G>C
A>T
G>A
A>T
G>C
C>T
G>A
G>A
A>T
TGT>-
C>T
->GT
C>T
G>A
C>G
C>T
G>T
C>T
G>C
C>T
C>T
C>T
C>T
G>A
A>G
C>G
C>G
A>T
C>T
T>G
C>G
T>A
A>T
G>T
G>A
C>T
G>C
G>A
G>A
G>C
T>G
A>G
G>A
G>A
C>T
T>G
G>T
C>T
T>C
T>G
C>G
T>C
G>C
A>-
G>A
A>G
G>T
C>G
C>-
A>G
C>A
T>A
T>C
G>A
A>G
C>G
G>T
C>T
C>A
G>A
C>A
T>C
G>T
A>T
G>A
A>G
A>-
T>C
G>A
G>A
G>C
G>C
C>T
G>C
T>A
G>C
G>C
C>T
C>T
A>C
T>-
C>T
C>G
C>A
C>G
C>G
TATC>-
A>T
C>A
C>T
C>T
T>G
T>-
C>T
G>A
T>C
AATCCC>-
G>A
T>C
T>A
G>C
->A
A>T
G>A
G>A
G>A
T>G
->A
C>T
A>G
G>A
C>T
C>A
T>A
G>C
A>G
T>-
C>G
C>A
G>T
A>G
C>T
G>A
G>A
G>T
C>T
C>T
C>G
A>T
T>C
C>A
C>T
A>T
G>T
A>T
G>A
G>T
A>C
T>C
C>T
C>A
A>T
A>G
C>A
C>T
C>G
G>T
C>A
G>A
A>T
C>T
G>A
G>T
G>C
T>G
T>A


G>A
G>C
T>C
C>G
A>T
C>T
->A
A>G
C>A
C>T
G>A
T>C
C>G
TAA>-
C>T
C>T
T>A
G>C
T>C
T>A
A>T
T>C
T>G
A>-
G>C
A>G
C>T
C>T
G>T
C>T
A>-
G>A
C>T
G>A
C>G
A>-
C>T
A>C
T>C
G>C
G>A
C>A
C>T
A>-
G>C
G>A
C>T
A>T
C>G
C>G
C>G
G>A
C>A
T>G
TTTGGTATTGATTTTTTTGAACAACTATC>TTTTTGAACAACTATCAAATTAA
T>C
T>A
C>T
T>C
C>G
C>T
G>C
G>A
T>C
G>T
C>G
C>A
A>G
G>A
T>-
T>C
T>C
C>G
T>A
C>T
A>G
C>T
G>C
A>C
A>-
T>A
G>C
C>A
T>C
A>T
T>C
G>C
C>A
C>G
A>T
G>T
T>A
A>T
C>T
A>G
ACTC>-
C>T
C>T
T>C
A>C
T>C
C>T
C>A
T>C
C>T
T>G
C>T
C>G
T>C
C>T
->C
A>G
C>T
C>G
C>A
CCGCAGCCATTTCTT>-
C>A
C>T
G>A
C>A
C>G
A>T
G>C
G>C
A>T
C>T
TGT>-
A>T
C>A
->A
C>A
G>A
G>A
C>T
T>-
C>G
T>G
C>A
C>G
A>T
T>C
T>-
T>C
C>A
G>-
C>T
T>A
T>G
T>-
C>G
G>A
A>G
G>C
G>T
A>T
G>A
C>G
G>A
G>A
G>A
C>A
G>A
T>C
G>T
G>A
G>A
A>-
T>G
C>G
A>T
C>A
C>A
G>A
G>A
T>A
A>T
A>T
C>G
A>G
C>G
C>G
A>G
C>T
T>G
T>C
T>A
C>T
C>G
C>G
T>C
A>T
G>T
T>C
C>G
G>C
G>C
C>A
A>G
A>C
T>A
A>C
A>G
->AAAT
G>C
A>G
A>G
G>T
A>G
C>T
T>-
G>T
G>C
G>C
C>T
C>T
A>T
G>C
A>T
G>A
C>T
C>T
A>T
T>A
A>G
T>C
A>-
G>C
G>A
C>T
C>G
G>A
C>

T>C
A>C
C>A
T>-
C>A
C>G
G>C
C>A
G>A
->A
A>-
C>A
G>C
T>A
->C
C>A
C>T
C>G
A>C
G>A
G>A
C>-
T>C
G>A
C>T
C>T
G>T
C>G
C>T
T>C
C>A
T>C
T>A
C>A
C>T
G>T
C>T
A>C
A>-
C>A
G>A
T>A
T>A
C>A
C>G
C>T
G>A
T>C
A>T
T>A
C>A
A>-
C>G
C>T
A>-
A>C
G>C
G>C
G>C
C>A
T>A
C>G
C>A
T>C
C>G
T>A
T>C
G>C
A>T
C>T
T>C
T>C
T>C
A>-
G>A
G>A
A>G
C>T
A>C
->A
C>G
G>T
C>A
C>T
G>C
T>G
C>A
A>G
G>C
A>G
C>A
->A
A>G
T>G
C>G
G>A
T>C
C>G
T>A
A>T
T>-
G>C
C>A
G>A
A>G
T>A
ATT>-
C>A
G>T
CCTCCTTTTTTGGGCACATTTCTCTTTCCT>-
C>T
C>T
C>A
G>A
A>T
G>C
A>G
A>G
A>T
C>A
A>C
G>C
A>C
G>A
A>T
G>T
T>C
A>G
C>A
T>A
T>C
G>A
AAGTCACACACAAA>-
G>T
C>T
G>T
G>T
T>C
T>A
C>T
C>G
C>T
C>T
C>T
G>T
T>C
G>A
G>A
C>A
G>C
A>G
G>A
C>T
T>G
T>A
->T
G>C
->T
G>A
C>T
C>G
A>C
G>C
G>T
->T
G>C
C>G
A>T
G>C
G>T
T>C
A>-
C>T
A>T
->A
->C
T>A
A>T
T>-
T>A
T>A
G>A
C>T
C>-
G>C
A>G
T>G
A>T
C>A
G>A
C>A
G>C
G>C
C>A
->A
G>A
C>T
C>T
A>C
A>T
AG>-
C>A
A>C
A>T
G>A
C>T
G>C
C>G
G>A
C>T
G>A
->T
T>A
G>C
C>T
G>A
C>G
G>T
A>T
G>A
G>A
A>C
C>A
T>A
C>A
G>T
T>G
A>T
G>A
C>T
C>A
A>-
A>G
->T
G>T
C>A
G>A
C>G
TTA

T>-
C>T
G>A
G>A
G>C
T>C
A>C
G>A
C>A
G>T
G>C
A>C
G>T
T>G
C>A
A>C
C>G
C>T
C>T
A>G
C>T
C>T
C>A
C>G
T>A
T>G
A>G
T>-
G>A
A>T
G>C
G>A
C>T
T>C
A>G
G>A
G>C
C>G
T>A
G>C
G>A
G>T
G>C
A>-
A>-
A>T
C>A
G>A
G>A
G>C
G>A
G>C
G>T
C>A
A>G
C>T
C>A
G>C
G>C
C>T
G>A
G>C
C>T
T>A
T>C
C>T
G>C
G>A
A>G
G>C
A>G
T>C
A>G
T>C
A>-
A>T
G>C
G>T
T>C
C>G
G>A
G>A
A>G
T>C
T>A
A>G
T>G
C>T
T>C
C>A
G>A
A>G
A>-
T>G
G>A
T>A
G>T
->T
G>A
T>-
C>G
C>G
C>T
G>A
G>A
C>G
A>G
A>G
C>T
C>T
C>A
C>G
T>C
G>C
G>T
A>C
T>-
A>G
C>T
A>T
C>T
A>G
->A
G>A
C>T
C>T
T>A
C>A
G>A
C>A
C>T
ATG>-
G>T
G>A
C>A
G>A
A>G
A>T
G>C
G>A
A>G
T>A
T>A
C>T
T>C
C>A
G>T
->T
T>A
T>C
G>C
A>T
A>G
A>G
C>G
C>T
C>G
G>A
G>-
T>C
T>C
T>C
G>A
C>G
C>A
C>A
C>T
G>T
G>T
G>T
G>A
->T
T>-
A>G
T>A
C>T
G>T
A>G
G>A
A>T
G>A
ACCCCCTTC>-
C>G
G>C
G>C
T>C
A>G
A>-
A>-
A>T
C>A
G>C
T>C
T>A
T>C
A>T
C>T
T>C
C>T
T>C
G>C
G>A
G>T
G>A
G>A
T>-
C>T
A>T
G>T
T>C
G>T
C>A
A>T
C>A
A>-
C>T
G>T
G>A
->T
G>A
C>A
C>T
G>A
G>A
AGAG>-
G>C
T>C
A>T
C>T
C>T
G>A
C>T
G>T
G>T
G>A
C>T
G>T
T>C
A>C
A>G
T>G
C>T
C>G
T>G
T>C
G>C
G>A

A>G
T>C
A>G
G>T
G>A
G>T
C>A
A>T
C>G
G>A
T>C
G>T
T>A
G>A
G>C
T>A
C>A
A>T
A>G
T>A
T>-
A>G
G>T
C>T
T>C
G>A
C>G
A>T
A>G
G>T
A>C
C>G
C>A
G>T
C>T
C>T
T>C
G>A
C>T
G>C
A>C
G>A
A>G
C>T
C>T
T>A
C>A
G>C
C>G
G>A
C>G
G>A
G>T
C>A
G>A
G>A
C>G
A>G
->T
T>C
C>A
G>A
->T
G>T
C>G
T>A
T>C
C>G
C>A
A>T
T>C
C>G
C>T
AA>-
A>G
G>T
G>C
C>T
G>A
T>C
C>T
C>T
A>C
A>C
G>A
G>T
G>C
A>C
G>T
G>T
T>-
A>G
G>C
A>-
G>A
C>T
G>A
C>A
C>T
C>T
C>T
G>C
A>G
G>C
T>-
G>A
T>C
C>G
G>A
C>T
A>-
T>A
C>G
C>T
G>A
C>G
G>A
G>A
C>T
C>T
C>T
G>C
C>G
C>T
T>C
->T
AGA>-
G>A
G>A
C>T
G>A
C>T
G>T
G>A
T>C
C>G
C>G
G>C
C>T
G>T
AATTCAGTACCCAGCTTTAAA>-
C>A
A>T
G>A
G>C
A>-
C>T
C>G
->T
C>A
G>C
C>T
G>C
->A
T>C
G>C
C>T
C>A
G>A
A>C
C>G
A>G
G>A
G>A
T>A
C>T
T>A
G>T
C>A
A>T
A>G
C>G
G>A
G>A
G>C
G>A
T>C
C>G
G>C
G>C
G>A
C>G
T>G
C>G
C>G
C>T
C>A
A>G
G>C
G>A
C>G
G>C
G>T
T>-
G>A
C>G
A>G
G>A
G>T
A>C
A>G
A>T
G>A
A>C
C>G
G>C
A>G
C>G
G>A
T>G
C>G
C>T
G>T
C>T
G>A
C>T
G>A
G>A
G>C
C>G
C>T
T>C
G>C
ATT>-
TT>-
G>T
A>T
G>T
A>T
A>G
G>A
C>T
T>C
TTG>-
G>A
G>A
G>A
T>C
G>T
G>A
C>G
G>T
G>C


G>A
C>T
C>T
A>C
C>A
T>G
A>T
T>A
C>G
G>C
A>G
C>T
G>A
C>G
G>T
C>G
T>C
G>A
TC>-
A>G
G>A
T>-
C>T
G>A
G>C
C>T
C>G
C>G
G>A
T>G
C>G
C>A
A>G
AG>-
C>T
G>A
A>G
C>G
C>T
C>G
T>-
C>A
G>A
C>T
C>G
C>T
G>C
T>C
AGA>-
G>C
A>T
C>T
C>T
C>G
T>C
C>T
G>C
ATA>-
G>A
T>G
G>A
T>G
A>T
A>G
C>T
A>C
G>C
G>A
G>A
A>G
C>T
G>C
A>T
G>C
C>A
T>G
G>C
A>T
G>C
G>C
G>A
C>A
C>T
C>T
C>T
G>A
G>C
C>T
A>G
A>C
G>C
G>C
A>G
C>T
T>A
T>-
->T
A>T
A>C
C>A
G>A
G>C
C>T
A>-
G>A
C>A
A>T
G>T
C>T
G>A
G>C
G>-
C>T
G>C
G>C
G>A
G>A
C>G
A>G
G>A
C>T
G>A
->A
C>G
->T
T>C
A>G
G>A
TCACT>TAGA
C>G
T>A
C>G
C>T
C>T
C>G
G>A
T>-
TGACCACAGGGATAAAA>-
G>A
C>T
G>A
G>A
G>C
C>T
G>A
G>C
A>-
T>C
C>A
C>A
ACTAGAGGTTTAGTTTAAGGC>-
G>C
G>A
G>A
T>A
C>T
C>A
A>G
A>G
T>C
C>A
G>C
C>G
G>T
C>T
C>T
G>C
G>A
A>T
A>C
A>-
G>T
T>C
G>C
A>C
C>T
G>C
C>T
C>G
A>G
G>A
C>T
->A
GGGAAAAATTTAAAGTTTAAAGCAAC>-
G>C
A>G
T>A
C>T
T>-
T>G
G>C
G>A
A>C
C>A
T>C
C>T
G>A
C>T
C>G
C>T
G>C
G>A
A>G
CCTCCTTACTC>-
G>A
C>G
G>C
G>A
T>C
G>A
G>C
T>G
A>-
G>A
A>C
G>A
G>A
A>G
T>C
C>T
A>G
G>C
T>C
C>T
G>T
C>A
A>-
T>A
G>A


C>G
T>-
A>T
T>A
C>T
C>T
A>T
G>C
C>T
C>G
C>T
C>G
C>T
C>A
C>G
A>G
C>T
C>A
A>T
C>T
C>A
G>T
G>T
G>C
C>T
C>G
T>C
C>T
A>C
G>A
T>C
A>-
A>T
C>G
A>C
C>A
G>A
C>T
->A
A>-
T>C
C>T
A>C
G>C
C>G
G>A
T>G
G>A
A>T
T>C
C>T
T>G
T>A
A>T
A>G
C>T
G>T
G>T
C>T
A>T
C>A
A>T
T>A
->A
T>C
G>C
C>T
C>T
T>C
CCCTG>-
C>T
G>A
T>C
C>A
A>C
C>G
T>C
G>A
C>G
C>A
C>T
A>G
A>T
C>T
C>A
A>-
->A
T>A
T>C
T>G
T>A
C>G
A>G
C>T
G>T
A>T
A>G
C>G
T>C
A>-
C>T
C>T
G>T
C>G
G>A
G>A
A>T
A>-
->A
G>A
T>A
C>A
C>T
G>A
T>G
C>A
G>A
G>A
C>T
T>-
->T
T>G
G>T
T>A
T>C
C>G
->A
C>T
G>T
A>T
G>T
G>A
G>T
A>G
T>C
C>T
C>T
C>T
C>T
G>A
C>T
C>T
C>T
G>C
C>T
C>T
G>T
T>C
A>T
T>A
G>A
A>T
G>A
A>T
C>T
C>G
C>G
A>T
A>T
C>G
G>C
T>A
T>C
G>A
C>A
T>C
C>T
C>T
T>C
C>A
C>A
C>T
T>C
T>G
A>G
G>A
C>T
G>A
A>G
C>T
C>G
T>C
G>A
G>A
G>T
C>T
C>T
C>G
G>A
T>A
C>T
G>A
G>A
->A
C>A
G>A
A>T
T>G
C>A
C>G
C>A
T>A
G>A
T>A
G>A
T>G
A>T
C>G
T>C
C>T
G>A
C>T
G>A
C>A
C>G
A>-
C>T
T>C
G>T
C>T
T>C
C>G
G>A
C>T
C>T
G>C
G>T
G>A
G>T
GAATTGCAAGCTAAAACACTTATAG>-
A>C
A>-
C>G
G>C
T>G
C>A
T>G
C>T
A>C
A>G
G>C
G>A
G>T


TC>-
A>G
G>A
T>C
G>C
A>-
G>C
C>T
C>T
C>T
TGCACAGAGAAGGAACCTATAGCC>-
A>G
T>C
G>C
C>G
A>G
AT>-
G>T
C>A
C>A
C>T
C>G
C>T
T>A
A>G
C>G
A>T
C>T
C>T
->T
C>G
C>T
C>T
G>T
G>A
G>C
C>G
T>G
C>T
C>T
G>C
G>C
T>C
G>A
T>G
A>C
A>T
AGGCCGCGGGG>-
G>A
G>T
G>T
G>A
T>A
C>G
C>T
T>G
G>A
C>G
C>T
T>A
A>C
C>T
G>T
G>T
G>A
C>T
C>T
G>T
G>A
G>C
G>A
G>A
C>A
G>C
G>A
C>T
G>T
G>A
G>A
T>C
G>T
C>G
T>C
A>T
A>G
G>A
G>A
C>T
A>-
C>G
A>C
G>A
G>T
C>G
T>G
G>C
C>T
G>T
G>C
T>C
C>G
G>C
A>T
T>C
C>G
C>T
A>G
C>T
C>A
G>A
G>T
G>A
C>G
G>T
G>C
G>C
C>T
C>T
T>A
G>C
C>T
C>T
A>G
C>T
C>T
C>G
G>C
C>T
C>G
C>G
C>G
C>T
C>G
G>C
G>A
C>T
C>G
C>T
C>T
A>T
T>A
T>-
G>A
C>T
C>T
A>C
G>A
A>C
G>A
G>T
C>T
C>A
G>T
G>T
G>A
C>G
G>C
G>A
G>A
C>A
C>T
C>A
A>G
G>C
G>C
G>C
C>T
G>C
G>C
C>T
C>T
G>C
T>C
C>G
T>-
G>C
G>C
T>-
C>T
G>C
C>T
T>A
G>T
->AGAACAAGG
TCCA>-
G>A
->T
G>C
C>T
C>T
C>T
C>A
G>T
G>C
C>T
G>A
A>-
G>A
G>A
C>T
G>A
C>T
->C
G>C
T>-
G>A
->T
G>A
C>T
T>-
C>G
G>C
G>C
G>C
G>A
A>G
A>-
C>G
A>T
T>C
C>-
C>T
G>T
C>T
C>T
C>T
->AA
->A
C>G
G>C
C>T
G>T
T>C
C>G
G>A
T>A
G>T
GA>-


T>C
T>C
T>G
G>C
C>G
A>-
AGAAT>-
->A
C>T
G>C
C>A
C>T
C>T
C>G
C>G
G>A
C>G
C>T
C>G
C>G
A>G
T>A
A>G
G>T
C>T
C>G
C>G
A>G
C>G
C>A
T>A
G>A
C>G
T>A
C>G
A>C
A>G
A>G
C>G
C>T
C>G
A>C
C>T
G>C
C>A
C>T
T>G
G>A
G>A
G>C
A>-
AG>-
T>C
C>T
G>A
T>C
G>A
T>A
G>A
A>G
C>-
T>C
G>A
C>A
C>T
C>G
C>T
C>T
C>T
A>T
T>G
C>G
T>-
->T
A>-
C>T
G>A
G>C
G>C
AT>-
T>A
A>-
G>A
G>A
A>-
C>T
A>G
T>A
A>G
C>T
A>G
T>C
C>T
T>-
A>T
A>T
A>-
C>G
G>A
G>A
T>A
C>G
C>A
C>G
G>A
C>G
C>T
G>A
->T
G>C
G>A
C>T
A>T
G>C
C>T
CCAAAAAAAAAAAAAATTAG>-
T>-
G>T
C>T
G>T
G>A
G>T
A>T
A>G
T>-
G>T
G>A
T>C
T>A
A>C
A>G
T>A
C>T
G>A
C>G
A>T
G>C
G>A
G>-
G>-
G>T
T>C
G>C
C>T
G>T
C>T
C>A
G>A
T>-
->T
C>T
G>-
G>A
G>A
C>T
C>T
T>A
G>C
T>G
C>G
A>-
C>G
C>A
->T
G>A
A>G
A>C
G>C
G>C
A>T
T>-
T>C
A>T
A>C
C>T
C>A
T>C
T>A
A>-
A>G
C>G
C>T
G>T
G>A
G>T
C>G
C>A
G>A
C>A
T>C
T>A
A>T
T>-
C>G
G>C
G>C
G>A
C>G
G>T
C>G
T>C
C>G
C>T
C>T
G>A
C>A
C>T
T>A
C>T
G>C
G>C
A>-
G>A
G>T
C>T
G>A
C>A
A>T
G>C
G>A
G>A
G>A
T>C
C>G
C>A
A>-
T>C
C>A
G>A
A>G
T>C
A>G
A>-
G>C
C>A
A>C
G>A
C>T
G>C
T>-
G>A
G>A
C>T
G>A

C>G
C>G
C>G
T>-
C>T
T>C
G>A
T>A
A>G
T>G
A>C
G>A
C>G
G>A
G>A
C>A
A>T
G>T
A>G
C>G
A>G
G>A
G>C
G>T
->A
G>A
T>C
T>C
C>T
C>A
G>T
C>A
C>T
C>G
G>T
G>A
C>T
C>T
G>A
C>T
G>A
G>A
A>G
T>C
C>G
G>A
T>G
C>G
C>G
G>C
G>C
AGA>-
G>A
T>A
G>C
->T
C>G
C>T
T>A
T>C
A>C
A>T
C>G
T>A
T>A
C>T
C>T
C>G
C>A
T>G
TG>-
C>T
C>A
T>C
G>A
C>T
G>A
C>A
T>G
C>T
G>C
C>A
A>C
C>T
G>C
C>T
G>T
G>T
A>C
T>C
T>G
CT>-
C>G
A>G
T>G
C>G
C>G
C>T
T>C
C>T
T>C
G>A
C>T
G>A
C>T
C>T
G>A
C>A
A>C
T>A
C>A
G>A
C>T
G>A
C>T
T>G
G>T
C>T
T>A
C>T
G>C
G>A
C>A
G>A
T>G
G>A
C>G
C>T
C>G
T>-
G>A
T>C
A>T
G>T
C>G
G>T
C>T
G>C
T>C
G>T
G>A
G>C
T>C
G>C
C>A
T>G
TC>-
C>T
T>G
C>A
C>A
T>A
C>A
T>-
G>A
C>T
C>T
G>A
C>G
A>-
A>G
C>A
G>A
A>T
C>G
A>G
T>A
G>C
C>T
C>G
C>G
C>T
C>T
G>A
C>A
C>T
G>A
T>A
C>A
G>C
A>G
A>G
A>C
C>T
C>T
C>A
T>C
G>A
A>T
G>A
C>G
C>A
C>G
T>G
C>G
C>G
TTTCTCCTTTCCTAGTTTTGCC>-
T>A
G>T
C>G
T>C
C>T
C>T
G>A
G>A
A>G
C>A
G>T
C>G
T>C
G>C
C>T
G>T
T>C
G>A
G>C
T>G
C>G
T>A
C>A
G>T
AGA>-
G>C
G>C
C>T
C>G
A>T
G>A
C>T
T>A
C>G
G>A
C>T
C>T
G>C
G>A
A>C
G>A
G>C
G>A
G>A
A>-
T>-


G>A
C>T
C>T
A>G
C>T
C>T
G>C
T>-
A>G
T>-
A>G
T>-
C>G
G>A
G>C
G>A
G>A
G>A
C>T
C>A
G>C
A>G
A>G
G>A
G>C
A>G
C>T
G>T
C>T
G>A
G>A
A>G
C>T
G>C
C>T
G>A
->A
C>A
G>C
C>G
C>G
C>A
C>G
C>T
A>G
A>G
C>A
T>C
C>A
G>C
C>T
->G
A>G
G>T
A>T
C>G
T>C
G>C
C>T
G>C
G>C
C>T
->T
T>C
C>T
G>T
TG>-
G>A
C>T
G>C
C>G
G>A
C>G
->T
C>G
C>T
G>C
G>C
T>A
C>T
C>T
G>C
G>C
G>A
C>T
G>A
G>C
G>T
C>T
C>T
T>-
T>C
G>A
C>G
G>C
G>C
T>C
T>G
G>C
T>G
G>T
C>T
G>C
A>G
C>T
G>A
G>C
A>T
C>A
T>C
C>G
C>G
G>A
G>A
C>T
T>C
T>C
C>G
G>C
G>T
C>A
G>C
C>A
C>T
C>T
C>T
A>G
C>A
G>C
G>T
T>C
A>G
G>T
T>A
C>T
T>-
A>G
C>G
C>A
G>C
C>T
G>T
G>T
A>G
A>-
A>G
A>C
G>C
T>A
G>A
G>A
A>T
T>C
G>A
C>-
C>T
G>T
G>T
G>T
TCCA>TTC
C>T
G>T
G>A
A>T
G>C
G>T
C>G
C>-
G>A
A>T
A>T
T>G
G>C
G>C
A>T
G>T
A>G
C>T
A>C
A>G
GGGGGTACAGGCATTG>-
G>A
T>A
T>G
A>G
A>C
G>A
C>T
G>A
G>T
G>A
C>T
A>G
A>-
G>A
G>T
T>G
C>T
C>A
C>T
T>C
A>G
A>-
A>C
A>T
C>T
C>T
G>A
C>G
G>A
A>-
G>T
G>C
A>G
G>T
G>A
G>A
T>A
G>A
C>T
G>T
G>A
C>T
C>A
C>G
C>G
G>A
C>G
A>G
G>A
C>T
C>G
C>A
G>A
A>T
C>T
G>A
G>A
C>T
G>A
C>A
A>T
C>G
G>A
G>A

T>-
G>A
T>C
G>T
C>G
G>A
C>A
C>T
G>A
C>A
C>G
T>-
A>T
T>A
G>A
G>A
G>A
C>T
C>T
G>A
GTG>-
T>G
A>T
T>-
G>C
A>G
TTATGCACCCACCAGC>-
G>A
->T
T>-
C>T
->TTG
G>T
G>A
->A
G>A
C>T
C>T
->T
C>G
T>A
T>A
C>G
C>T
T>-
C>T
G>C
T>G
A>G
C>T
A>C
C>T
G>A
C>T
G>A
TG>-
C>T
C>T
G>-
C>G
A>C
T>-
C>G
G>A
A>C
A>-
A>C
C>T
G>A
G>C
A>C
G>A
A>-
C>A
G>A
G>T
->T
C>T
A>-
T>G
G>A
C>T
G>C
C>G
G>A
T>C
G>C
C>T
G>T
C>T
A>T
A>G
T>C
C>T
T>C
A>G
TTAG>-
T>A
C>T
C>G
G>C
C>T
C>T
G>T
A>C
G>C
C>G
C>T
T>-
A>C
C>T
G>A
A>G
T>G
G>T
G>T
G>A
C>T
C>T
G>A
A>T
T>C
T>C
C>G
T>C
T>A
T>C
C>-
C>G
A>C
C>A
A>C
C>A
G>T
G>A
A>T
C>A
C>T
T>A
C>T
C>-
C>T
A>T
A>G
G>C
G>A
T>A
G>A
->T
C>A
G>A
C>G
T>-
C>T
T>C
A>T
G>A
T>G
C>G
G>C
C>A
CAAATTAAACTTG>-
G>A
G>A
T>A
T>C
C>T
C>T
A>G
C>G
G>C
T>G
G>A
T>G
C>G
T>C
C>T
T>G
G>C
G>C
A>-
A>-
TGCAAATCAATAAATAGAT>-
T>C
A>C
G>C
T>A
C>T
C>G
G>T
G>C
C>T
G>C
A>-
C>T
C>T
G>A
T>A
G>A
A>-
A>C
A>G
G>A
A>G
A>C
A>T
G>T
C>G
C>G
G>A
G>A
C>T
G>T
G>T
G>C
G>A
G>C
G>A
G>A
G>C
G>A
G>A
G>C
C>T
G>A
G>C
C>A
CTCA>-
T>A
G>T
C>A
C>G
G>A
C>T
A>G
A>C


A>-
T>A
C>T
TTATCAATTGATAAGTC>-
C>T
A>-
G>T
A>G
T>C
G>A
G>A
G>A
A>G
A>T
->T
G>A
C>G
T>C
G>A
C>T
T>C
C>A
C>T
T>C
A>G
C>A
C>T
G>A
C>G
G>T
C>A
G>A
G>T
T>C
G>A
C>G
G>C
A>T
C>T
G>A
A>G
C>T
C>A
T>-
C>A
TGTTATGCCTGTTATATATAACACATGTGTTATGCCTGTTATATATAACACATG>TTATACA
->T
C>A
G>A
T>A
G>C
T>C
G>A
G>C
G>T
G>A
C>T
G>C
A>T
G>A
C>G
A>C
A>G
C>A
A>C
C>T
T>A
A>G
C>G
C>T
C>T
T>G
G>A
A>G
C>G
G>A
C>G
A>G
G>A
T>C
C>G
C>T
T>A
C>A
A>G
AG>-
T>A
T>A
C>T
C>A
C>A
T>C
G>A
C>G
C>T
A>G
G>C
C>G
G>C
A>-
G>T
G>C
C>T
T>-
C>G
T>C
T>C
C>G
T>G
G>T
C>G
T>G
G>A
A>C
C>A
G>T
C>A
T>C
->A
T>C
C>A
C>A
T>A
C>G
A>T
C>T
A>-
T>C
G>A
T>A
A>C
A>C
A>T
A>T
G>A
G>C
T>C
C>A
T>C
T>C
G>A
C>A
C>G
G>T
A>G
->AT
C>T
C>T
A>G
G>A
G>C
T>A
G>C
->T
G>A
A>G
G>A
G>C
A>C
G>A
C>G
AG>-
G>C
C>A
G>C
C>T
A>C
C>A
C>G
C>T
C>A
G>A
A>G
AG>-
C>T
G>C
C>T
A>T
A>-
A>C
T>A
G>T
->TA
A>G
G>C
T>C
G>-
C>T
G>-
C>A
C>T
A>C
GT>-
T>A
C>G
C>T
C>G
T>G
C>T
G>C
C>G
C>G
T>G
C>T
G>T
C>G
C>T
G>T
C>A
T>G
G>A
G>C
C>A
C>T
G>T
C>G
C>T
C>A
G>A
A>C
G>A
G>C
C>T
T>G
->A
A>C
A>T
G>A
C>T
C>G

G>C
C>G
C>T
T>C
A>T
A>-
T>C
G>A
G>T
G>A
C>T
C>T
C>T
T>G
C>T
T>-
G>A
A>C
G>A
A>G
G>T
A>T
C>A
C>A
G>T
C>A
A>T
C>T
A>T
A>G
G>A
C>G
T>-
T>C
T>-
C>G
G>T
->A
T>G
G>T
G>C
C>T
G>T
G>T
G>T
T>C
G>A
G>C
A>G
->T
AG>-
G>A
C>A
A>T
A>-
C>T
G>C
G>T
T>G
T>G
G>A
G>A
G>C
C>G
G>C
A>G
G>A
A>T
C>T
C>T
C>T
A>G
C>T
G>A
G>T
T>C
C>T
T>A
C>G
A>T
C>A
C>T
A>G
G>C
C>G
C>A
G>C
T>-
T>C
G>C
TT>-
C>G
T>G
C>T
C>T
A>G
G>A
C>G
C>A
G>T
C>T
C>A
C>T
C>G
A>T
A>G
C>G
G>T
T>-
C>G
A>T
C>T
A>G
C>T
T>A
C>T
T>C
->AATA
A>T
C>A
T>C
G>C
G>A
C>G
C>T
A>T
A>-
G>C
C>G
T>-
A>G
C>A
T>A
C>T
A>G
A>C
C>G
A>G
T>C
G>A
T>G
G>A
G>A
C>A
T>G
C>A
C>A
C>T
G>T
T>G
G>C
C>G
C>T
T>C
C>T
T>C
C>A
G>A
C>T
G>C
G>A
G>A
C>G
A>G
C>G
G>A
A>T
G>C
C>T
C>T
->A
GAC>-
C>-
G>C
G>A
G>A
C>T
C>A
G>T
C>A
T>A
C>T
T>C
T>C
T>C
C>T
A>T
C>T
T>C
C>G
C>T
C>G
C>T
A>G
C>T
A>T
T>C
CCCCAACAACTCAAATATC>-
C>G
A>-
->A
C>T
C>T
G>A
G>C
C>G
->T
T>C
CCATGAAA>-
T>A
G>C
T>C
G>C
C>A
T>C
G>C
T>A
C>T
C>A
C>G
G>A
C>G
G>C
G>T
G>A
T>C
A>C
A>G
C>A
C>A
C>T
C>A
A>-
C>T
A>-
T>A
G>C
C>A
C>T
G>C
A>T
C>G


C>G
C>G
G>A
G>C
C>G
C>T
A>G
G>T
G>C
A>G
C>A
C>A
A>-
C>A
G>C
G>T
G>T
G>A
A>C
C>T
G>T
C>T
A>G
TC>-
C>G
->A
C>T
C>G
C>-
G>C
TG>-
G>C
G>A
C>A
G>C
C>G
G>T
G>T
G>C
C>T
A>C
T>C
T>C
C>T
T>G
C>G
T>C
G>C
A>G
G>T
A>T
G>A
A>C
G>A
C>G
AA>-
G>A
G>A
C>A
C>A
G>C
G>A
C>T
A>G
C>A
C>A
G>A
A>-
A>T
C>T
G>C
G>A
G>A
C>A
G>A
C>A
G>C
C>G
G>C
T>G
G>A
A>G
C>T
C>T
G>C
G>A
C>G
T>C
T>C
C>T
G>C
A>T
C>A
C>G
G>T
G>C
A>C
C>T
T>A
C>A
A>-
A>C
A>-
->T
G>C
G>A
G>C
T>A
C>T
C>A
C>T
C>G
G>C
C>G
G>C
C>G
C>G
G>A
T>A
A>C
A>T
A>G
->A
C>A
C>G
G>C
C>T
T>-
T>C
G>C
G>C
G>C
G>C
C>G
G>C
->G
A>G
G>C
C>T
G>A
C>T
G>T
G>A
A>T
C>A
T>A
C>A
G>A
G>A
G>C
G>C
G>A
G>T
G>C
C>T
G>C
->A
C>G
G>A
G>C
C>G
A>-
G>A
G>A
A>-
G>A
G>C
G>C
T>-
G>C
G>A
T>C
G>T
G>C
G>A
A>G
C>A
T>A
G>A
G>A
G>A
T>G
A>C
C>T
->A
A>C
G>A
C>A
A>-
C>A
A>G
A>T
G>A
G>T
C>T
G>C
C>G
G>A
C>A
C>G
T>C
G>A
A>G
C>T
G>A
T>A
C>A
C>G
C>T
G>T
C>T
->T
C>A
T>A
G>A
C>T
G>A
A>C
T>C
T>A
G>C
->T
C>T
T>G
G>A
A>G
A>T
G>C
C>T
T>A
A>G
G>A
C>A
G>A
TGATTG>TAC
G>A
C>A
G>C
C>G
A>G
G>T
G>C
A>G
T>C
G>A
C>T
A>T
C>

G>A
A>C
C>T
C>G
C>G
G>C
G>A
G>C
C>T
G>C
G>A
G>C
C>G
G>C
G>C
A>C
C>T
C>T
C>G
G>A
G>A
C>G
T>A
C>G
G>C
G>A
G>C
A>C
ATTCCTCTGAACATTAGTGATCTCATCT>-
T>A
G>T
C>A
G>C
A>G
CTC>-
T>-
A>G
->AT
T>G
G>C
C>T
A>G
G>A
C>G
G>A
G>A
G>A
G>C
G>C
G>T
A>G
G>C
G>A
C>G
A>-
T>A
C>G
T>A
G>A
T>C
C>A
ATT>-
G>A
T>C
C>G
T>C
AG>-
A>T
C>T
G>A
A>-
T>G
G>C
G>T
G>A
C>T
T>C
G>A
G>C
A>T
G>A
C>T
C>T
A>G
A>G
C>T
C>G
C>A
G>T
G>A
G>T
C>T
C>G
C>T
T>A
C>G
C>G
C>T
C>T
G>A
C>T
->T
C>G
C>T
C>T
G>C
C>G
A>C
G>T
G>C
A>G
G>A
G>C
C>T
T>C
C>G
GCAGGCCCCACACT>-
C>A
G>A
G>A
C>A
C>T
G>T
G>A
G>A
C>T
C>T
C>T
G>A
T>C
C>G
C>T
G>A
C>T
AAAAATTAGCCAGGCGTGGTGGCACGCACCTGT>AAATTAGCTGGGCATGGTGGCGCATGCCTGT
C>T
C>T
G>A
C>A
T>A
T>C
C>T
C>G
A>C
G>A
C>G
G>A
C>G
G>A
G>T
T>G
A>T
G>A
T>A
C>G
C>T
T>G
T>G
T>C
C>A
T>C
A>T
A>C
G>C
T>C
TG>-
G>C
C>T
C>A
G>A
G>A
G>A
C>G
C>A
C>A
A>G
C>A
A>C
C>T
C>T
C>G
G>A
G>A
G>A
G>A
CCCAATA>-
A>G
C>A
G>T
C>T
C>T
C>G
C>T
T>G
A>G
->A
T>A
C>G
C>G
G>A
G>T
C>G
C>T
G>T
T>G
G>A
T>C
G>T
T>-
T>G
A>-
A>C
T>C
C>T
G>A
T>-
C>T
C>T
G>C
C>T
G>A
G

C>G
G>C
T>C
T>A
T>G
T>A
G>T
A>G
G>A
T>-
C>A
G>C
T>C
TTG>-
A>C
G>A
T>-
G>A
A>C
A>C
G>T
C>T
C>G
T>C
A>T
T>-
A>C
T>G
C>A
T>A
G>A
C>G
C>A
C>T
T>A
G>A
G>C
C>-
G>C
G>A
G>A
G>C
T>A
G>-
G>A
TG>-
A>T
G>T
A>-
C>A
T>A
A>T
C>A
T>C
C>T
A>G
C>A
G>C
G>C
T>G
G>T
C>A
G>A
C>A
A>T
A>T
C>T
C>T
A>T
T>A
G>C
G>A
A>G
C>A
G>T
G>C
T>C
T>G
G>C
A>T
T>C
G>A
C>T
T>C
A>-
C>G
G>C
C>T
G>-
A>C
C>G
A>C
C>A
G>C
A>G
G>A
C>A
->AC
C>G
A>G
C>G
->T
T>-
G>C
C>T
C>G
C>A
G>A
T>-
C>T
C>A
AT>-
A>G
C>G
C>G
A>C
G>A
T>C
A>-
A>C
C>G
G>T
T>C
ACCATATTCTCATTCCTCTGCC>-
C>T
G>A
A>G
C>T
G>A
C>T
A>T
A>T
C>T
C>T
G>C
G>C
C>G
C>A
G>T
C>A
C>G
A>G
G>A
G>A
C>T
G>C
A>G
->A
C>T
G>A
C>G
C>G
C>T
->T
G>C
C>T
A>G
C>T
C>T
T>C
C>A
T>C
C>T
A>C
G>T
T>A
C>T
A>C
G>C
G>A
C>T
A>G
A>G
A>T
C>G
C>A
G>A
G>T
C>T
G>T
G>T
C>T
G>A
G>T
G>T
C>A
G>A
C>T
G>A
A>G
A>G
T>A
A>T
G>A
T>C
T>A
C>T
C>A
A>G
T>G
G>A
G>A
A>T
G>T
G>A
A>G
G>A
A>T
C>A
A>G
G>A
CTTCCTCATCCAGGAGATGGACATAAATAGTAC>-
C>T
C>A
C>G
C>G
A>G
TA>-
A>C
A>-
A>G
C>T
G>A
->A
T>A
T>C
T>C
T>A
A>C
TTAAGCAGCCACCAGTTATTTC>-


T>C
G>C
G>A
C>A
T>-
T>A
G>T
G>A
G>T
T>C
G>T
G>A
G>C
A>G
C>A
->A
T>A
A>T
T>G
G>T
C>T
T>C
C>T
TT>-
A>G
C>G
A>G
G>T
G>T
G>A
A>C
T>C
A>G
G>T
C>T
G>C
C>A
T>C
T>A
T>C
A>G
C>T
G>A
G>T
A>C
A>C
T>G
A>T
G>C
C>T
G>A
C>A
T>C
C>-
T>C
A>C
C>A
A>T
T>A
T>G
C>T
C>G
A>G
G>A
A>C
T>C
G>A
A>G
T>A
C>T
G>C
T>G
G>A
C>A
C>T
T>C
G>A
G>C
A>G
G>T
C>T
G>T
G>A
A>C
A>-
T>A
T>C
G>T
G>C
G>A
G>T
A>T
C>-
A>G
G>T
C>T
C>T
A>G
T>C
A>T
A>T
A>G
G>C
T>A
A>G
G>T
C>A
C>T
T>A
T>A
C>A
->T
A>C
G>C
A>T
A>G
T>A
G>C
C>G
C>T
T>C
A>T
G>A
C>T
A>G
C>T
C>A
C>A
G>T
T>-
C>G
C>T
G>A
G>A
AAGAAACACGCGAATGG>-
C>G
A>T
T>-
G>T
G>T
A>C
A>C
G>T
C>T
A>T
A>-
C>G
A>-
T>G
T>A
C>T
A>G
G>A
G>C
C>A
C>G
A>C
G>A
GCAAACTATCACAAGGACAAAAAACCACACACC>-
G>A
G>A
A>G
T>C
->CTGCCCTCCTAT
C>G
G>C
T>A
T>-
C>T
C>T
G>T
T>C
A>T
T>C
C>T
T>-
->T
C>A
G>T
A>C
C>G
A>T
G>A
G>T
G>T
G>A
C>G
C>T
A>C
C>A
C>A
A>T
C>T
G>T
A>T
C>T
G>C
C>T
C>T
A>T
A>T
G>C
T>-
A>G
A>G
C>T
G>A
G>A
G>C
C>T
G>A
C>G
T>A
G>A
C>T
TA>-
C>T
G>A
G>A
A>G
C>T
T>G
G>A
T>-
T>-
C>A
A>G
G>T
T>A
T>C
C>G
T>-
C>G
A>C
A>-

C>T
G>A
T>C
A>T
T>A
G>T
G>C
G>A
G>A
C>T
T>C
C>T
T>C
A>G
T>C
G>T
A>-
T>C
C>T
G>T
->T
C>T
C>A
G>A
C>T
C>G
C>T
C>G
T>G
A>G
C>G
G>A
G>T
T>-
T>A
C>G
G>C
G>T
G>A
C>T
C>G
G>A
C>A
->GT
G>T
T>G
C>G
A>G
C>A
C>T
G>A
C>G
G>T
G>A
TCTACTTGTCTGTCA>-
G>T
A>-
G>C
T>G
C>G
G>T
C>T
C>G
A>G
C>G
A>T
A>-
->T
G>C
T>C
A>G
C>G
A>-
C>A
T>C
G>A
A>G
G>A
G>C
G>T
T>C
A>G
C>G
G>C
G>A
C>A
T>C
C>A
C>G
T>-
T>C
G>A
C>G
A>T
A>T
A>T
C>A
T>G
C>T
C>A
G>A
C>T
T>-
G>A
A>T
G>A
T>-
C>G
G>A
A>G
G>A
G>A
C>A
T>C
G>A
C>T
C>A
C>T
A>C
G>A
A>G
A>G
C>T
C>G
G>T
A>-
->A
T>C
C>T
A>T
C>G
C>G
T>A
A>C
C>T
T>A
G>C
T>C
G>A
G>A
C>A
G>C
G>C
G>A
A>C
G>A
G>A
T>G
T>A
A>G
C>G
G>C
C>T
C>T
C>A
C>A
G>A
T>-
C>G
T>C
G>C
G>A
G>A
C>T
CCT>-
G>A
A>-
G>A
C>A
C>T
C>G
C>T
C>A
C>A
T>C
G>A
C>A
A>G
C>A
C>G
T>C
C>G
A>T
G>A
A>T
C>T
C>T
A>T
T>A
G>A
A>G
C>T
T>A
G>C
G>A
A>G
T>A
A>T
C>A
T>-
A>G
C>G
A>T
CCCTTCTCTTTGCATAAAATGAAGTAGA>-
C>G
A>T
G>A
G>A
G>A
G>A
C>T
->T
C>T
C>G
G>C
G>T
G>A
G>A
G>T
A>C
T>C
G>C
C>T
T>A
G>T
T>C
A>G
G>A
A>G
->T
T>-
T>C
G>T
C>T
A>-
A>G
C>T
A>G
C>G


G>C
A>G
G>A
G>T
A>G
C>T
G>C
C>A
G>C
C>T
G>A
A>G
A>G
A>T
T>G
C>T
A>C
G>C
G>C
G>C
G>A
G>T
T>A
C>G
C>G
G>A
C>A
A>G
G>C
A>G
C>A
C>T
C>A
T>A
C>G
G>C
GA>-
G>-
G>A
A>T
A>T
C>T
G>A
C>T
G>A
G>A
C>T
A>C
C>G
G>C
G>C
T>G
T>A
C>T
TT>-
T>-
C>T
C>G
G>A
T>G
T>G
->TTATGTTGATGTAGAAC
G>A
C>G
C>T
T>C
G>A
A>-
C>A
T>-
G>A
G>A
A>T
C>A
G>A
G>C
A>G
T>G
C>T
T>G
C>G
C>G
C>G
C>T
C>G
C>G
C>G
C>T
C>T
C>T
C>G
C>A
C>T
G>A
C>T
->A
G>A
A>G
C>A
A>T
C>T
G>T
A>T
G>C
T>C
C>T
A>T
A>C
G>C
G>C
G>T
A>G
A>-
G>A
C>G
G>A
G>A
C>G
G>A
G>C
A>G
C>T
G>A
G>A
C>G
T>C
T>C
A>G
T>C
G>A
C>T
G>T
T>A
G>T
G>C
A>G
C>T
A>T
C>G
C>A
A>C
T>A
G>A
C>T
CTT>-
C>G
C>T
G>A
G>A
A>G
G>A
G>A
T>-
C>T
A>G
A>G
C>A
T>C
C>T
C>G
C>T
C>G
G>C
C>T
C>A
C>A
T>G
C>T
C>A
G>A
G>A
G>A
A>G
T>A
C>T
G>C
G>T
C>T
G>A
C>T
T>G
T>A
G>C
T>A
A>G
A>G
G>C
T>G
C>T
A>G
G>A
T>G
G>T
C>A
T>C
G>A
G>C
C>A
TCTGAGAATA>-
G>T
T>C
A>C
C>T
G>T
G>C
C>A
AT>-
G>C
G>C
G>A
A>T
G>-
G>A
C>A
C>T
G>T
G>A
T>C
T>A
C>A
G>A
T>A
C>T
C>G
G>C
A>T
A>-
C>T
T>G
T>-
G>A
C>T
T>G
C>T
C>T
G>A
G>A
G>A
GT>-
G>T
T>A
A>C
-

G>A
A>G
C>T
C>T
A>C
G>C
G>C
A>T
A>G
A>G
G>C
T>G
T>C
C>G
A>G
G>A
G>T
T>C
A>C
C>T
A>G
A>T
G>T
G>A
T>C
C>T
A>T
A>T
G>A
C>A
G>C
G>A
C>A
G>C
T>G
->T
T>C
G>C
G>C
C>G
A>G
A>-
G>T
A>-
G>A
T>C
C>A
C>T
G>C
G>A
C>T
G>A
G>A
C>T
G>C
C>T
G>A
C>T
C>A
G>A
C>T
T>A
T>A
G>T
A>T
A>G
A>G
T>C
T>A
T>-
->T
G>A
G>T
G>A
C>A
A>G
G>A
C>G
C>T
G>A
G>A
C>G
G>-
C>T
C>T
G>A
G>T
A>G
TA>-
G>C
C>A
A>-
T>G
T>C
C>T
C>A
C>T
T>C
G>T
A>G
T>G
G>T
C>T
G>C
A>C
G>T
A>C
T>C
A>G
T>C
C>T
A>-
A>G
G>A
T>G
T>C
C>T
G>C
C>T
C>A
A>G
G>A
C>T
G>C
C>T
C>T
T>C
G>A
A>T
G>T
T>C
A>C
G>C
C>A
G>C
G>A
C>T
C>T
A>G
AGACTTAA>-
AC>-
C>G
C>T
T>G
A>T
C>A
T>C
C>T
AAATTAA>-
G>T
C>G
T>C
C>T
G>C
C>T
T>C
A>G
T>C
A>-
G>C
G>C
G>A
G>C
G>C
T>C
A>-
G>C
A>T
C>T
C>A
G>A
C>G
C>G
A>C
C>T
T>A
G>C
G>T
C>G
G>A
G>C
A>T
G>C
T>G
T>C
C>T
G>T
T>C
C>A
A>T
C>A
G>T
->A
T>C
G>A
G>A
G>T
G>C
C>T
C>A
C>G
C>A
T>A
C>G
C>A
G>A
T>-
C>G
C>G
A>G
C>T
G>C
C>T
G>A
T>C
T>C
G>T
A>-
C>T
A>C
G>A
G>A
->G
G>T
G>T
A>G
A>G
T>C
G>A
G>C
G>A
G>C
G>A
T>C
A>T
A>C
T>A
T>-
C>T
A>T
G>A
A>G
G>C
C>A
A>G
AC>-


C>T
C>T
G>C
A>G
A>C
C>T
C>A
G>A
C>T
G>A
C>T
T>-
A>C
A>G
T>C
C>T
G>A
T>G
C>A
A>G
C>T
C>T
T>C
T>A
A>G
A>-
C>G
C>G
A>C
T>C
A>-
C>T
G>A
A>-
C>G
C>T
G>T
C>G
T>A
A>C
T>-
G>A
T>C
G>C
G>A
C>A
C>T
C>G
A>G
->A
C>A
C>A
G>A
T>G
G>C
C>A
A>G
T>G
A>T
ACTA>-
G>A
G>T
G>C
G>C
C>T
C>T
G>C
T>A
C>A
G>C
G>T
T>A
G>C
C>G
G>T
G>C
G>T
C>T
A>G
C>T
C>T
C>T
G>T
G>C
G>T
G>C
G>T
C>G
C>G
G>A
T>C
G>A
A>C
C>-
->T
G>C
G>T
C>G
C>A
C>G
C>A
G>A
->T
A>G
C>A
A>-
C>T
C>A
G>C
C>A
G>C
C>G
C>G
A>G
G>C
C>G
G>T
T>A
A>-
->A
A>C
C>G
A>C
C>T
G>A
C>T
G>A
C>T
G>T
G>T
A>T
G>T
A>G
C>T
T>C
G>A
C>T
C>T
C>T
G>T
C>T
C>T
G>A
C>A
G>A
G>A
C>G
C>A
G>A
C>T
G>C
G>T
G>A
G>T
A>T
C>T
C>G
T>G
T>A
T>G
T>-
G>A
C>A
AAG>-
G>A
G>C
A>T
->T
C>A
T>G
T>A
C>G
C>A
A>T
A>T
A>T
G>A
C>G
A>T
C>T
G>A
G>A
C>A
A>C
A>T
C>A
G>A
C>G
A>G
G>A
A>C
G>A
C>T
T>G
C>G
A>G
G>C
A>-
T>-
G>C
T>A
A>G
C>G
A>T
C>T
G>A
G>A
A>G
T>C
G>A
G>A
G>T
G>T
G>A
G>A
TCTTAGCTGTTCAGCACTCAGGCTA>-
T>C
T>C
T>C
A>T
T>A
G>C
G>C
C>T
G>C
G>T
G>C
C>G
G>C
C>T
C>G
G>A
G>C
T>-
A>G
A>G
C>G
T>C
C>A
C>T
G>C
G>C
A>G

G>C
G>A
C>A
C>T
G>A
T>A
C>A
CT>-
C>T
T>A
T>-
T>C
C>A
G>C
A>-
G>C
G>A
AG>-
G>A
A>G
C>G
A>G
G>T
A>T
C>G
C>T
G>-
C>G
C>T
T>C
C>G
G>A
G>A
T>C
C>A
C>-
G>A
A>G
G>A
G>A
A>G
A>G
T>G
G>A
C>A
C>T
C>T
T>A
A>G
C>T
C>T
G>A
A>T
A>G
G>A
T>C
G>C
C>G
G>A
C>A
C>T
C>T
->T
G>A
G>A
C>T
T>A
T>C
A>T
C>A
G>C
T>-
T>C
T>C
C>G
C>G
A>T
C>G
C>G
C>G
C>G
A>G
T>G
->T
C>T
C>G
G>A
G>A
G>A
G>T
A>T
C>G
T>A
T>A
C>-
->T
T>C
T>A
G>T
G>A
G>A
C>T
C>T
A>G
T>G
G>T
G>C
->T
C>T
A>T
C>T
A>-
G>A
G>T
T>A
A>C
C>T
C>T
C>T
A>G
T>G
C>G
T>A
C>T
G>T
G>A
A>G
C>G
A>T
T>A
A>T
A>G
A>-
A>-
T>A
G>T
C>T
G>T
C>G
A>G
G>A
C>T
A>-
A>C
G>T
G>C
C>T
T>C
ACTGCAAATGATATTATTTTGTTTTTCT>-
C>T
C>T
G>T
C>T
C>A
C>A
C>T
T>G
C>T
G>A
C>T
G>C
G>C
C>G
G>C
G>A
C>T
A>G
AC>-
C>A
C>T
C>T
C>A
C>T
C>G
C>G
G>A
C>T
A>G
T>C
T>C
C>G
G>A
G>T
T>C
G>C
T>C
C>A
C>G
->T
T>-
T>A
A>C
G>A
A>T
C>T
G>T
G>C
G>A
C>T
CTTTATTTTA>-
A>T
C>T
C>T
T>A
T>G
C>G
C>A
G>A
G>A
CTT>-
C>A
T>A
C>T
A>G
C>T
G>A
C>T
C>A
C>G
C>T
G>A
G>-
C>G
G>C
ATGTTCTTAG>-
C>A
C>T
T>C
C>T
A>C
C>T
G>T
C>A
T>C
G>T
C>T
T>C
A>

G>A
C>G
G>A
A>C
->CA
G>A
T>-
C>T
C>-
->A
AGAT>-
->A
G>C
CATCCATGTCCTTGCAAAGGA>-
C>T
TTGG>-
A>C
C>G
G>A
A>G
C>T
G>A
G>C
G>T
->A
G>A
C>T
T>C
G>C
C>T
T>C
G>A
G>T
A>G
A>G
G>A
G>C
->C
C>G
G>C
T>-
T>A
G>A
G>A
G>A
A>C
G>A
C>T
G>T
T>A
T>A
C>T
T>C
T>G
G>C
C>T
G>A
G>A
A>G
G>C
C>T
G>A
A>G
C>A
C>G
G>T
G>C
G>C
C>A
G>A
C>G
C>T
G>A
G>A
G>C
C>T
C>A
G>A
C>T
C>G
G>C
A>G
G>A
G>A
G>T
G>T
A>G
G>T
->C
G>C
G>A
C>T
C>A
G>C
T>-
G>C
G>T
C>G
C>T
C>T
G>C
G>C
G>C
G>A
G>A
C>T
C>T
T>A
T>G
C>T
G>T
A>C
G>C
C>T
G>A
C>T
G>T
C>G
A>G
C>A
G>T
C>T
C>T
C>T
G>A
T>G
G>A
G>A
T>G
C>A
T>G
G>A
C>T
C>T
C>T
G>A
A>G
G>C
T>A
C>A
G>T
G>A
A>G
T>C
G>C
G>C
A>C
A>C
A>C
T>A
G>A
A>G
G>C
G>C
T>A
C>T
G>C
T>A
G>A
G>C
C>T
C>T
G>A
C>T
G>A
->A
G>T
G>C
G>A
T>C
C>A
G>A
A>T
G>A
G>A
C>G
T>C
C>T
TAGTCCCAAATCT>-
G>C
G>C
G>T
A>T
G>C
G>A
C>G
G>T
A>G
A>T
C>T
G>A
G>C
G>A
G>C
T>C
G>A
C>T
G>A
T>G
C>T
C>A
A>T
A>C
A>G
G>T
G>T
A>G
G>A
C>T
T>G
A>T
C>G
G>C
T>-
C>T
C>G
T>-
T>G
T>A
C>T
T>C
C>A
ATT>-
C>T
T>A
G>A
T>G
G>T
G>C
T>G
G>T
C>T
C>T
G>A
A>C
C>T
C>T
C>T
G>A
T>-

A>T
C>A
G>A
A>G
C>G
G>C
T>A
->T
T>A
C>T
A>G
T>C
C>T
T>A
G>T
A>G
T>A
A>G
G>C
G>T
A>T
A>G
C>A
G>A
G>T
G>A
C>G
C>G
C>T
C>G
T>A
C>G
T>C
G>A
G>C
G>A
TCCCACATGTTTAGCAT>-
T>G
G>A
T>-
C>T
C>G
A>G
T>C
G>C
T>A
C>T
T>C
G>A
C>T
T>G
G>T
C>T
A>T
C>-
C>T
C>T
->TA
->T
C>T
G>A
C>G
C>T
A>T
G>A
G>T
C>T
C>T
C>G
C>T
G>T
C>T
C>G
C>T
C>T
G>A
G>C
C>T
A>G
G>A
TCCCTCCCAT>-
T>-
->T
G>A
G>A
G>A
AAAC>-
G>T
G>C
A>G
T>C
G>A
T>C
C>G
C>A
C>A
C>T
T>A
C>G
C>T
A>-
G>C
A>G
A>-
A>-
G>T
G>C
G>T
G>A
C>T
C>G
C>T
C>G
C>G
C>T
A>-
T>G
C>A
G>A
A>G
G>T
T>C
G>A
T>C
G>A
T>A
G>C
G>T
C>T
G>A
G>A
A>G
A>G
G>A
T>C
C>G
G>C
C>T
A>T
G>C
C>T
C>T
G>A
C>A
C>G
G>A
C>T
G>C
C>G
G>T
G>C
G>C
G>T
C>T
G>A
G>A
->A
T>-
G>T
C>T
C>T
C>T
T>A
G>A
G>T
G>C
G>A
T>C
C>T
A>G
G>A
T>C
C>A
G>A
C>A
C>T
C>T
G>T
G>C
C>G
G>C
A>G
C>T
T>C
A>C
A>C
G>T
G>A
C>A
G>A
G>T
A>G
A>T
G>T
G>T
G>A
G>T
C>G
T>A
A>T
C>G
C>G
G>C
A>G
G>A
T>C
G>C
G>A
A>T
A>G
C>G
G>T
C>T
G>A
G>A
A>T
T>C
G>C
C>A
C>T
G>C
G>A
G>T
A>T
C>T
A>G
C>G
G>A
A>T
C>G
C>T
C>T
G>T
C>T
G>T
G>C
G>A
C>T
C>A
C>T
G>A
T>G
G>A

A>T
C>T
G>C
C>T
G>A
C>T
C>T
T>G
->T
C>G
A>C
G>A
T>-
G>A
G>C
C>T
T>-
A>-
C>T
C>T
G>C
T>G
A>G
T>C
G>C
G>C
G>C
G>A
C>T
G>C
A>G
G>A
G>A
C>T
G>C
G>A
T>C
G>A
G>C
C>G
G>T
G>A
G>A
T>C
A>C
A>G
G>A
C>-
G>A
C>T
C>G
C>G
G>T
T>C
A>T
A>G
G>C
C>T
C>G
C>T
A>G
C>A
G>A
C>A
G>A
C>G
A>G
G>A
C>T
G>C
G>A
G>C
C>T
C>T
G>C
A>T
G>A
C>T
G>A
->T
G>T
G>A
G>T
G>T
G>A
G>C
G>C
G>C
G>C
G>T
G>C
C>T
C>G
G>C
G>C
A>G
C>T
G>C
G>T
G>T
G>A
G>T
T>C
C>T
T>A
G>C
C>G
G>A
G>C
G>-
->GT
C>T
G>A
G>C
C>A
C>G
C>T
C>T
C>G
G>A
G>A
C>T
C>T
C>T
G>T
A>T
C>G
C>T
G>T
->C
G>A
C>G
G>A
C>T
G>A
C>G
G>C
C>G
G>A
G>C
C>T
->C
C>A
C>T
A>-
C>T
G>T
T>C
A>G
C>A
C>T
A>C
C>A
C>G
G>C
G>C
G>C
C>T
C>G
A>G
C>G
G>T
C>T
G>-
A>G
C>T
T>A
G>A
G>A
C>G
G>C
C>T
T>A
G>A
A>G
C>G
T>G
C>T
->GGGTGCCCTGCGGCCCGGCTCTCCTCGGTTCCCCTGATCCGTCCAGAGAACAAACGCCAG
C>T
T>C
C>T
C>G
C>T
G>C
G>A
G>A
T>C
T>C
G>A
C>T
C>G
G>C
AGTGAGA>-
A>-
C>T
A>-
T>G
C>-
C>G
C>T
T>C
C>G
A>-
C>T
C>G
C>G
AAG>-
A>G
G>T
C>T
G>A
G>C
C>T
C>A
C>G
A>T
C>T
C>G
C>G
G>A
T>A
G>C
C>T
->A
T>C
C>G
A>-
T>A
C>G
G>A
CCACA>-


A>T
G>T
C>T
G>A
C>A
T>C
C>G
T>A
T>C
G>A
TAAGT>-
C>T
T>-
C>T
TTG>-
T>A
A>G
C>T
->ATAC
T>-
G>A
C>T
T>C
C>T
C>G
C>G
A>T
A>T
G>A
T>C
C>G
T>G
C>G
C>G
A>C
G>C
C>T
T>G
G>T
->A
A>C
G>A
T>G
->T
C>G
C>T
G>A
G>C
T>-
G>C
C>T
C>T
A>G
CTACCCCTGGTCCCCAGGCAGCCA>-
C>T
T>-
A>G
T>A
G>C
C>T
T>-
T>C
A>C
A>G
G>A
C>T
G>A
C>G
A>T
C>G
C>G
C>G
->A
T>A
T>C
T>C
T>-
G>C
C>G
G>C
C>A
T>A
C>G
G>A
T>C
C>A
G>A
A>-
A>C
C>T
C>T
G>A
T>G
T>G
T>C
T>C
G>A
G>T
C>G
C>G
T>C
G>A
G>C
T>A
C>T
G>A
G>A
C>G
C>T
C>A
G>T
G>A
C>T
G>A
A>T
G>C
T>-
C>A
C>T
C>G
G>A
C>T
->A
G>T
G>T
G>C
C>G
G>A
A>G
C>A
C>T
G>T
C>G
A>G
T>C
G>A
C>T
->G
C>T
C>T
A>G
G>A
C>T
G>C
G>A
->T
T>-
->A
G>T
G>A
C>T
C>T
C>T
C>T
C>T
G>A
C>T
G>C
G>A
G>A
G>A
G>T
C>G
G>A
G>C
C>A
G>C
C>T
C>A
C>T
G>C
G>A
->A
C>T
C>A
C>T
A>G
G>C
T>A
G>A
C>T
C>A
T>A
G>A
T>C
T>C
T>G
A>T
G>A
G>-
A>T
C>G
C>G
G>A
G>A
C>T
T>-
G>A
->AT
G>C
C>A
C>T
A>G
G>C
C>G
G>A
G>A
A>C
C>G
G>T
G>A
G>C
C>T
C>G
C>G
G>A
C>A
C>A
G>A
G>A
A>G
->A
A>G
C>T
A>-
T>-
G>C
G>A
C>T
G>A
G>C
C>T
G>C
G>A
G>A
G>T
A>G
C>G
G>C
C>G
G>T
A>C

G>T
G>T
T>C
A>G
G>C
G>C
C>G
C>A
A>-
C>G
T>C
G>C
G>T
A>G
G>A
C>A
G>C
C>G
G>C
G>C
C>G
C>T
G>A
T>-
T>-
C>A
G>C
GGAAGGCCGAGGAGGGCAGATCACAAG>-
C>T
T>-
G>A
T>C
G>T
A>C
G>A
C>T
A>G
C>G
C>T
C>A
T>G
C>A
A>-
C>T
C>T
G>C
G>C
G>A
C>A
T>A
G>C
G>A
C>T
C>A
C>G
G>A
G>C
G>T
T>G
C>A
C>A
C>G
A>-
G>A
G>A
C>T
G>A
C>A
T>-
A>C
A>C
G>A
G>T
C>T
G>A
T>C
C>G
C>T
AACA>-
G>A
T>A
G>T
T>-
T>A
T>C
T>-
C>A
T>A
C>T
A>-
G>T
C>G
C>G
C>G
C>T
G>T
C>T
G>A
C>A
T>A
C>G
T>G
T>-
T>A
C>T
A>-
C>G
T>-
A>T
C>G
C>G
T>C
T>A
A>G
T>G
C>G
TATTACTAACAAATAGTCATGTAACTGA>-
C>A
C>T
C>G
G>A
C>T
C>A
T>C
C>G
G>A
G>A
G>A
A>G
A>T
G>T
T>A
T>-
C>G
T>C
T>G
T>A
C>G
C>T
T>A
T>G
C>G
G>A
T>-
C>T
C>T
G>-
G>A
C>T
G>C
A>-
T>A
G>A
C>T
C>G
G>A
G>C
A>G
A>G
T>G
C>G
A>-
A>-
G>C
C>G
C>T
A>T
T>C
->A
->T
A>-
C>T
TTTAG>-
G>A
C>T
G>T
T>C
T>-
T>C
C>G
AG>-
->T
T>-
G>T
C>T
T>A
G>A
C>G
G>A
A>G
C>T
G>A
C>T
T>C
G>A
TT>-
A>-
G>T
T>C
C>T
C>G
T>A
T>A
G>A
T>C
C>G
A>C
A>G
AT>-
G>A
A>-
G>C
T>C
T>C
T>G
T>-
C>T
T>A
C>A
G>A
T>C
G>A
G>A
G>A
C>T
C>G
CATGT>-
T>C
A>C
G>A
C>T
G>A
G>T
G

C>G
C>G
T>A
G>A
A>-
C>A
C>T
T>-
->T
C>T
A>-
G>T
G>A
T>-
C>T
T>C
A>G
C>G
A>T
G>T
T>C
C>G
G>A
C>T
C>T
C>A
T>-
C>G
T>G
C>G
C>A
C>G
C>G
T>C
C>T
C>T
A>G
C>A
G>A
G>A
G>A
T>C
C>T
C>A
G>C
A>-
C>T
->AT
A>T
C>T
C>G
C>T
A>T
A>T
C>G
T>A
C>G
G>T
A>C
C>G
A>G
C>G
G>C
G>A
C>T
A>G
G>A
T>-
C>T
C>T
G>-
C>T
G>C
T>C
T>-
G>A
C>T
C>A
C>G
C>G
A>T
A>G
T>-
A>G
G>T
G>A
G>-
T>G
C>A
G>C
G>A
C>G
C>T
T>C
AGGTGGGGTAAGATGGCGAA>-
A>T
C>A
C>T
G>T
T>C
C>A
G>A
C>A
T>C
C>T
C>G
G>T
T>A
C>G
C>G
A>-
C>A
C>T
G>T
T>G
A>G
T>C
C>A
C>A
G>A
G>A
G>C
C>G
T>C
G>T
G>C
C>T
C>T
G>C
C>T
A>T
T>C
C>T
G>T
C>T
G>A
C>T
C>A
G>A
C>G
G>A
C>T
G>C
G>A
T>C
C>T
G>A
G>A
C>G
G>C
C>G
G>A
T>A
G>A
G>A
C>T
C>T
G>C
A>T
C>A
C>G
C>G
G>A
A>T
G>A
T>C
G>A
G>A
T>-
C>T
C>T
A>G
C>G
T>A
C>T
A>T
C>T
C>T
C>G
T>A
G>C
C>A
T>C
C>A
G>A
A>C
G>C
T>G
C>T
C>T
C>T
G>A
C>T
C>G
T>C
G>C
G>A
C>T
C>-
G>A
G>T
T>G
T>G
G>A
G>A
T>G
G>A
C>T
G>A
C>T
C>G
T>G
T>C
C>T
C>T
C>T
G>A
C>G
A>G
G>A
C>A
C>T
G>A
C>G
G>A
C>T
G>T
A>G
G>A
C>G
G>A
->TC
G>A
C>G
C>G
T>G
C>T
T>A
G>A
G>T
T>C
C>G
A>G
G>A
A>T

KeyboardInterrupt: 

In [167]:
fields = []
for mutation_field in mutations:
    fields += [mutation_field]
fields

['MUTATION_ID',
 'MUTATION',
 'POSITION_GRCh37',
 'POSITION_GRCh38',
 'RELATIVE_POSITION',
 'OVERLAPPED_GENES',
 'CONSEQUENCE(S)',
 'PROJECT(S)']

In [72]:
m = list(mut_table.find(mutation_id=['MU64619292', 'MU82202760', 'MU38161712']))
for mut in m:
    print(mut)

OrderedDict([('id', 27323876), ('mutation_id', 'MU38161712'), ('chromosome', '2'), ('GRCh37_pos', 169942588), ('reference_allele', 'ATC'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323876)])
OrderedDict([('id', 27323401), ('mutation_id', 'MU64619292'), ('chromosome', '2'), ('GRCh37_pos', 16992349), ('reference_allele', 'AT'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323401)])
OrderedDict([('id', 4), ('mutation_id', 'MU82202760'), ('chromosome', '1'), ('GRCh37_pos', 100000110), ('reference_allele', 'G'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 4)])


In [73]:
m_df = pd.DataFrame([dict(mut) for mut in list(m)])
m_df

Unnamed: 0,GRCh37_pos,chromosome,filter,id,mutated_allele,mutation_id,occurrence_global,quality,reference_allele
0,169942588,2,.,27323876,A,MU38161712,27323876,.,ATC
1,16992349,2,.,27323401,A,MU64619292,27323401,.,AT
2,100000110,1,.,4,A,MU82202760,4,.,G


In [78]:
m_df.query('mutation_id == "MU82202760"')

Unnamed: 0,GRCh37_pos,chromosome,filter,id,mutated_allele,mutation_id,occurrence_global,quality,reference_allele
2,100000110,1,.,4,A,MU82202760,4,.,G


In [79]:
m_df[m_df.mutation_id == "MU82202760"]

Unnamed: 0,GRCh37_pos,chromosome,filter,id,mutated_allele,mutation_id,occurrence_global,quality,reference_allele
2,100000110,1,.,4,A,MU82202760,4,.,G


In [123]:
# Open the file for reading chunk by chunk
# And to experiment before the actual processing
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 chunksize=1000)
mutations = mutations_reader.get_chunk(10)

In [124]:
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,T>-,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


In [99]:
IDs = list(mutations['MUTATION_ID'])
IDs

['MU64868974',
 'MU67221130',
 'MU64619292',
 'MU66012277',
 'MU66538864',
 'MU63433685',
 'MU64418202',
 'MU66019785',
 'MU63584668',
 'MU66013896']

In [100]:
# Find the records in the database that correspond to those mutations
records = mut_table.find(mutation_id=IDs)
records_l = list(records)

for record in records_l:
    print(record)

OrderedDict([('id', 27323460), ('mutation_id', 'MU63433685'), ('chromosome', '2'), ('GRCh37_pos', 169926262), ('reference_allele', 'T'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323460)])
OrderedDict([('id', 27323513), ('mutation_id', 'MU63584668'), ('chromosome', '2'), ('GRCh37_pos', 16992866), ('reference_allele', 'G'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323513)])
OrderedDict([('id', 27323471), ('mutation_id', 'MU64418202'), ('chromosome', '2'), ('GRCh37_pos', 169926829), ('reference_allele', 'C'), ('mutated_allele', 'G'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323471)])
OrderedDict([('id', 27323401), ('mutation_id', 'MU64619292'), ('chromosome', '2'), ('GRCh37_pos', 16992349), ('reference_allele', 'AT'), ('mutated_allele', 'A'), ('quality', '.'), ('filter', '.'), ('occurrence_global', 27323401)])
OrderedDict([('id', 27323381), ('mutation_id', 'MU64868974'), ('chromosome', 

In [125]:
# Fetch the actual mutation strings
mut_strings = {}
for record in records_l:
    mut_string = record['reference_allele'] + '>' + record['mutated_allele']
    mut_strings[record['mutation_id']] = mut_string
mut_strings

{'MU63433685': 'T>A',
 'MU63584668': 'G>A',
 'MU64418202': 'C>G',
 'MU64619292': 'AT>A',
 'MU64868974': 'C>G',
 'MU66012277': 'C>T',
 'MU66013896': 'T>G',
 'MU66019785': 'T>C',
 'MU66538864': 'A>T',
 'MU67221130': 'C>T'}

In [102]:
# Before changing...
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,T>-,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


In [111]:
?mutations.where

In [129]:
# Change...
ordered_mut_strings = [mut_strings[mut_id] for mut_id in mutations['MUTATION_ID']]

mutations.MUTATION = ordered_mut_strings

In [130]:
# After change..
mutations

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,AT>A,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


Processing the whole file
-----------------------------

In [146]:
# Open the file for reading chunk by chunk
mutations_reader = pd.read_table('~/results/all.BRCA-EU.mutations-context.tsv', 
                                 header=1, 
                                 delim_whitespace=True,
                                 chunksize=100)
# The output file
outname = '~/results/corrected_mutations2.tsv'
# Process the mutations
for i, chunk in enumerate(mutations_reader):
    # Get the mutation ID's 
    # Reading one large chunk from the DB is more efficient
    # than reading several small chunks
    IDs = list(chunk['MUTATION_ID'])
    # Fetch the corresponding records from the DB
    records = list(mut_table.find(mutation_id=IDs))
    # Process the corrected mutation strings
    # Fetch the actual mutation strings
    mut_strings = {record['mutation_id'] : record['reference_allele'] + '>' + record['mutated_allele']
                       for record in records}
    # Order the mut_strings
    ordered_mut_strings = [mut_strings[mut_id] for mut_id in chunk['MUTATION_ID']]
    # Correct the mutation strings in the chunk
    chunk['MUTATION'] = ordered_mut_strings
    # Finally, output to the file
    write_header = True if i==0 else False
    chunk.to_csv(outname,
                 sep='\t', 
                 index=False, 
                 mode='a',
                 header=write_header)
    

Validating the corrected file
=================

The final step is to read the corrected file and validate it.

In [147]:
import json
from collections import defaultdict
import requests, sys


# < --- Request server and headers

grch38_server = "http://rest.ensembl.org"
grch37_server = "http://grch37.rest.ensembl.org" # We are using the GRCh37 server

ext = "/sequence/region/human"
headers={ "Content-Type" : "application/json", 
          "Accept" : "application/json"}


# < --- Validation

def validate_mutations(mutations, verbose=False, assembly='GRCh37'):
    "Return the mutation IDs that doesn't match the reference."
    invalid = []  # Here we will accumulate those that didn't match
    
    # < --- Assemble the request data
    request_data = defaultdict(list)
    for mutation in mutations.itertuples():
        # Add region string to the data dictionary
        # at the automatically created 'region' key
        request_data['regions'].append( to_region_str(mutation, assembly) )

    # < --- Make the request
    server = grch37_server if assembly=='GRCh37' else grch38_server
    
    # Request the sequences at the position
    r = requests.post(server+ext, 
                      headers=headers, 
                      data=json.dumps(request_data)) # Here goes the data

    # < --- Check the response
    if not r.ok:
        r.raise_for_status()

    # < --- Validate the mutations
    for r_item, mut in zip(r.json(), mutations.itertuples()):
        # The sequence from the file
        file_seq = mut.MUTATION.split('>')[0]
        # The sequence from the request
        req_seq = r_item['seq']
        
        # < --- Parse the mutation's start position
        if assembly == 'GRCh37':
            chrom_str, pos_str = mut.POSITION_GRCh37.split(':')
        else:
            chrom_str, pos_str = mut.POSITION_GRCh38.split(':')
        
        if file_seq != req_seq[1:-1]:
            invalid.append({'ID':mut.MUTATION_ID,
                            'expected':file_seq,
                            'got':req_seq,
                            'pos':int(pos_str)})
        if verbose:
            print('ID: ', mut.MUTATION_ID,
                  'Expected: ', file_seq,
                  'Got: ', req_seq)
        
    return invalid
# ---


In [148]:
mutations_reader = pd.read_table('~/results/corrected_mutations2.tsv',  
                                 delim_whitespace=True,
                                 iterator=True)
corrected = mutations_reader.get_chunk(50)
corrected

Unnamed: 0,MUTATION_ID,MUTATION,POSITION_GRCh37,POSITION_GRCh38,RELATIVE_POSITION,OVERLAPPED_GENES,CONSEQUENCE(S),PROJECT(S)
0,MU64868974,C>G,chr2:169922536,chr2:169066026,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
1,MU67221130,C>T,chr2:1699231,chr2:1695459,INTRONIC,ENSG00000130508(PXDN),"ENSG00000130508(PXDN):intron_variant,ENSG00000...",BRCA-EU
2,MU64619292,AT>A,chr2:16992349,chr2:16811082,INTERGENIC,BRCA-EU,,
3,MU66012277,C>T,chr2:169923756,chr2:169067246,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
4,MU66538864,A>T,chr2:169925859,chr2:169069349,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
5,MU63433685,T>A,chr2:169926262,chr2:169069752,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
6,MU64418202,C>G,chr2:169926829,chr2:169070319,NON-CODING-EXONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):5_prime_UTR_variant,ENS...",BRCA-EU
7,MU66019785,T>C,chr2:169927702,chr2:169071192,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU
8,MU63584668,G>A,chr2:16992866,chr2:16811599,INTERGENIC,BRCA-EU,,
9,MU66013896,T>G,chr2:169928818,chr2:169072308,INTRONIC,ENSG00000073737(DHRS9),"ENSG00000073737(DHRS9):intron_variant,ENSG0000...",BRCA-EU


In [149]:
mutations_reader = pd.read_table('~/results/corrected_mutations2.tsv',  
                                 delim_whitespace=True,
                                 chunksize=50)

In [150]:
from pprint import pprint

invalid_grch37 = []
invalid_grch38 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch37 = validate_mutations(mutation_chunk, assembly='GRCh37')
    new_invalid_grch38 = validate_mutations(mutation_chunk, assembly='GRCh38')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch37)
    print('\n')
    pprint(new_invalid_grch38)
    
    invalid_grch37 += new_invalid_grch37
    invalid_grch38 += new_invalid_grch38

pprint(invalid_grch37)
pprint(invalid_grch38)

KeyboardInterrupt: 

In [152]:
from pprint import pprint

invalid_grch37 = []

# Iterate through each chunk
for i, mutation_chunk in enumerate(mutations_reader):
    # Validate the chunk
    new_invalid_grch37 = validate_mutations(mutation_chunk, assembly='GRCh37')
    print('Chunk: ', i, '-'*20)
    pprint(new_invalid_grch37)
    invalid_grch37 += new_invalid_grch37

pprint(invalid_grch37)

Chunk:  0 --------------------
[]
Chunk:  1 --------------------
[]
Chunk:  2 --------------------
[]
Chunk:  3 --------------------
[]
Chunk:  4 --------------------
[]
Chunk:  5 --------------------
[]
Chunk:  6 --------------------
[]
Chunk:  7 --------------------
[]
Chunk:  8 --------------------
[]
Chunk:  9 --------------------
[]
Chunk:  10 --------------------
[]
Chunk:  11 --------------------
[]
Chunk:  12 --------------------
[]
Chunk:  13 --------------------
[]
Chunk:  14 --------------------
[]
Chunk:  15 --------------------
[]
Chunk:  16 --------------------
[]
Chunk:  17 --------------------
[]
Chunk:  18 --------------------
[]
Chunk:  19 --------------------
[]
Chunk:  20 --------------------
[]
Chunk:  21 --------------------
[]
Chunk:  22 --------------------
[]
Chunk:  23 --------------------
[]
Chunk:  24 --------------------
[]
Chunk:  25 --------------------
[]
Chunk:  26 --------------------
[]
Chunk:  27 --------------------
[]
Chunk:  28 -------------------

Chunk:  231 --------------------
[]
Chunk:  232 --------------------
[]
Chunk:  233 --------------------
[]
Chunk:  234 --------------------
[]
Chunk:  235 --------------------
[]
Chunk:  236 --------------------
[]
Chunk:  237 --------------------
[]
Chunk:  238 --------------------
[]
Chunk:  239 --------------------
[]
Chunk:  240 --------------------
[]
Chunk:  241 --------------------
[]
Chunk:  242 --------------------
[]
Chunk:  243 --------------------
[]
Chunk:  244 --------------------
[]
Chunk:  245 --------------------
[]
Chunk:  246 --------------------
[]
Chunk:  247 --------------------
[]
Chunk:  248 --------------------
[]
Chunk:  249 --------------------
[]
Chunk:  250 --------------------
[]
Chunk:  251 --------------------
[]
Chunk:  252 --------------------
[]
Chunk:  253 --------------------
[]
Chunk:  254 --------------------
[]
Chunk:  255 --------------------
[]
Chunk:  256 --------------------
[]
Chunk:  257 --------------------
[]
Chunk:  258 ----------------

Chunk:  459 --------------------
[]
Chunk:  460 --------------------
[]
Chunk:  461 --------------------
[]
Chunk:  462 --------------------
[]
Chunk:  463 --------------------
[]
Chunk:  464 --------------------
[]
Chunk:  465 --------------------
[]
Chunk:  466 --------------------
[]
Chunk:  467 --------------------
[]
Chunk:  468 --------------------
[]
Chunk:  469 --------------------
[]
Chunk:  470 --------------------
[]
Chunk:  471 --------------------
[]
Chunk:  472 --------------------
[]
Chunk:  473 --------------------
[]
Chunk:  474 --------------------
[]
Chunk:  475 --------------------
[]
Chunk:  476 --------------------
[]
Chunk:  477 --------------------
[]
Chunk:  478 --------------------
[]
Chunk:  479 --------------------
[]
Chunk:  480 --------------------
[]
Chunk:  481 --------------------
[]
Chunk:  482 --------------------
[]
Chunk:  483 --------------------
[]
Chunk:  484 --------------------
[]
Chunk:  485 --------------------
[]
Chunk:  486 ----------------

Chunk:  687 --------------------
[]
Chunk:  688 --------------------
[]
Chunk:  689 --------------------
[]
Chunk:  690 --------------------
[]
Chunk:  691 --------------------
[]
Chunk:  692 --------------------
[]
Chunk:  693 --------------------
[]
Chunk:  694 --------------------
[]
Chunk:  695 --------------------
[]
Chunk:  696 --------------------
[]
Chunk:  697 --------------------
[]
Chunk:  698 --------------------
[]
Chunk:  699 --------------------
[]
Chunk:  700 --------------------
[]
Chunk:  701 --------------------
[]
Chunk:  702 --------------------
[]
Chunk:  703 --------------------
[]
Chunk:  704 --------------------
[]
Chunk:  705 --------------------
[]
Chunk:  706 --------------------
[]
Chunk:  707 --------------------
[]
Chunk:  708 --------------------
[]
Chunk:  709 --------------------
[]
Chunk:  710 --------------------
[]
Chunk:  711 --------------------
[]
Chunk:  712 --------------------
[]
Chunk:  713 --------------------
[]
Chunk:  714 ----------------

Chunk:  915 --------------------
[]
Chunk:  916 --------------------
[]
Chunk:  917 --------------------
[]
Chunk:  918 --------------------
[]
Chunk:  919 --------------------
[]
Chunk:  920 --------------------
[]
Chunk:  921 --------------------
[]
Chunk:  922 --------------------
[]
Chunk:  923 --------------------
[]
Chunk:  924 --------------------
[]
Chunk:  925 --------------------
[]
Chunk:  926 --------------------
[]
Chunk:  927 --------------------
[]
Chunk:  928 --------------------
[]
Chunk:  929 --------------------
[]
Chunk:  930 --------------------
[]
Chunk:  931 --------------------
[]
Chunk:  932 --------------------
[]
Chunk:  933 --------------------
[]
Chunk:  934 --------------------
[]
Chunk:  935 --------------------
[]
Chunk:  936 --------------------
[]
Chunk:  937 --------------------
[]
Chunk:  938 --------------------
[]
Chunk:  939 --------------------
[]
Chunk:  940 --------------------
[]
Chunk:  941 --------------------
[]
Chunk:  942 ----------------

Chunk:  1139 --------------------
[]
Chunk:  1140 --------------------
[]
Chunk:  1141 --------------------
[]
Chunk:  1142 --------------------
[]
Chunk:  1143 --------------------
[]
Chunk:  1144 --------------------
[]
Chunk:  1145 --------------------
[]
Chunk:  1146 --------------------
[]
Chunk:  1147 --------------------
[]
Chunk:  1148 --------------------
[]
Chunk:  1149 --------------------
[]
Chunk:  1150 --------------------
[]
Chunk:  1151 --------------------
[]
Chunk:  1152 --------------------
[]
Chunk:  1153 --------------------
[]
Chunk:  1154 --------------------
[]
Chunk:  1155 --------------------
[]
Chunk:  1156 --------------------
[]
Chunk:  1157 --------------------
[]
Chunk:  1158 --------------------
[]
Chunk:  1159 --------------------
[]
Chunk:  1160 --------------------
[]
Chunk:  1161 --------------------
[]
Chunk:  1162 --------------------
[]
Chunk:  1163 --------------------
[]
Chunk:  1164 --------------------
[]
Chunk:  1165 --------------------
[]
C

Chunk:  1361 --------------------
[]
Chunk:  1362 --------------------
[]
Chunk:  1363 --------------------
[]
Chunk:  1364 --------------------
[]
Chunk:  1365 --------------------
[]
Chunk:  1366 --------------------
[]
Chunk:  1367 --------------------
[]
Chunk:  1368 --------------------
[]
Chunk:  1369 --------------------
[]
Chunk:  1370 --------------------
[]
Chunk:  1371 --------------------
[]
Chunk:  1372 --------------------
[]
Chunk:  1373 --------------------
[]
Chunk:  1374 --------------------
[]
Chunk:  1375 --------------------
[]
Chunk:  1376 --------------------
[]
Chunk:  1377 --------------------
[]
Chunk:  1378 --------------------
[]
Chunk:  1379 --------------------
[]
Chunk:  1380 --------------------
[]
Chunk:  1381 --------------------
[]
Chunk:  1382 --------------------
[]
Chunk:  1383 --------------------
[]
Chunk:  1384 --------------------
[]
Chunk:  1385 --------------------
[]
Chunk:  1386 --------------------
[]
Chunk:  1387 --------------------
[]
C

Chunk:  1583 --------------------
[]
Chunk:  1584 --------------------
[]
Chunk:  1585 --------------------
[]
Chunk:  1586 --------------------
[]
Chunk:  1587 --------------------
[]
Chunk:  1588 --------------------
[]
Chunk:  1589 --------------------
[]
Chunk:  1590 --------------------
[]
Chunk:  1591 --------------------
[]
Chunk:  1592 --------------------
[]
Chunk:  1593 --------------------
[]
Chunk:  1594 --------------------
[]
Chunk:  1595 --------------------
[]
Chunk:  1596 --------------------
[]
Chunk:  1597 --------------------
[]
Chunk:  1598 --------------------
[]
Chunk:  1599 --------------------
[]
Chunk:  1600 --------------------
[]
Chunk:  1601 --------------------
[]
Chunk:  1602 --------------------
[]
Chunk:  1603 --------------------
[]
Chunk:  1604 --------------------
[]
Chunk:  1605 --------------------
[]
Chunk:  1606 --------------------
[]
Chunk:  1607 --------------------
[]
Chunk:  1608 --------------------
[]
Chunk:  1609 --------------------
[]
C

Chunk:  1805 --------------------
[]
Chunk:  1806 --------------------
[]
Chunk:  1807 --------------------
[]
Chunk:  1808 --------------------
[]
Chunk:  1809 --------------------
[]
Chunk:  1810 --------------------
[]
Chunk:  1811 --------------------
[]
Chunk:  1812 --------------------
[]
Chunk:  1813 --------------------
[]
Chunk:  1814 --------------------
[]
Chunk:  1815 --------------------
[]
Chunk:  1816 --------------------
[]
Chunk:  1817 --------------------
[]
Chunk:  1818 --------------------
[]
Chunk:  1819 --------------------
[]
Chunk:  1820 --------------------
[]
Chunk:  1821 --------------------
[]
Chunk:  1822 --------------------
[]
Chunk:  1823 --------------------
[]
Chunk:  1824 --------------------
[]
Chunk:  1825 --------------------
[]
Chunk:  1826 --------------------
[]
Chunk:  1827 --------------------
[]
Chunk:  1828 --------------------
[]
Chunk:  1829 --------------------
[]
Chunk:  1830 --------------------
[]
Chunk:  1831 --------------------
[]
C

Chunk:  2027 --------------------
[]
Chunk:  2028 --------------------
[]
Chunk:  2029 --------------------
[]
Chunk:  2030 --------------------
[]
Chunk:  2031 --------------------
[]
Chunk:  2032 --------------------
[]
Chunk:  2033 --------------------
[]
Chunk:  2034 --------------------
[]
Chunk:  2035 --------------------
[]
Chunk:  2036 --------------------
[]
Chunk:  2037 --------------------
[]
Chunk:  2038 --------------------
[]
Chunk:  2039 --------------------
[]
Chunk:  2040 --------------------
[]
Chunk:  2041 --------------------
[]
Chunk:  2042 --------------------
[]
Chunk:  2043 --------------------
[]
Chunk:  2044 --------------------
[]
Chunk:  2045 --------------------
[]
Chunk:  2046 --------------------
[]
Chunk:  2047 --------------------
[]
Chunk:  2048 --------------------
[]
Chunk:  2049 --------------------
[]
Chunk:  2050 --------------------
[]
Chunk:  2051 --------------------
[]
Chunk:  2052 --------------------
[]
Chunk:  2053 --------------------
[]
C

Chunk:  2249 --------------------
[]
Chunk:  2250 --------------------
[]
Chunk:  2251 --------------------
[]
Chunk:  2252 --------------------
[]
Chunk:  2253 --------------------
[]
Chunk:  2254 --------------------
[]
Chunk:  2255 --------------------
[]
Chunk:  2256 --------------------
[]
Chunk:  2257 --------------------
[]
Chunk:  2258 --------------------
[]
Chunk:  2259 --------------------
[]
Chunk:  2260 --------------------
[]
Chunk:  2261 --------------------
[]
Chunk:  2262 --------------------
[]
Chunk:  2263 --------------------
[]
Chunk:  2264 --------------------
[]
Chunk:  2265 --------------------
[]
Chunk:  2266 --------------------
[]
Chunk:  2267 --------------------
[]
Chunk:  2268 --------------------
[]
Chunk:  2269 --------------------
[]
Chunk:  2270 --------------------
[]
Chunk:  2271 --------------------
[]
Chunk:  2272 --------------------
[]
Chunk:  2273 --------------------
[]
Chunk:  2274 --------------------
[]
Chunk:  2275 --------------------
[]
C

Chunk:  2471 --------------------
[]
Chunk:  2472 --------------------
[]
Chunk:  2473 --------------------
[]
Chunk:  2474 --------------------
[]
Chunk:  2475 --------------------
[]
Chunk:  2476 --------------------
[]
Chunk:  2477 --------------------
[]
Chunk:  2478 --------------------
[]
Chunk:  2479 --------------------
[]
Chunk:  2480 --------------------
[]
Chunk:  2481 --------------------
[]
Chunk:  2482 --------------------
[]
Chunk:  2483 --------------------
[]
Chunk:  2484 --------------------
[]
Chunk:  2485 --------------------
[]
Chunk:  2486 --------------------
[]
Chunk:  2487 --------------------
[]
Chunk:  2488 --------------------
[]
Chunk:  2489 --------------------
[]
Chunk:  2490 --------------------
[]
Chunk:  2491 --------------------
[]
Chunk:  2492 --------------------
[]
Chunk:  2493 --------------------
[]
Chunk:  2494 --------------------
[]
Chunk:  2495 --------------------
[]
Chunk:  2496 --------------------
[]
Chunk:  2497 --------------------
[]
C

Chunk:  2693 --------------------
[]
Chunk:  2694 --------------------
[]
Chunk:  2695 --------------------
[]
Chunk:  2696 --------------------
[]
Chunk:  2697 --------------------
[]
Chunk:  2698 --------------------
[]
Chunk:  2699 --------------------
[]
Chunk:  2700 --------------------
[]
Chunk:  2701 --------------------
[]
Chunk:  2702 --------------------
[]
Chunk:  2703 --------------------
[]
Chunk:  2704 --------------------
[]
Chunk:  2705 --------------------
[]
Chunk:  2706 --------------------
[]
Chunk:  2707 --------------------
[]
Chunk:  2708 --------------------
[]
Chunk:  2709 --------------------
[]
Chunk:  2710 --------------------
[]
Chunk:  2711 --------------------
[]
Chunk:  2712 --------------------
[]
Chunk:  2713 --------------------
[]
Chunk:  2714 --------------------
[]
Chunk:  2715 --------------------
[]
Chunk:  2716 --------------------
[]
Chunk:  2717 --------------------
[]
Chunk:  2718 --------------------
[]
Chunk:  2719 --------------------
[]
C

Chunk:  2915 --------------------
[]
Chunk:  2916 --------------------
[]
Chunk:  2917 --------------------
[]
Chunk:  2918 --------------------
[]
Chunk:  2919 --------------------
[]
Chunk:  2920 --------------------
[]
Chunk:  2921 --------------------
[]
Chunk:  2922 --------------------
[]
Chunk:  2923 --------------------
[]
Chunk:  2924 --------------------
[]
Chunk:  2925 --------------------
[]
Chunk:  2926 --------------------
[]
Chunk:  2927 --------------------
[]
Chunk:  2928 --------------------
[]
Chunk:  2929 --------------------
[]
Chunk:  2930 --------------------
[]
Chunk:  2931 --------------------
[]
Chunk:  2932 --------------------
[]
Chunk:  2933 --------------------
[]
Chunk:  2934 --------------------
[]
Chunk:  2935 --------------------
[]
Chunk:  2936 --------------------
[]
Chunk:  2937 --------------------
[]
Chunk:  2938 --------------------
[]
Chunk:  2939 --------------------
[]
Chunk:  2940 --------------------
[]
Chunk:  2941 --------------------
[]
C

Chunk:  3137 --------------------
[]
Chunk:  3138 --------------------
[]
Chunk:  3139 --------------------
[]
Chunk:  3140 --------------------
[]
Chunk:  3141 --------------------
[]
Chunk:  3142 --------------------
[]
Chunk:  3143 --------------------
[]
Chunk:  3144 --------------------
[]
Chunk:  3145 --------------------
[]
Chunk:  3146 --------------------
[]
Chunk:  3147 --------------------
[]
Chunk:  3148 --------------------
[]
Chunk:  3149 --------------------
[]
Chunk:  3150 --------------------
[]
Chunk:  3151 --------------------
[]
Chunk:  3152 --------------------
[]
Chunk:  3153 --------------------
[]
Chunk:  3154 --------------------
[]
Chunk:  3155 --------------------
[]
Chunk:  3156 --------------------
[]
Chunk:  3157 --------------------
[]
Chunk:  3158 --------------------
[]
Chunk:  3159 --------------------
[]
Chunk:  3160 --------------------
[]
Chunk:  3161 --------------------
[]
Chunk:  3162 --------------------
[]
Chunk:  3163 --------------------
[]
C

Chunk:  3359 --------------------
[]
Chunk:  3360 --------------------
[]
Chunk:  3361 --------------------
[]
Chunk:  3362 --------------------
[]
Chunk:  3363 --------------------
[]
Chunk:  3364 --------------------
[]
Chunk:  3365 --------------------
[]
Chunk:  3366 --------------------
[]
Chunk:  3367 --------------------
[]
Chunk:  3368 --------------------
[]
Chunk:  3369 --------------------
[]
Chunk:  3370 --------------------
[]
Chunk:  3371 --------------------
[]
Chunk:  3372 --------------------
[]
Chunk:  3373 --------------------
[]
Chunk:  3374 --------------------
[]
Chunk:  3375 --------------------
[]
Chunk:  3376 --------------------
[]
Chunk:  3377 --------------------
[]
Chunk:  3378 --------------------
[]
Chunk:  3379 --------------------
[]
Chunk:  3380 --------------------
[]
Chunk:  3381 --------------------
[]
Chunk:  3382 --------------------
[]
Chunk:  3383 --------------------
[]
Chunk:  3384 --------------------
[]
Chunk:  3385 --------------------
[]
C

Chunk:  3581 --------------------
[]
Chunk:  3582 --------------------
[]
Chunk:  3583 --------------------
[]
Chunk:  3584 --------------------
[]
Chunk:  3585 --------------------
[]
Chunk:  3586 --------------------
[]
Chunk:  3587 --------------------
[]
Chunk:  3588 --------------------
[]
Chunk:  3589 --------------------
[]
Chunk:  3590 --------------------
[]
Chunk:  3591 --------------------
[]
Chunk:  3592 --------------------
[]
Chunk:  3593 --------------------
[]
Chunk:  3594 --------------------
[]
Chunk:  3595 --------------------
[]
Chunk:  3596 --------------------
[]
Chunk:  3597 --------------------
[]
Chunk:  3598 --------------------
[]
Chunk:  3599 --------------------
[]
Chunk:  3600 --------------------
[]
Chunk:  3601 --------------------
[]
Chunk:  3602 --------------------
[]
Chunk:  3603 --------------------
[]
Chunk:  3604 --------------------
[]
Chunk:  3605 --------------------
[]
Chunk:  3606 --------------------
[]
Chunk:  3607 --------------------
[]
C

Chunk:  3803 --------------------
[]
Chunk:  3804 --------------------
[]
Chunk:  3805 --------------------
[]
Chunk:  3806 --------------------
[]
Chunk:  3807 --------------------
[]
Chunk:  3808 --------------------
[]
Chunk:  3809 --------------------
[]
Chunk:  3810 --------------------
[]
Chunk:  3811 --------------------
[]
Chunk:  3812 --------------------
[]
Chunk:  3813 --------------------
[]
Chunk:  3814 --------------------
[]
Chunk:  3815 --------------------
[]
Chunk:  3816 --------------------
[]
Chunk:  3817 --------------------
[]
Chunk:  3818 --------------------
[]
Chunk:  3819 --------------------
[]
Chunk:  3820 --------------------
[]
Chunk:  3821 --------------------
[]
Chunk:  3822 --------------------
[]
Chunk:  3823 --------------------
[]
Chunk:  3824 --------------------
[]
Chunk:  3825 --------------------
[]
Chunk:  3826 --------------------
[]
Chunk:  3827 --------------------
[]
Chunk:  3828 --------------------
[]
Chunk:  3829 --------------------
[]
C

Chunk:  4025 --------------------
[]
Chunk:  4026 --------------------
[]
Chunk:  4027 --------------------
[]
Chunk:  4028 --------------------
[]
Chunk:  4029 --------------------
[]
Chunk:  4030 --------------------
[]
Chunk:  4031 --------------------
[]
Chunk:  4032 --------------------
[]
Chunk:  4033 --------------------
[]
Chunk:  4034 --------------------
[]
Chunk:  4035 --------------------
[]
Chunk:  4036 --------------------
[]
Chunk:  4037 --------------------
[]
Chunk:  4038 --------------------
[]
Chunk:  4039 --------------------
[]
Chunk:  4040 --------------------
[]
Chunk:  4041 --------------------
[]
Chunk:  4042 --------------------
[]
Chunk:  4043 --------------------
[]
Chunk:  4044 --------------------
[]
Chunk:  4045 --------------------
[]
Chunk:  4046 --------------------
[]
Chunk:  4047 --------------------
[]
Chunk:  4048 --------------------
[]
Chunk:  4049 --------------------
[]
Chunk:  4050 --------------------
[]
Chunk:  4051 --------------------
[]
C

Chunk:  4247 --------------------
[]
Chunk:  4248 --------------------
[]
Chunk:  4249 --------------------
[]
Chunk:  4250 --------------------
[]
Chunk:  4251 --------------------
[]
Chunk:  4252 --------------------
[]
Chunk:  4253 --------------------
[]
Chunk:  4254 --------------------
[]
Chunk:  4255 --------------------
[]
Chunk:  4256 --------------------
[]
Chunk:  4257 --------------------
[]
Chunk:  4258 --------------------
[]
Chunk:  4259 --------------------
[]
Chunk:  4260 --------------------
[]
Chunk:  4261 --------------------
[]
Chunk:  4262 --------------------
[]
Chunk:  4263 --------------------
[]
Chunk:  4264 --------------------
[]
Chunk:  4265 --------------------
[]
Chunk:  4266 --------------------
[]
Chunk:  4267 --------------------
[]
Chunk:  4268 --------------------
[]
Chunk:  4269 --------------------
[]
Chunk:  4270 --------------------
[]
Chunk:  4271 --------------------
[]
Chunk:  4272 --------------------
[]
Chunk:  4273 --------------------
[]
C

Chunk:  4469 --------------------
[]
Chunk:  4470 --------------------
[]
Chunk:  4471 --------------------
[]
Chunk:  4472 --------------------
[]
Chunk:  4473 --------------------
[]
Chunk:  4474 --------------------
[]
Chunk:  4475 --------------------
[]
Chunk:  4476 --------------------
[]
Chunk:  4477 --------------------
[]
Chunk:  4478 --------------------
[]
Chunk:  4479 --------------------
[]
Chunk:  4480 --------------------
[]
Chunk:  4481 --------------------
[]
Chunk:  4482 --------------------
[]
Chunk:  4483 --------------------
[]
Chunk:  4484 --------------------
[]
Chunk:  4485 --------------------
[]
Chunk:  4486 --------------------
[]
Chunk:  4487 --------------------
[]
Chunk:  4488 --------------------
[]
Chunk:  4489 --------------------
[]
Chunk:  4490 --------------------
[]
Chunk:  4491 --------------------
[]
Chunk:  4492 --------------------
[]
Chunk:  4493 --------------------
[]
Chunk:  4494 --------------------
[]
Chunk:  4495 --------------------
[]
C

Chunk:  4691 --------------------
[]
Chunk:  4692 --------------------
[]
Chunk:  4693 --------------------
[]
Chunk:  4694 --------------------
[]
Chunk:  4695 --------------------
[]
Chunk:  4696 --------------------
[]
Chunk:  4697 --------------------
[]
Chunk:  4698 --------------------
[]
Chunk:  4699 --------------------
[]
Chunk:  4700 --------------------
[]
Chunk:  4701 --------------------
[]
Chunk:  4702 --------------------
[]
Chunk:  4703 --------------------
[]
Chunk:  4704 --------------------
[]
Chunk:  4705 --------------------
[]
Chunk:  4706 --------------------
[]
Chunk:  4707 --------------------
[]
Chunk:  4708 --------------------
[]
Chunk:  4709 --------------------
[]
Chunk:  4710 --------------------
[]
Chunk:  4711 --------------------
[]
Chunk:  4712 --------------------
[]
Chunk:  4713 --------------------
[]
Chunk:  4714 --------------------
[]
Chunk:  4715 --------------------
[]
Chunk:  4716 --------------------
[]
Chunk:  4717 --------------------
[]
C

Chunk:  4913 --------------------
[]
Chunk:  4914 --------------------
[]
Chunk:  4915 --------------------
[]
Chunk:  4916 --------------------
[]
Chunk:  4917 --------------------
[]
Chunk:  4918 --------------------
[]
Chunk:  4919 --------------------
[]
Chunk:  4920 --------------------
[]
Chunk:  4921 --------------------
[]
Chunk:  4922 --------------------
[]
Chunk:  4923 --------------------
[]
Chunk:  4924 --------------------
[]
Chunk:  4925 --------------------
[]
Chunk:  4926 --------------------
[]
Chunk:  4927 --------------------
[]
Chunk:  4928 --------------------
[]
Chunk:  4929 --------------------
[]
Chunk:  4930 --------------------
[]
Chunk:  4931 --------------------
[]
Chunk:  4932 --------------------
[]
Chunk:  4933 --------------------
[]
Chunk:  4934 --------------------
[]
Chunk:  4935 --------------------
[]
Chunk:  4936 --------------------
[]
Chunk:  4937 --------------------
[]
Chunk:  4938 --------------------
[]
Chunk:  4939 --------------------
[]
C

Chunk:  5135 --------------------
[]
Chunk:  5136 --------------------
[]
Chunk:  5137 --------------------
[]
Chunk:  5138 --------------------
[]
Chunk:  5139 --------------------
[]
Chunk:  5140 --------------------
[]
Chunk:  5141 --------------------
[]
Chunk:  5142 --------------------
[]
Chunk:  5143 --------------------
[]
Chunk:  5144 --------------------
[]
Chunk:  5145 --------------------
[]
Chunk:  5146 --------------------
[]
Chunk:  5147 --------------------
[]
Chunk:  5148 --------------------
[]
Chunk:  5149 --------------------
[]
Chunk:  5150 --------------------
[]
Chunk:  5151 --------------------
[]
Chunk:  5152 --------------------
[]
Chunk:  5153 --------------------
[]
Chunk:  5154 --------------------
[]
Chunk:  5155 --------------------
[]
Chunk:  5156 --------------------
[]
Chunk:  5157 --------------------
[]
Chunk:  5158 --------------------
[]
Chunk:  5159 --------------------
[]
Chunk:  5160 --------------------
[]
Chunk:  5161 --------------------
[]
C

Chunk:  5357 --------------------
[]
Chunk:  5358 --------------------
[]
Chunk:  5359 --------------------
[]
Chunk:  5360 --------------------
[]
Chunk:  5361 --------------------
[]
Chunk:  5362 --------------------
[]
Chunk:  5363 --------------------
[]
Chunk:  5364 --------------------
[]
Chunk:  5365 --------------------
[]
Chunk:  5366 --------------------
[]
Chunk:  5367 --------------------
[]
Chunk:  5368 --------------------
[]
Chunk:  5369 --------------------
[]
Chunk:  5370 --------------------
[]
Chunk:  5371 --------------------
[]
Chunk:  5372 --------------------
[]
Chunk:  5373 --------------------
[]
Chunk:  5374 --------------------
[]
Chunk:  5375 --------------------
[]
Chunk:  5376 --------------------
[]
Chunk:  5377 --------------------
[]
Chunk:  5378 --------------------
[]
Chunk:  5379 --------------------
[]
Chunk:  5380 --------------------
[]
Chunk:  5381 --------------------
[]
Chunk:  5382 --------------------
[]
Chunk:  5383 --------------------
[]
C

Chunk:  5579 --------------------
[]
Chunk:  5580 --------------------
[]
Chunk:  5581 --------------------
[]
Chunk:  5582 --------------------
[]
Chunk:  5583 --------------------
[]
Chunk:  5584 --------------------
[]
Chunk:  5585 --------------------
[]
Chunk:  5586 --------------------
[]
Chunk:  5587 --------------------
[]
Chunk:  5588 --------------------
[]
Chunk:  5589 --------------------
[]
Chunk:  5590 --------------------
[]
Chunk:  5591 --------------------
[]
Chunk:  5592 --------------------
[]
Chunk:  5593 --------------------
[]
Chunk:  5594 --------------------
[]
Chunk:  5595 --------------------
[]
Chunk:  5596 --------------------
[]
Chunk:  5597 --------------------
[]
Chunk:  5598 --------------------
[]
Chunk:  5599 --------------------
[]
Chunk:  5600 --------------------
[]
Chunk:  5601 --------------------
[]
Chunk:  5602 --------------------
[]
Chunk:  5603 --------------------
[]
Chunk:  5604 --------------------
[]
Chunk:  5605 --------------------
[]
C

Chunk:  5801 --------------------
[]
Chunk:  5802 --------------------
[]
Chunk:  5803 --------------------
[]
Chunk:  5804 --------------------
[]
Chunk:  5805 --------------------
[]
Chunk:  5806 --------------------
[]
Chunk:  5807 --------------------
[]
Chunk:  5808 --------------------
[]
Chunk:  5809 --------------------
[]
Chunk:  5810 --------------------
[]
Chunk:  5811 --------------------
[]
Chunk:  5812 --------------------
[]
Chunk:  5813 --------------------
[]
Chunk:  5814 --------------------
[]
Chunk:  5815 --------------------
[]
Chunk:  5816 --------------------
[]
Chunk:  5817 --------------------
[]
Chunk:  5818 --------------------
[]
Chunk:  5819 --------------------
[]
Chunk:  5820 --------------------
[]
Chunk:  5821 --------------------
[]
Chunk:  5822 --------------------
[]
Chunk:  5823 --------------------
[]
Chunk:  5824 --------------------
[]
Chunk:  5825 --------------------
[]
Chunk:  5826 --------------------
[]
Chunk:  5827 --------------------
[]
C

Chunk:  6023 --------------------
[]
Chunk:  6024 --------------------
[]
Chunk:  6025 --------------------
[]
Chunk:  6026 --------------------
[]
Chunk:  6027 --------------------
[]
Chunk:  6028 --------------------
[]
Chunk:  6029 --------------------
[]
Chunk:  6030 --------------------
[]
Chunk:  6031 --------------------
[]
Chunk:  6032 --------------------
[]
Chunk:  6033 --------------------
[]
Chunk:  6034 --------------------
[]
Chunk:  6035 --------------------
[]
Chunk:  6036 --------------------
[]
Chunk:  6037 --------------------
[]
Chunk:  6038 --------------------
[]
Chunk:  6039 --------------------
[]
Chunk:  6040 --------------------
[]
Chunk:  6041 --------------------
[]
Chunk:  6042 --------------------
[]
Chunk:  6043 --------------------
[]
Chunk:  6044 --------------------
[]
Chunk:  6045 --------------------
[]
Chunk:  6046 --------------------
[]
Chunk:  6047 --------------------
[]
Chunk:  6048 --------------------
[]
Chunk:  6049 --------------------
[]
C

Chunk:  6245 --------------------
[]
Chunk:  6246 --------------------
[]
Chunk:  6247 --------------------
[]
Chunk:  6248 --------------------
[]
Chunk:  6249 --------------------
[]
Chunk:  6250 --------------------
[]
Chunk:  6251 --------------------
[]
Chunk:  6252 --------------------
[]
Chunk:  6253 --------------------
[]
Chunk:  6254 --------------------
[]
Chunk:  6255 --------------------
[]
Chunk:  6256 --------------------
[]
Chunk:  6257 --------------------
[]
Chunk:  6258 --------------------
[]
Chunk:  6259 --------------------
[]
Chunk:  6260 --------------------
[]
Chunk:  6261 --------------------
[]
Chunk:  6262 --------------------
[]
Chunk:  6263 --------------------
[]
Chunk:  6264 --------------------
[]
Chunk:  6265 --------------------
[]
Chunk:  6266 --------------------
[]
Chunk:  6267 --------------------
[]
Chunk:  6268 --------------------
[]
Chunk:  6269 --------------------
[]
Chunk:  6270 --------------------
[]
Chunk:  6271 --------------------
[]
C

Chunk:  6467 --------------------
[]
Chunk:  6468 --------------------
[]
Chunk:  6469 --------------------
[]
Chunk:  6470 --------------------
[]
Chunk:  6471 --------------------
[]
Chunk:  6472 --------------------
[]
Chunk:  6473 --------------------
[]
Chunk:  6474 --------------------
[]
Chunk:  6475 --------------------
[]
Chunk:  6476 --------------------
[]
Chunk:  6477 --------------------
[]
Chunk:  6478 --------------------
[]
Chunk:  6479 --------------------
[]
Chunk:  6480 --------------------
[]
Chunk:  6481 --------------------
[]
Chunk:  6482 --------------------
[]
Chunk:  6483 --------------------
[]
Chunk:  6484 --------------------
[]
Chunk:  6485 --------------------
[]
Chunk:  6486 --------------------
[]
Chunk:  6487 --------------------
[]
Chunk:  6488 --------------------
[]
Chunk:  6489 --------------------
[]
Chunk:  6490 --------------------
[]
Chunk:  6491 --------------------
[]
Chunk:  6492 --------------------
[]
Chunk:  6493 --------------------
[]
C

Chunk:  6689 --------------------
[]
Chunk:  6690 --------------------
[]
Chunk:  6691 --------------------
[]
Chunk:  6692 --------------------
[]
Chunk:  6693 --------------------
[]
Chunk:  6694 --------------------
[]
Chunk:  6695 --------------------
[]
Chunk:  6696 --------------------
[]
Chunk:  6697 --------------------
[]
Chunk:  6698 --------------------
[]
Chunk:  6699 --------------------
[]
Chunk:  6700 --------------------
[]
Chunk:  6701 --------------------
[]
Chunk:  6702 --------------------
[]
Chunk:  6703 --------------------
[]
Chunk:  6704 --------------------
[]
Chunk:  6705 --------------------
[]
Chunk:  6706 --------------------
[]
Chunk:  6707 --------------------
[]
Chunk:  6708 --------------------
[]
Chunk:  6709 --------------------
[]
Chunk:  6710 --------------------
[]
Chunk:  6711 --------------------
[]
Chunk:  6712 --------------------
[]
Chunk:  6713 --------------------
[]
Chunk:  6714 --------------------
[]
Chunk:  6715 --------------------
[]
C

Chunk:  6911 --------------------
[]
Chunk:  6912 --------------------
[]
Chunk:  6913 --------------------
[]
Chunk:  6914 --------------------
[]
Chunk:  6915 --------------------
[]
Chunk:  6916 --------------------
[]
Chunk:  6917 --------------------
[]
Chunk:  6918 --------------------
[]
Chunk:  6919 --------------------
[]
Chunk:  6920 --------------------
[]
Chunk:  6921 --------------------
[]
Chunk:  6922 --------------------
[]
Chunk:  6923 --------------------
[]
Chunk:  6924 --------------------
[]
Chunk:  6925 --------------------
[]
Chunk:  6926 --------------------
[]
Chunk:  6927 --------------------
[]
Chunk:  6928 --------------------
[]
Chunk:  6929 --------------------
[]
Chunk:  6930 --------------------
[]
Chunk:  6931 --------------------
[]
Chunk:  6932 --------------------
[]
Chunk:  6933 --------------------
[]
Chunk:  6934 --------------------
[]
Chunk:  6935 --------------------
[]
Chunk:  6936 --------------------
[]
Chunk:  6937 --------------------
[]
C

HTTPError: 429 Client Error: Too Many Requests for url: http://grch37.rest.ensembl.org/sequence/region/human

In [153]:
# Total validated mutations
7041*50

352050

In [155]:
# Total mutations in the file
!wc -l ~/results/corrected_mutations2.tsv

3801761 /home/agarcia/results/corrected_mutations2.tsv
