In [2]:
import ensembl

In [3]:
client = ensembl.Client()

In [4]:
client.assembly_map(region=ensembl.region_str('X', start=1000000, end=1000100),
                    from_assembly='GRCh37',
                    to_assembly='GRCh38')

{'mappings': [{'mapped': {'assembly': 'GRCh38',
    'coord_system': 'chromosome',
    'end': 1039365,
    'seq_region_name': 'X',
    'start': 1039265,
    'strand': 1},
   'original': {'assembly': 'GRCh37',
    'coord_system': 'chromosome',
    'end': 1000100,
    'seq_region_name': 'X',
    'start': 1000000,
    'strand': 1}}]}

In [5]:
assembly_info = client.assembly_info()
assembly_info

{'assembly_accession': 'GCA_000001405.25',
 'assembly_date': '2013-12',
 'assembly_name': 'GRCh38.p10',
 'coord_system_versions': ['GRCh38', 'GRCh37', 'NCBI36', 'NCBI35', 'NCBI34'],
 'default_coord_system_version': 'GRCh38',
 'genebuild_initial_release_date': '2014-07',
 'genebuild_last_geneset_update': '2017-06',
 'genebuild_method': 'full_genebuild',
 'genebuild_start_date': '2014-01-Ensembl',
 'karyotype': ['1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  'X',
  'Y',
  'MT'],
 'top_level_region': [{'coord_system': 'scaffold',
   'length': 71251,
   'name': 'KI270757.1'},
  {'coord_system': 'scaffold', 'length': 157432, 'name': 'KI270741.1'},
  {'coord_system': 'scaffold', 'length': 79590, 'name': 'KI270756.1'},
  {'coord_system': 'scaffold', 'length': 112551, 'name': 'KI270730.1'},
  {'coord_system': 'scaffold', 'length': 73985, 'name': 'KI270739.1'},
  {'coord_syste

In [6]:
chrom_info = [item for item in assembly_info['top_level_region']
                  if item['coord_system'] == 'chromosome']
chrom_info

[{'coord_system': 'chromosome', 'length': 57227415, 'name': 'Y'},
 {'coord_system': 'chromosome', 'length': 64444167, 'name': '20'},
 {'coord_system': 'chromosome', 'length': 156040895, 'name': 'X'},
 {'coord_system': 'chromosome', 'length': 114364328, 'name': '13'},
 {'coord_system': 'chromosome', 'length': 50818468, 'name': '22'},
 {'coord_system': 'chromosome', 'length': 133797422, 'name': '10'},
 {'coord_system': 'chromosome', 'length': 170805979, 'name': '6'},
 {'coord_system': 'chromosome', 'length': 58617616, 'name': '19'},
 {'coord_system': 'chromosome', 'length': 107043718, 'name': '14'},
 {'coord_system': 'chromosome', 'length': 80373285, 'name': '18'},
 {'coord_system': 'chromosome', 'length': 242193529, 'name': '2'},
 {'coord_system': 'chromosome', 'length': 190214555, 'name': '4'},
 {'coord_system': 'chromosome', 'length': 46709983, 'name': '21'},
 {'coord_system': 'chromosome', 'length': 138394717, 'name': '9'},
 {'coord_system': 'chromosome', 'length': 135086622, 'name':

In [8]:
from collections import defaultdict
assembly_map = defaultdict(list)

for chrom in chrom_info:
    print("Querying chrom", chrom['name'])
    region = ensembl.region_str(chrom['name'], 1, chrom['length'])
    map_ = client.assembly_map(region, 
                               from_assembly='GRCh37', 
                               to_assembly='GRCh38')
    
    for item in map_['mappings']:
        from_ = item['original']
        from_region = from_['start'],from_['end']
        
        to = item['mapped']
        to_region = to['start'],to['end']
        
        assembly_map[chrom['name']].append((from_region,to_region))
        
    assembly_map[chrom['name']].sort(key=lambda x: x[0][0])

Querying chrom Y
Querying chrom 20
Querying chrom X
Querying chrom 13
Querying chrom 22
Querying chrom 10
Querying chrom 6
Querying chrom 19
Querying chrom 14
Querying chrom 18
Querying chrom 2
Querying chrom 4
Querying chrom 21
Querying chrom 9
Querying chrom 11
Querying chrom 17
Querying chrom 8
Querying chrom 7
Querying chrom 15
Querying chrom 12
Querying chrom 1
Querying chrom 16
Querying chrom 5
Querying chrom 3
Querying chrom MT


In [9]:
len(assembly_map)

25

In [10]:
for chrom in assembly_map:
    print(len(assembly_map[chrom]))

181
203
3171
46
707
553
1304
920
1350
86
830
271
242
930
418
364
567
1656
737
105
2862
46
123
465
1


In [12]:
assembly_map['1'][:10]

[((10001, 177417), (10001, 177417)),
 ((227418, 267719), (257667, 297968)),
 ((317720, 471368), (347969, 501617)),
 ((521369, 1566075), (585989, 1630695)),
 ((1566076, 1569784), (1630697, 1634405)),
 ((1569785, 1570918), (1634409, 1635542)),
 ((1570919, 1570922), (1635547, 1635550)),
 ((1570923, 1574299), (1635561, 1638937)),
 ((1574300, 1583669), (1638939, 1648308)),
 ((1583670, 1583878), (1648310, 1648518))]

# Trying to minimize queries using an interval tree

In [17]:
import intervaltree as it

In [18]:
from collections import defaultdict
assembly_map = defaultdict(IntervalTree)

for chrom in chrom_info:
    print("Querying chrom", chrom['name'])
    region = ensembl.region_str(chrom['name'], 
                                start=1, end=chrom['length'])
    map_ = client.assembly_map(region, 
                               from_assembly='GRCh37', 
                               to_assembly='GRCh38')
    
    for item in map_['mappings']:
        from_ = item['original']
        # Need to modify to represent a half open
        # interval (as [a,b) instead of [a,b])
        from_region = from_['start'],from_['end']+1
        
        to = item['mapped']
        to_region = to['start'],to['end']
        
        try:
            assembly_map[chrom['name']].addi(*from_region, 
                                         data=to_region)
        except ValueError:
            print(item)
            raise

Querying chrom Y
Querying chrom 20
Querying chrom X
Querying chrom 13
Querying chrom 22
Querying chrom 10
Querying chrom 6
Querying chrom 19
Querying chrom 14
Querying chrom 18
Querying chrom 2
Querying chrom 4
Querying chrom 21
Querying chrom 9
Querying chrom 11
Querying chrom 17
Querying chrom 8
Querying chrom 7
Querying chrom 15
Querying chrom 12
Querying chrom 1
Querying chrom 16
Querying chrom 5
Querying chrom 3
Querying chrom MT


In [21]:
# Helper function to test
from itertools import islice

def head(iterable, items=10):
    'Return the first items of an iterator.'
    iterator = iter(iterable)
    return islice(iterator, items)
# ---

import ...src.Python.ICGC_data_parser as dp
reader = dp.SSM_Reader(filename='data/ssm_sample.vcf')

for record in head(reader.parse(filters=['BRCA-EU'])):
    print(record.ID, record.CHROM, record.POS)

SyntaxError: invalid syntax (<ipython-input-21-7fa30efc291e>, line 10)