Setting up a database with the following schema:
![schema](database_schema.png)

In [17]:
from pprint import pprint

In [6]:
def open_vcf(filename):
    """A generator to handle VCF files easily"""
    # Open the VCF file
    with open(filename) as file:
        # Yield only the lines that aren't comments
        for line in file:
            if not line.startswith('#'):
                yield line
    

In [4]:
%ls ../data

[0m[01;32mall_affected_genes.list[0m*  [34;42mdata_release_25[0m/     [01;32mssm_head500.vcf[0m*
[01;32mall_projects.list[0m*        [01;32mmain_genes.list[0m*     [01;32mssm_head50.vcf[0m*
[34;42mdata_release_22[0m/          [01;32mmain_projects.list[0m*  [01;32mssm_mixed.vcf[0m*


In [58]:
ssm_head50 = open_vcf('../data/ssm_head50.vcf')

In [59]:
for mutation in ssm_head50:
    pprint(mutation.split('\t'))

['1',
 '100000022',
 'MU39532371',
 'C',
 'T',
 '.',
 '.',
 'CONSEQUENCE=||||||intergenic_region||,RP11-413P11.1|ENSG00000224445|1|RP11-413P11.1-001|ENST00000438829||upstream_gene_variant||;OCCURRENCE=SKCA-BR|1|70|0.01429;affected_donors=1;mutation=C>T;project_count=1;tested_donors=10638\n']
['1',
 '100000181',
 'MU259333',
 'G',
 'A',
 '.',
 '.',
 'CONSEQUENCE=||||||intergenic_region||,RP11-413P11.1|ENSG00000224445|1|RP11-413P11.1-001|ENST00000438829||upstream_gene_variant||;OCCURRENCE=ESAD-UK|1|203|0.00493;affected_donors=1;mutation=G>A;project_count=1;tested_donors=10638\n']
['1',
 '100000340',
 'MU64917116',
 'AGGAATAGGGTGGGTCTGTGGCATTTAATCAGCGG',
 'AGTATAGAGTGTAAAGAGT',
 '.',
 '.',
 'CONSEQUENCE=||||||intergenic_region||,RP11-413P11.1|ENSG00000224445|1|RP11-413P11.1-001|ENST00000438829||upstream_gene_variant||;OCCURRENCE=BRCA-EU|1|560|0.00179;affected_donors=1;mutation=AGGAATAGGGTGGGTCTGTGGCATTTAATCAGCGG>AGTATAGAGTGTAAAGAGT;project_count=1;tested_donors=10638\n']
['1',
 '100000409

In [146]:
# < -- Functions to parse each mutation

def parse_mutation(raw_mutation):
    """Decompose a raw mutation line into fields, returns it as a dict"""
    # The fields the mutation is composed of
    fields = ['chromosome', 'GRCh37_pos', 
              'mutation_id', 'reference_allele', 
              'mutated_allele', 'quality', 
              'filter', 'INFO'
             ]
    # Split the mutation into fields and eliminate newline
    mutation_splitted = raw_mutation.strip().split('\t')
    # Assemble the mutation
    mutation = dict( zip(fields, mutation_splitted) )
    # Clean the fields
    mutation['GRCh37_pos'] = int(mutation['GRCh37_pos'])
    mutation.update( parse_INFO(mutation['INFO']) ) # Parse the INFO field
    del mutation['INFO']
    # Return the mutation
    return mutation
# ---


# < < -- Functions to parse the INFO field


def parse_INFO(raw_INFO):
    """Parse the raw text INFO field of a mutation into:
        - consecuences
            + gene_symbol
            + gene_affected
            + gene_strand
            + transcript_name
            + transcript_affected
            + protein_affected
            + consequence_type
            + cds_mutation
            + aa_mutation
            
        - occurrence_by_project
            + project_code
            + affected_donors
            + tested_donors
            + frequency
            
        - occurrence_global
            + affected_donors
            + tested_donors
            + mutation
            + frequency
            
    Returns the dictionary with the corresponding data
    """
    # The fields we'll split into
    fields = ['consequences', 
              'occurrence_by_project', 
              'occurrence_global'
             ]
    # Split the data into fields
    consequences, occurrence_by_project, *occurrence_global = raw_INFO.split(';')
    data = consequences, occurrence_by_project, occurrence_global
    INFO = dict( zip(fields, data) )
    # Clean the fields
    INFO['consequences'] = parse_consequences(INFO['consequences'])
    INFO['occurrence_by_project'] = parse_occurrences_by_project(
                                        INFO['occurrence_by_project']
                                    )
    INFO['occurrence_global'] = parse_occurrence_global(
                                        INFO['occurrence_global']
                                    )
    return INFO
# ---


# < < < -- Functions to parse the CONSEQUENCE subfield


def parse_consequences(raw_consequences):
    """Splits the raw comma-sepparated consequences into fields:
        + gene_symbol
        + gene_affected
        + gene_strand
        + transcript_name
        + transcript_affected
        + protein_affected
        + consequence_type
        + cds_mutation
        + aa_mutation
    * These are pipe-sepparated (|) fields
    """
    # Remove the trailing 'CONSEQUENCE=' string
    _ , trimmed_consequences = raw_consequences.split('=')
    # Separate each consequence
    consequences = trimmed_consequences.split(',')
    # Parse each consequence field
    return list( map( parse_consequence, consequences ) )
# ---


def parse_consequence(raw_consequence):
    """Splits the raw pipe-sepparated ('|') consequence into fields:
        + gene_symbol
        + gene_affected
        + gene_strand
        + transcript_name
        + transcript_affected
        + protein_affected
        + consequence_type
        + cds_mutation
        + aa_mutation
    """
    # The fields we'll split into
    fields = ['gene_symbol', 'gene_affected',
              'gene_strand', 'transcript_name', 
              'transcript_affected', 'protein_affected', 
              'consequence_type', 'cds_mutation', 
              'aa_mutation'
             ]
    # Split into fields
    consequence_splitted = raw_consequence.split('|')
    # Assemble consequence
    consequence = dict( zip(fields, consequence_splitted) )
    return consequence
# ---


# < < < -- Functions to parse the OCCURRENCE subfield (corresponding to occurrence per project)


def parse_occurrences_by_project(raw_occurrences):
    """Splits the raw comma-sepparated occurrences into fields:
        + project_code
        + affected_donors
        + tested_donors
        + frequency
    * These are pipe-sepparated (|) fields
    """
    # Remove the trailing 'OCCURRENCE=' string
    _ , trimmed_occurrences = raw_occurrences.split('=')
    # Separate each occurrence
    occurrences = trimmed_occurrences.split(',')
    # Parse each occurrence field
    return list( map( parse_occurrence_by_project, occurrences ) )
# ---


def parse_occurrence_by_project(raw_occurrence):
    """Splits the raw pipe-separated ('|') consequence into fields:
        + project_code
        + affected_donors
        + tested_donors
        + frequency
    """
    # The fields we'll split into
    fields = ['project_code', 'affected_donors',
              'tested_donors', 'frequency'
             ]
    # Split into fields
    occurrence_splitted = raw_occurrence.split('|')
    # Assemble consequence
    occurrence = dict( zip(fields, occurrence_splitted) )
    # Clean fields
    occurrence['affected_donors'] = int(occurrence['affected_donors'])
    occurrence['tested_donors'] = int(occurrence['tested_donors'])
    occurrence['frequency'] = float(occurrence['frequency'])
    return occurrence
# ---


# < < < -- Functions to parse the global occurrence

def parse_occurrence_global(raw_occurrence):
    """Splits the raw comma-separated fields of the global occurrence:
        + affected_donors
        + mutation
        + project_count
        + tested_donors
        + frequency
    """
    # Separate into the corresponding fields
    occurrence = dict([ keyvalue.split('=') for keyvalue in raw_occurrence ])
    # Clean the fields
    occurrence['affected_donors'] = int(occurrence['affected_donors'])
    occurrence['tested_donors'] = int(occurrence['tested_donors'])
    occurrence['project_count'] = int(occurrence['project_count'])
    occurrence['frequency'] = occurrence['affected_donors']/occurrence['tested_donors']
    return occurrence
# ---

# < -- Test
for raw_mutation in open_vcf('../data/ssm_head500.vcf'):
    mutation = parse_mutation(raw_mutation)
    pprint(mutation)

{'GRCh37_pos': 100000022,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000224445',
                   'gene_strand': '1',
                   'gene_symbol': 'RP11-413P11.1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000438829',
                   'transcript_name': 'RP11-413P11.1-001'}],
 'filter': '.',
 'mutated_allele': 'T',
 'mutation_id': 'MU39532371',
 'occurrence_by_project': [{'affected_donors': 1

                   'protein_affected': '',
                   'transcript_affected': 'ENST00000403197',
                   'transcript_name': 'NMNAT1-002'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000228150',
                   'gene_strand': '+',
                   'gene_symbol': 'RP11-84A14.4',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000445884',
                   'transcript_name': 'RP11-84A14.4-001'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000173614',
                   'gene_strand': '+',
                   'gene_symbol': 'NMNAT1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000462686',
            

                   'transcript_affected': 'ENST00000488540',
                   'transcript_name': 'LZIC-004'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000173614',
                   'gene_strand': '+',
                   'gene_symbol': 'NMNAT1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000492735',
                   'transcript_name': 'NMNAT1-004'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intron_variant',
                   'gene_affected': 'ENSG00000162441',
                   'gene_strand': '1',
                   'gene_symbol': 'LZIC',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000541052',
                   'transcript_name': 'LZIC-202'}],
 'filter': '.',
 'mutated_

                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000224445',
                   'gene_strand': '1',
                   'gene_symbol': 'RP11-413P11.1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000438829',
                   'transcript_name': 'RP11-413P11.1-001'}],
 'filter': '.',
 'mutated_allele': 'T',
 'mutation_id': 'MU51795369',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05

                            'tested_donors': 203}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'T>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'T'}
{'GRCh37_pos': 100005994,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU55262197',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
           

                       'frequency': 9.391435011269723e-05,
                       'mutation': 'A>G',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'A'}
{'GRCh37_pos': 100007893,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'T',
 'mutation_id': 'MU36253242',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00493,
                            'project_code': 'ESAD-UK',
                            'tested_donors': 203}],
 'occurrence_global': {'affected_donors': 1,
           

                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'C'}
{'GRCh37_pos': 100009846,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU54498439',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>A',
                       'project_count': 1,
           

                   'gene_strand': '+',
                   'gene_symbol': 'RP11-84A14.4',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000445884',
                   'transcript_name': 'RP11-84A14.4-001'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000173614',
                   'gene_strand': '+',
                   'gene_symbol': 'NMNAT1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000462686',
                   'transcript_name': 'NMNAT1-005'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000173614',
                   'gene_strand': '+',
                   'gene_symbol': 'NMNAT1',
                   'protein_affec

                           {'affected_donors': 2,
                            'frequency': 0.01093,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 3,
                       'frequency': 0.00028174305033809165,
                       'mutation': 'C>T',
                       'project_count': 2,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'C'}
{'GRCh37_pos': 100013211,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU62984970',
 'occurren

                   'gene_affected': 'ENSG00000273443',
                   'gene_strand': '1',
                   'gene_symbol': 'RP11-54O7.18',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000442292',
                   'transcript_name': 'RP11-54O7.18-001'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intron_variant',
                   'gene_affected': 'ENSG00000217801',
                   'gene_strand': '+',
                   'gene_symbol': 'RP11-465B22.3',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000451054',
                   'transcript_name': 'RP11-465B22.3-004'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intron_variant',
                   'gene_affected': 'ENSG00000217801',
                   'gene_strand': '+',
                   'ge

                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'C',
 'mutation_id': 'MU47349837',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>C',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'G'}
{'GRCh37_pos': 100016879,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
           

                            'tested_donors': 203}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>T',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'G'}
{'GRCh37_pos': 100019270,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'T',
 'mutation_id': 'MU45761640',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
           

                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'A'}
{'GRCh37_pos': 100020149,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU60063573',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>A',
                       'project_count': 1,
           

                       'mutation': 'G>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'G'}
{'GRCh37_pos': 100022648,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'G',
 'mutation_id': 'MU30343677',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.02703,
                            'project_code': 'PAEN-IT',
                            'tested_donors': 37}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
            

 'reference_allele': 'T'}
{'GRCh37_pos': 100025015,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU58409205',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference

                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'C'}
{'GRCh37_pos': 100027208,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'T',
 'mutation_id': 'MU59346482',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'C>T',
                       'project_count': 1,
           

                   'gene_symbol': 'NMNAT1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000492735',
                   'transcript_name': 'NMNAT1-004'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': '5_prime_UTR_premature_start_codon_gain_variant',
                   'gene_affected': 'ENSG00000162441',
                   'gene_strand': '1',
                   'gene_symbol': 'LZIC',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000541052',
                   'transcript_name': 'LZIC-202'}],
 'filter': '.',
 'mutated_allele': 'C',
 'mutation_id': 'MU4626672',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.01075,
                            'project_code': 'OV-AU',
                            'tested_donors': 93}],
 'occurrence_global': {'affected_donors': 1,
                       

 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'TT',
 'mutation_id': 'MU43615612',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'CC>TT',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'CC'}
{'GRCh37_pos': 100029877,
 'chromosome': '1',
 'conse

 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU45220254',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'G'}
{'GRCh37_pos': 100031164,
 'chromosome': '1',
 'consequen

                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU43787304',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'G>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'G'}
{'GRCh37_pos': 100033049,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
           

 'quality': '.',
 'reference_allele': 'A'}
{'GRCh37_pos': 100034347,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'G',
 'mutation_id': 'MU59162763',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00926,
                            'project_code': 'PRAD-UK',
                            'tested_donors': 108}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'A>G',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality':

                            'project_code': 'MELA-AU',
                            'tested_donors': 183}],
 'occurrence_global': {'affected_donors': 1,
                       'frequency': 9.391435011269723e-05,
                       'mutation': 'T>A',
                       'project_count': 1,
                       'tested_donors': 10648},
 'quality': '.',
 'reference_allele': 'T'}
{'GRCh37_pos': 100036657,
 'chromosome': '1',
 'consequences': [{'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intergenic_region',
                   'gene_affected': '',
                   'gene_strand': '',
                   'gene_symbol': '',
                   'protein_affected': '',
                   'transcript_affected': '',
                   'transcript_name': ''}],
 'filter': '.',
 'mutated_allele': 'A',
 'mutation_id': 'MU58678430',
 'occurrence_by_project': [{'affected_donors': 1,
                            'frequency': 0.00546,
           

                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'upstream_gene_variant',
                   'gene_affected': 'ENSG00000162441',
                   'gene_strand': '1',
                   'gene_symbol': 'LZIC',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000400903',
                   'transcript_name': 'LZIC-201'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intron_variant',
                   'gene_affected': 'ENSG00000173614',
                   'gene_strand': '+',
                   'gene_symbol': 'NMNAT1',
                   'protein_affected': '',
                   'transcript_affected': 'ENST00000403197',
                   'transcript_name': 'NMNAT1-002'},
                  {'aa_mutation': '',
                   'cds_mutation': '',
                   'consequence_type': 'intron_variant',
     

In [171]:
from pony.orm import *


db = Database()


class Mutation(db.Entity):
    id = PrimaryKey(int, auto=True)
    mutation_id = Required(str, unique=True)
    chromosome = Required(str)
    GRCh37_pos = Required(int)
    reference_allele = Required(str)
    mutated_allele = Required(str)
    quality = Optional(str)
    filter = Optional(str)
    occurrence_global = Required('OccurrenceGlobal')
    consequences = Set('Consequence')
    occurrences = Set('OccurrenceByProject')


class Consequence(db.Entity):
    id = PrimaryKey(int, auto=True)
    gene_symbol = Optional(str)
    gene_affected = Optional(str)
    gene_strand = Optional(str)
    transcript_name = Optional(str)
    transcript_affected = Optional(str)
    protein_affected = Optional(str)
    consequence_type = Optional(str)
    cds_mutation = Optional(str)
    aa_mutation = Optional(str)
    mutation = Set(Mutation)


class OccurrenceByProject(db.Entity):
    id = PrimaryKey(int, auto=True)
    project_code = Required(str)
    affected_donors = Required(int)
    tested_donors = Required(int)
    frequency = Required(float)
    mutations = Set(Mutation)


class OccurrenceGlobal(db.Entity):
    id = PrimaryKey(int, auto=True)
    project_count = Required(int)
    mutation = Optional(str)
    affected_donors = Required(int)
    tested_donors = Required(int)
    frequency = Optional(float)
    mutations = Set(Mutation)


# Initialize a database
pwd = !pwd;
db.bind(provider='sqlite', filename=pwd[0]+'/mutations.sqlite', create_db=True)

# Map the objects to tables
db.generate_mapping(create_tables=True)

In [None]:
@db_session
def add_mutation_to_db(mutation):
    """Map the parsed mutation to a form understandable by the database"""
    
    # Map the consequences
    c = [ Consequence(**consequence) for consequence in mutation['consequences'] ]

    # Map the occurrences
    op = [ OccurrenceByProject(**occurrence) for occurrence in mutation['occurrence_by_project'] ]

    # Map the global occurrence
    og = OccurrenceGlobal(**mutation['occurrence_global'])

    # Map the mutation
    excluded = ['occurrence_global', 'occurrence_by_project', 'consequences']
    m = Mutation( **{ key : mutation[key] for key in mutation if key not in excluded },
                  occurrence_global=og
                )
    # Add the occurrences by project
    for occurrence in op:
        m.occurrences.add(occurrence)
    # Add the consequences
    for consequence in c:
        m.consequences.add(consequence)
# ---

@db_session
def load_mutations(filename):
    for raw_mutation in open_vcf(filename):
        mutation = parse_mutation(raw_mutation)
        add_mutation_to_db(mutation)
# ---


load_mutations('../data/data_release_22/ssm_all.vcf')

In [149]:
Mutation.select().show()

id |mutati...|chromo...|GRCh37...|refere...|mutate...|quality|filter|occurr...
---+---------+---------+---------+---------+---------+-------+------+---------
1  |MU3953...|1        |100000022|C        |T        |.      |.     |Occurr...
2  |MU259333 |1        |100000181|G        |A        |.      |.     |Occurr...
3  |MU6491...|1        |100000340|AGGAAT...|AGTATA...|.      |.     |Occurr...
4  |MU1214865|1        |100000409|G        |A        |.      |.     |Occurr...
5  |MU6625...|1        |100000520|T        |C        |.      |.     |Occurr...
6  |MU5165...|1        |100000600|A        |G        |.      |.     |Occurr...
7  |MU4737...|1        |100000874|A        |G        |.      |.     |Occurr...
8  |MU5551...|1        |100000914|G        |A        |.      |.     |Occurr...
9  |MU1765...|1        |100000961|G        |A        |.      |.     |Occurr...
10 |MU4397...|1        |10000103 |G        |A        |.      |.     |Occurr...
11 |MU6654...|1        |100001284|T        |A       

In [150]:
OccurrenceGlobal.select().show()

id |project_count|mutation       |affected_donors|tested_donors|frequency      
---+-------------+---------------+---------------+-------------+---------------
1  |1            |C>T            |1              |10648        |9.3914350112...
2  |1            |G>A            |1              |10648        |9.3914350112...
3  |1            |AGGAATAGGGTG...|1              |10648        |9.3914350112...
4  |1            |G>A            |1              |10648        |9.3914350112...
5  |1            |T>C            |1              |10648        |9.3914350112...
6  |1            |A>G            |1              |10648        |9.3914350112...
7  |1            |A>G            |1              |10648        |9.3914350112...
8  |1            |G>A            |1              |10648        |9.3914350112...
9  |1            |G>A            |1              |10648        |9.3914350112...
10 |1            |G>A            |1              |10648        |9.3914350112...
11 |1            |T>A            |1     

In [155]:
query = select(m.occurrence_global.affected_donors for m in Mutation)

# Get the recurrence relation
from collections import Counter
recurrence = Counter(query).items()
list(recurrence)

[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (10, 1), (8, 1)]

In [135]:
db.insert?

In [161]:
%%time
2+2

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11 µs


4

In [170]:
db.drop_all_tables(with_all_data=True)