In [5]:
import sys
import csv
import re
import collections

In [2]:
input_genes_file = '/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/Homo_sapiens.GRCh38.92.chr.gtf'
out_genes_file = '/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/jupyter.gtf'

In [3]:
attributes_str = " --attribute=gene_biotype:protein_coding,
                   --attribute=gene_biotype:lncRNA,
                   --attribute=gene_biotype:antisense
                   --attribute=gene_biotype:IG_LV_gene
                   --attribute=gene_biotype:IG_V_gene
                   --attribute=gene_biotype:IG_V_pseudogene
                   --attribute=gene_biotype:IG_D_gene
                   --attribute=gene_biotype:IG_J_gene
                   --attribute=gene_biotype:IG_J_pseudogene
                   --attribute=gene_biotype:IG_C_gene
                   --attribute=gene_biotype:IG_C_pseudogene
                   --attribute=gene_biotype:TR_V_gene
                   --attribute=gene_biotype:TR_V_pseudogene
                   --attribute=gene_biotype:TR_D_gene
                   --attribute=gene_biotype:TR_J_gene
                   --attribute=gene_biotype:TR_J_pseudogene
                   --attribute=gene_biotype:TR_C_gene "

In [4]:
attributes = collections.defaultdict(set)

In [5]:
    for attribute_str in attributes_str:
        parts = attribute_str.split(':')
        if len(parts) != 2:
            sys.exit("Attribute option must have format <key;value>: %s" % attribute_str)
        key, value = parts
        attributes[key].add(value)

SystemExit: Attribute option must have format <key;value>:  

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:
input_genes_file = '/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/Homo_sapiens.GRCh38.92.chr.gtf'

In [None]:
with open(input_genes_file, 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for i, row in enumerate(reader):
        print(row[-1])

In [7]:
def get_properties_dict(properties_str):
    if isinstance(properties_str, dict):
        return properties_str

    properties = collections.OrderedDict()
    pattern = re.compile('(\S+?)\s*"(.*?)"')
    for m in re.finditer(pattern, properties_str):
        print(m)
        key = m.group(1)
        value = m.group(2)
        properties[key] = value
    return properties

In [8]:
def gtf_reader_iter(in_gtf):
    with open(in_gtf, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for i, row in enumerate(reader):
            if len(row) == 0:
                continue
            if row[0].startswith('#'):
                yield row, True, None
                continue

            if len(row) != 9:
                sys.exit("Invalid number of columns in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))

            strand = row[6]
            if strand not in ['+', '-']:
                sys.exit('Invalid strand in GTF line %d: %s\n\n%s' % (i+1, '\t'.join(row), 'Please check your GTF file'))

            annotation = row[2]
            properties = get_properties_dict(row[8])
            if annotation == 'exon':
                if 'transcript_id' not in properties:
                    sys.exit("Property 'transcript_id' not found in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))
                if ';' in properties['transcript_id']:
                    sys.exit("Property 'transcript_id' has invalid character ';' in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))
                if re.search(r'\s', properties['transcript_id']) is not None:
                    sys.exit("Property 'transcript_id' has invalid whitespace character in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))
                if 'gene_id' not in properties:
                    sys.exit("Property 'gene_id' not found in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))
                if ';' in properties['gene_id']:
                    sys.exit("Property 'gene_id' has invalid character ';' in GTF line %d: %s\n\n%s" % (i+1, '\t'.join(row), 'Please check your GTF file'))

            yield row, False, properties

In [9]:
        for row, is_comment, properties in gtf_reader_iter(input_genes_file):
            print(properties)
            break
            remove = False
            for key, value in properties.items():
                if key in ATTRIBUTES and value not in ATTRIBUTES[key]:
                    remove = True

            if not remove:
                print(row)


<re.Match object; span=(0, 25), match='gene_id "ENSG00000223972"'>
<re.Match object; span=(27, 43), match='gene_version "5"'>
<re.Match object; span=(45, 64), match='gene_name "DDX11L1"'>
<re.Match object; span=(66, 86), match='gene_source "havana"'>
<re.Match object; span=(88, 137), match='gene_biotype "transcribed_unprocessed_pseudogene">
OrderedDict([('gene_id', 'ENSG00000223972'), ('gene_version', '5'), ('gene_name', 'DDX11L1'), ('gene_source', 'havana'), ('gene_biotype', 'transcribed_unprocessed_pseudogene')])


In [32]:
ATTRIBUTES = {
    'gene_biotype':{
        'lncRNA',
        'antisense',
        'IG_LV_gene',
        'IG_V_gene',
        'IG_V_pseudogene',
        'IG_D_gene',
        'IG_J_gene',
        'IG_J_pseudogene',
        'IG_C_gene',
        'IG_C_pseudogene',
        'TR_V_gene',
        'TR_V_pseudogene',
        'TR_D_gene',
        'TR_J_gene',
        'TR_J_pseudogene',
        'TR_C_gene',
    }
}

In [42]:
count_set_sgr = set()
with open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/cs.gtf', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    pattern = r'gene_biotype "(.*?)"'
    for i, row in enumerate(reader):
        gene_name = re.findall(pattern, row[-1])[0]
        count_set_sgr.add(gene_name)

In [43]:
count_set_sgr

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense'}

In [44]:
count_set_sgr = set()
with open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/Homo_sapiens.GRCh38.92.chr.gtf', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    pattern = r'gene_biotype "(.*?)"'
    for i, row in enumerate(reader):
        gene_name = re.findall(pattern, row[-1])[0]
        count_set_sgr.add(gene_name)

In [45]:
count_set_sgr

{'3prime_overlapping_ncRNA',
 'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'IG_pseudogene',
 'Mt_rRNA',
 'Mt_tRNA',
 'TEC',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense',
 'bidirectional_promoter_lncRNA',
 'lincRNA',
 'macro_lncRNA',
 'miRNA',
 'misc_RNA',
 'non_coding',
 'polymorphic_pseudogene',
 'processed_pseudogene',
 'processed_transcript',
 'protein_coding',
 'pseudogene',
 'rRNA',
 'ribozyme',
 'sRNA',
 'scRNA',
 'scaRNA',
 'sense_intronic',
 'sense_overlapping',
 'snRNA',
 'snoRNA',
 'transcribed_processed_pseudogene',
 'transcribed_unitary_pseudogene',
 'transcribed_unprocessed_pseudogene',
 'translated_processed_pseudogene',
 'unitary_pseudogene',
 'unprocessed_pseudogene',
 'vaultRNA'}

In [46]:
count_set_sgr = set()
with open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/c3out.gtf', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    pattern = r'gene_biotype "(.*?)"'
    for i, row in enumerate(reader):
        gene_name = re.findall(pattern, row[-1])[0]
        count_set_sgr.add(gene_name)

In [47]:
count_set_sgr

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense',
 'protein_coding'}

In [48]:
count_set_sgr = set()
with open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/cs.gtf', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    pattern = r'gene_biotype "(.*?)"'
    for i, row in enumerate(reader):
        gene_name = re.findall(pattern, row[-1])[0]
        count_set_sgr.add(gene_name)

In [49]:
count_set_sgr

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense',
 'protein_coding'}

In [51]:
count_set_sgr = set()
with open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/c7out.gtf', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    pattern = r'gene_biotype "(.*?)"'
    for i, row in enumerate(reader):
        gene_name = re.findall(pattern, row[-1])[0]
        count_set_sgr.add(gene_name)

In [52]:
count_set_sgr

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'antisense',
 'protein_coding'}

In [59]:
c7 = open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/c7out.gtf', 'r')
c7reader = csv.reader(c7, delimiter='\t')
cs = open('/SGRNJ06/randd/USER/cjj/celedev/mkgtf/c3/cs.gtf', 'r')
csreader = csv.reader(cs, delimiter='\t')

In [62]:
next(c7reader)

['1',
 'havana',
 'transcript',
 '65419',
 '71585',
 '.',
 '+',
 '.',
 'gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic";']

In [63]:
next(csreader)

['1',
 'havana',
 'transcript',
 '65419',
 '71585',
 '.',
 '+',
 '.',
 'gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic";']

In [57]:
count = 0
for i in range(2505922):
    count+=1 
    if next(c7reader) != next(csreader):
        print(count)

StopIteration: 

In [58]:
next(c7reader)

StopIteration: 

In [10]:
a = 
5       ensembl_havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000296792"; transcript_version "8"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS34186"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "1";
5       ensembl_havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000296792"; transcript_version "8"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS34186"; protein_id "ENSP00000296792"; protein_version "4"; tag "basic"; transcript_support_level "1";
5       ensembl exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000543251"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-208"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68894"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "2";
5       ensembl CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000543251"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-208"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68894"; protein_id "ENSP00000440796"; protein_version "1"; tag "basic"; transcript_support_level "2";
5       havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000509005"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001083657"; exon_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; transcript_support_level "2";
5       havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000509005"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000421669"; protein_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; transcript_support_level "2";
5       havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000508491"; transcript_version "1"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68893"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "2";
5       havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000508491"; transcript_version "1"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68893"; protein_id "ENSP00000424609"; protein_version "1"; tag "basic"; transcript_support_level "2";

SyntaxError: invalid syntax (<ipython-input-10-38192711f8db>, line 1)

In [None]:
b = 
5       ensembl_havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000296792"; transcript_version "8"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS34186"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "1";
5       ensembl_havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000296792"; transcript_version "8"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS34186"; protein_id "ENSP00000296792"; protein_version "4"; tag "basic"; transcript_support_level "1";
5       ensembl exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000543251"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-208"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68894"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "2";
5       ensembl CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000543251"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-208"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68894"; protein_id "ENSP00000440796"; protein_version "1"; tag "basic"; transcript_support_level "2";
5       havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000509005"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-204"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001083657"; exon_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; transcript_support_level "2";
5       havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000509005"; transcript_version "5"; exon_number "11"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-204"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000421669"; protein_version "1"; tag "cds_start_NF"; tag "mRNA_start_NF"; transcript_support_level "2";
5       havana  exon    73579317        73579375        .       +       .       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000508491"; transcript_version "1"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68893"; exon_id "ENSE00001083657"; exon_version "1"; tag "basic"; transcript_support_level "2";
5       havana  CDS     73579317        73579375        .       +       1       gene_id "ENSG00000164338"; gene_version "9"; transcript_id "ENST00000508491"; transcript_version "1"; exon_number "12"; gene_name "UTP15"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "UTP15-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS68893"; protein_id "ENSP00000424609"; protein_version "1"; tag "basic"; transcript_support_level "2";