# Introduction

I should double check what's been removed in the 10x minimal annotation set.

In [1]:
import scipy
import pandas
import scanpy
import numpy
from pathlib import Path
from matplotlib import pyplot
import os
import sys
from collections import Counter
from xopen import xopen

import upsetplot

In [2]:
WRS = str(Path("~/proj/long-rna-seq-condor/").expanduser())

if WRS not in sys.path:
    sys.path.append(WRS)

from woldrnaseq import gff2table

In [3]:
tenx_root = Path("~/proj/illumina/refdata-cellranger-arc-GRCh38-2020-A-2.0.0").expanduser()
tenx_parser = gff2table.GFFParser()
with xopen(tenx_root / "genes" / "genes.gtf.gz", "rt") as instream:
    tenx_parser.read_gff(instream)

gtf.shape (2765969, 9)


In [4]:
tenx_parser.gtf

Unnamed: 0,chromosome,source,type,start,stop,score,strand,frame,gene_id,gene_version,...,transcript_type,transcript_name,transcript_support_level,havana_transcript,exon_number,exon_id,exon_version,protein_id,ccdsid,ont
0,chr1,HAVANA,gene,29554,31109,,1,,ENSG00000243485,5,...,,,,,,,,,,
1,chr1,HAVANA,transcript,29554,31097,,1,,ENSG00000243485,5,...,lncRNA,MIR1302-2HG-202,5,OTTHUMT00000002840.1,,,,,,
2,chr1,HAVANA,exon,29554,30039,,1,,ENSG00000243485,5,...,lncRNA,MIR1302-2HG-202,5,OTTHUMT00000002840.1,1.0,ENSE00001947070,1,,,
3,chr1,HAVANA,exon,30564,30667,,1,,ENSG00000243485,5,...,lncRNA,MIR1302-2HG-202,5,OTTHUMT00000002840.1,2.0,ENSE00001922571,1,,,
4,chr1,HAVANA,exon,30976,31097,,1,,ENSG00000243485,5,...,lncRNA,MIR1302-2HG-202,5,OTTHUMT00000002840.1,3.0,ENSE00001827679,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765964,KI270734.1,ENSEMBL,CDS,138483,138667,,-1,2.0,ENSG00000277196,4,...,protein_coding,AC007325.2-201,1,,15.0,ENSE00003753010,1,ENSP00000481127.1,,
2765965,KI270734.1,ENSEMBL,stop_codon,138480,138482,,-1,0.0,ENSG00000277196,4,...,protein_coding,AC007325.2-201,1,,15.0,ENSE00003753010,1,ENSP00000481127.1,,
2765966,KI270734.1,ENSEMBL,UTR,161689,161852,,-1,,ENSG00000277196,4,...,protein_coding,AC007325.2-201,1,,1.0,ENSE00003746084,1,ENSP00000481127.1,,
2765967,KI270734.1,ENSEMBL,UTR,161587,161626,,-1,,ENSG00000277196,4,...,protein_coding,AC007325.2-201,1,,2.0,ENSE00003719550,1,ENSP00000481127.1,,


In [5]:
v29_genome_dir = Path("~/proj/genome").expanduser()
v29 = v29_genome_dir / "GRCh38-V29-male-2.7.8a" / "GRCh38-V29-male-2.7.8a.h5"
store = pandas.HDFStore(v29, "r")
try:
    genes = store.select("/gtf", columns=["gene_id", "transcript_id", "gene_type", "transcript_type"], where=["type==gene"])
    gtf = store['/gtf']
finally:
    store.close()

In [6]:
genes['gene_base'] = genes['gene_id'].apply(lambda x: x.split('.')[0])

In [7]:
genes_types_included = pandas.Series(Counter(tenx_parser.gtf[tenx_parser.gtf['type'] == 'gene']['gene_type'])).to_frame('count').sort_values(by="count", ascending=False)
genes_types_included

Unnamed: 0,count
protein_coding,19394
lncRNA,16562
IG_V_pseudogene,188
IG_V_gene,144
TR_V_gene,106
TR_J_gene,79
IG_D_gene,37
TR_V_pseudogene,33
IG_J_gene,18
IG_C_gene,14


In [24]:
tenx_parser.gtf[tenx_parser.gtf['gene_id'] == 'ENSG00000267749'][['gene_id', 'gene_type', 'transcript_type']]

Unnamed: 0,gene_id,gene_type,transcript_type
2362196,ENSG00000267749,lncRNA,
2362197,ENSG00000267749,lncRNA,lncRNA
2362198,ENSG00000267749,lncRNA,lncRNA


In [9]:
genes_types_available = pandas.Series(Counter(genes['gene_type'])).to_frame('count').sort_values(by="count", ascending=False)
genes_types_available

Unnamed: 0,count
protein_coding,19969
processed_pseudogene,10198
lincRNA,7635
antisense,5587
unprocessed_pseudogene,2649
misc_RNA,2222
snRNA,1909
miRNA,1881
TEC,1060
snoRNA,943


In [10]:
target_dir = Path("~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/genome/GRCh38-V29_minimal-male").expanduser()

In [11]:
trna_types =  pandas.Series(Counter(gtf[gtf['type'] == 'tRNA']['gene_type'])).to_frame('count').sort_values(by="count", ascending=False)
trna_types

Unnamed: 0,count
Pseudo_tRNA,114
Ala_tRNA,48
Leu_tRNA,40
Asn_tRNA,38
Val_tRNA,36
Gln_tRNA,33
Lys_tRNA,33
Gly_tRNA,32
Cys_tRNA,30
Ser_tRNA,28


In [12]:
desired_types = set(genes_types_included.index)
desired_types

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'lncRNA',
 'protein_coding'}

In [13]:
desired_types.add('lincRNA')
desired_types

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'lincRNA',
 'lncRNA',
 'protein_coding'}

In [14]:
pandas.Series(Counter(gtf[gtf['type'] == 'tRNA']['gene_type'])).to_frame('count').sort_values(by="count", ascending=False)


Unnamed: 0,count
Pseudo_tRNA,114
Ala_tRNA,48
Leu_tRNA,40
Asn_tRNA,38
Val_tRNA,36
Gln_tRNA,33
Lys_tRNA,33
Gly_tRNA,32
Cys_tRNA,30
Ser_tRNA,28


In [15]:
desired_genes = set(genes[genes['gene_type'].isin(desired_types)]['gene_id'])

In [16]:
set(genes[genes['gene_type'].isin(desired_types)]['gene_type'])

{'IG_C_gene',
 'IG_C_pseudogene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_J_pseudogene',
 'IG_V_gene',
 'IG_V_pseudogene',
 'TR_C_gene',
 'TR_D_gene',
 'TR_J_gene',
 'TR_J_pseudogene',
 'TR_V_gene',
 'TR_V_pseudogene',
 'lincRNA',
 'protein_coding'}

In [17]:
target_gtf = target_dir / "minimal.V29.gtf"
with open(target_gtf, "wt") as outstream:
    # write out regular gtf records
    for i, row in gtf.iterrows():
        if row['gene_id'] in desired_genes:
            outstream.write(gff2table.format_gtf_record(row))
            outstream.write(os.linesep)

In [18]:
filtered_parser = gff2table.GFFParser()
filtered_parser.read_gff(target_gtf)
filtered_parser.gtf.shape

gtf.shape (2568904, 9)


(2568904, 24)

In [21]:
desired_genes

{'ENSG00000183032.11',
 'ENSG00000172113.9',
 'ENSG00000230773.7',
 'ENSG00000144306.14',
 'ENSG00000114841.17',
 'ENSG00000129467.13',
 'ENSG00000275969.2',
 'ENSG00000173575.20',
 'ENSG00000079308.18',
 'ENSG00000126759.13',
 'ENSG00000121749.15',
 'ENSG00000256742.1',
 'ENSG00000173214.5',
 'ENSG00000223430.1',
 'ENSG00000272297.2',
 'ENSG00000119718.10',
 'ENSG00000167615.16',
 'ENSG00000211650.2',
 'ENSG00000234956.6',
 'ENSG00000257366.1',
 'ENSG00000285079.1',
 'ENSG00000123352.17',
 'ENSG00000204682.5',
 'ENSG00000204814.5',
 'ENSG00000109625.18',
 'ENSG00000092096.16',
 'ENSG00000154027.18',
 'ENSG00000143951.15',
 'ENSG00000143867.6',
 'ENSG00000116095.10',
 'ENSG00000095383.19',
 'ENSG00000242553.1',
 'ENSG00000214686.5',
 'ENSG00000243566.6',
 'ENSG00000205089.7',
 'ENSG00000205693.3',
 'ENSG00000106327.12',
 'ENSG00000279151.1',
 'ENSG00000006459.10',
 'ENSG00000233081.1',
 'ENSG00000169683.7',
 'ENSG00000125895.5',
 'ENSG00000142632.16',
 'ENSG00000176472.10',
 'ENSG00000

In [25]:
long_id = [x for x in desired_genes if x.startswith('ENSG00000267749')]
long_id

[]

In [22]:
filtered_parser.gtf[filtered_parser.gtf['gene_id'] == 'ENSG00000259588.1'][['gene_id', 'gene_type', 'transcript_type']]

Unnamed: 0,gene_id,gene_type,transcript_type
