In [1]:
import gzip
import importlib
import os

import dask.dataframe as dd
import filetype

import openomics as oo
from openomics.database import *
from openomics.transcriptomics import *
from openomics.genomics import *
from openomics import MultiOmics

In [2]:
# Root folder of the tcga-assembler LUAD download; every input file below is
# resolved relative to it so the location only has to be changed in one place.
cohort_folder = "/data/datasets/Bioinformatics_ExternalData/tcga-assembler/LUAD/"
cohort_name = "LUAD"

luad_data = MultiOmics(cohort_name)
luad_data.add_clinical_data(
    clinical_data=os.path.join(cohort_folder, "clinical",
                               "nationwidechildrens.org_clinical_patient.txt"))

# Each omic is read from its own sub-folder. `columns` is the "<gene column>|TCGA"
# pattern passed to the loader and `genes_col_name` names the gene column in the file.
luad_data.add_omic(MessengerRNA(
    cohort_name,
    file_path=os.path.join(cohort_folder, "gene_exp", "geneExp.txt"),
    gene_index="gene_name",
    columns="GeneSymbol|TCGA", genes_col_name="GeneSymbol"))

luad_data.add_omic(MicroRNA(
    cohort_name,
    file_path=os.path.join(cohort_folder, "mirna", "miRNAExp__RPM.txt"),
    gene_index="gene_name",
    columns="GeneSymbol|TCGA", genes_col_name="GeneSymbol"))

luad_data.add_omic(LncRNA(
    cohort_name,
    file_path=os.path.join(cohort_folder, "lncrna", "TCGA-rnaexpr.tsv"),
    gene_index="gene_id",
    columns="Gene_ID|TCGA", genes_col_name="Gene_ID"))

# NOTE(review): gene_index="gene_id" is paired with the GeneSymbol column here,
# whereas the other GeneSymbol-keyed omics above use gene_index="gene_name".
# Left unchanged — confirm whether this is intended.
luad_data.add_omic(SomaticMutation(
    cohort_name,
    file_path=os.path.join(cohort_folder, "somatic", "somaticMutation_geneLevel.txt"),
    gene_index="gene_id",
    columns="GeneSymbol|TCGA", genes_col_name="GeneSymbol"))

# Called once, after every omic has been registered.
luad_data.build_samples()

MessengerRNA (576, 20472)
MicroRNA (494, 1870)
LncRNA (546, 12727)
SomaticMutation (587, 21070)


In [3]:
# NOTE(review): every statement in this cell is commented out, so running it is a no-op.
# The snippets assign to `self.`, which has no meaning at notebook top level; they
# look like they were copied from inside a class and would need adapting before
# being re-enabled — TODO confirm whether to port them or delete the cell.
# self.WSI = WholeSlideImage(cohort_name, os.path.join(cohort_folder, "wsi/"))

# file_path_SNP = os.path.join(cohort_folder, "somatic/", "somaticMutation_geneLevel.txt")
# self.SNP = SomaticMutation(cohort_name, file_path_SNP)
# file_path_DNA = os.path.join(cohort_folder, "dna/", "methylation_450.txt")
# self.DNA = DNAMethylation(cohort_name, file_path_DNA)
# file_path_CNV = os.path.join(cohort_folder, "cnv/", "copyNumber.txt")
# self.CNV = CopyNumberVariation(cohort_name, file_path_CNV)
# file_path_PRO = os.path.join(cohort_folder, "protein_rppa/", "protein_RPPA.txt")
# self.PRO = Protein(cohort_name, file_path_PRO)

# Import Datasets

In [8]:
# Genomic annotation & sequence databases, loaded from local copies.
# The local GENCODE load is kept disabled:
# gencode = GENCODE(path="/data/datasets/Bioinformatics_ExternalData/GENCODE/",
#                   import_sequences="shortest")

# The same RNAcentral folder is handed to both RNAcentral and MirBase.
rnacentral_dir = "/data/datasets/Bioinformatics_ExternalData/RNAcentral/"

rnacentral = RNAcentral(path=rnacentral_dir)
mirbase = MirBase(
    path="/data/datasets/Bioinformatics_ExternalData/mirbase/",
    RNAcentral_folder=rnacentral_dir,
)
ensembl = EnsemblGenes()

RNAcentral: ['index', 'RNAcentral id', 'database', 'transcript_id', 'species', 'RNA type', 'gene_name', 'go_id', 'Rfams']
MirBase: ['mirbase id', 'RNAcentral id', 'database', 'species', 'RNA type', 'gene name', 'gene_name']


  import sys


EnsemblGenes ['gene_id', 'gene_name', 'transcript_id', 'transcript_name', 'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length', 'gene_biotype', 'transcript_biotype', 'Rfams', 'go_id']


In [2]:
# Build MirBase directly from the public miRBase FTP location instead of a local
# copy. This rebinds the `mirbase` name used by the annotation cells further down.
mirbase = MirBase(
    path="ftp://mirbase.org/pub/mirbase/CURRENT/",
)

Fetching file from URL: ftp://mirbase.org/pub/mirbase/CURRENT/ aliases.txt.gz
Fetching file from URL: ftp://mirbase.org/pub/mirbase/CURRENT/ mature.fa.gz
Fetching file from URL: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ mirbase.tsv
MirBase: ['mirbase id', 'RNAcentral id', 'database', 'species', 'RNA type', 'gene name', 'gene_name']


In [3]:
# Build RNAcentral from the EBI FTP release. Each `file_resources` value is a
# location relative to `path` (the recorded output shows path + value being fetched).
rnacentral = RNAcentral(
    path="ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/",
    file_resources={
        "rnacentral_rfam_annotations.tsv": "go_annotations/rnacentral_rfam_annotations.tsv.gz",
        "gencode.tsv": "id_mapping/database_mappings/gencode.tsv",
    },
)


Fetching file from URL: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/go_annotations/rnacentral_rfam_annotations.tsv.gz
Fetching file from URL: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/gencode.tsv
RNAcentral: ['index', 'RNAcentral id', 'database', 'transcript_id', 'species', 'RNA type', 'gene_name', 'go_id', 'Rfams']


In [2]:
# Build GENCODE from the EBI FTP mirror, pinned to human release 32.
# `file_resources` maps the resource names to the release-specific files under `path`.
gencode = GENCODE(
    path="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/",
    file_resources={
        "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
        "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
        "transcripts.fa": "gencode.v32.transcripts.fa.gz",
    },
    import_sequences="shortest",
)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'ont']


GENCODE: ['index', 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'ont']


In [7]:
# Interaction databases, loaded from local copies.
lncbase = LncBase("/data/datasets/Bioinformatics_ExternalData/lncBase/")
# TargetScan is kept disabled:
# targetscan = TargetScan("/data/datasets/Bioinformatics_ExternalData/TargetScan/", )
mirtarbase = MiRTarBase(
    "/data/datasets/Bioinformatics_ExternalData/miRTarBase/",
    target_index="Target Gene",
)

LncBase ['geneId', 'geneName', 'mirna', 'species', 'cell_line', 'tissue', 'category', 'method', 'positive_negative', 'direct_indirect', 'condition']
Name: LncBase
Type: DiGraph
Number of nodes: 8212
Number of edges: 62321
Average in degree:   7.5890
Average out degree:   7.5890
Name: MiRTarBase
Type: DiGraph
Number of nodes: 17663
Number of edges: 380639
Average in degree:  21.5501
Average out degree:  21.5501


In [5]:
# disease association datasets

# Annotate Datasets

In [10]:
# Annotate the LncRNA omic: sequences from GENCODE, then attribute columns from
# GENCODE, Ensembl and RNAcentral, in that order.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: I/O operation on closed file."; the saved output does not show
# which call raised it — re-run on a fresh kernel to confirm.
lncrna_omic = luad_data.LncRNA

lncrna_omic.annotate_sequences(gencode, index="gene_id", omic="LncRNA")
lncrna_omic.annotate_genomics(
    gencode,
    index="gene_id",
    columns=['feature', 'start', 'end', 'strand', 'tag', 'havana_gene'],
)
lncrna_omic.annotate_genomics(
    database=ensembl,
    index='gene_id',
    columns=[
        'gene_name', 'transcript_id', 'transcript_name',
        'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
        'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype',
    ],
)
lncrna_omic.annotate_genomics(
    database=rnacentral,
    index='gene_name',
    columns=['Rfams', 'go_id', 'gene_name'],
)
lncrna_omic.annotations.info()

ValueError: I/O operation on closed file.

In [7]:
# Annotate the MicroRNA omic: miRBase ids first (this supplies the "RNAcentral id"
# column that the RNAcentral step indexes on), then RNAcentral, Ensembl, and
# finally the miRBase sequences.
# NOTE(review): in the recorded output the Ensembl-derived columns have only 2
# non-null rows and "Transcript sequence" has 0 — presumably the index values do
# not match those databases' keys; verify before relying on these columns.
mirna_omic = luad_data.MicroRNA

mirna_omic.annotate_genomics(
    database=mirbase,
    index="gene_name",
    columns=['mirbase id', 'RNAcentral id', 'database'],
)
mirna_omic.annotate_genomics(
    database=rnacentral,
    index="RNAcentral id",
    columns=['transcript_id', 'RNA type', 'go_id', 'Rfams'],
)
mirna_omic.annotate_genomics(
    database=ensembl,
    index='gene_name',
    columns=[
        'gene_name', 'transcript_id', 'transcript_name',
        'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
        'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype',
    ],
)
mirna_omic.annotate_sequences(mirbase, index="gene_name", omic="MIR")
mirna_omic.annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1870 entries, hsa-let-7a-1 to hsa-mir-99b
Data columns (total 15 columns):
RNAcentral id          1839 non-null object
mirbase id             1839 non-null object
database               1839 non-null object
transcript_id          487 non-null object
RNA type               487 non-null object
go_id                  487 non-null object
Rfams                  487 non-null object
transcript_name        2 non-null object
chromosome_name        2 non-null object
transcript_start       2 non-null object
transcript_end         2 non-null object
transcript_length      2 non-null object
gene_biotype           2 non-null object
transcript_biotype     2 non-null object
Transcript sequence    0 non-null float64
dtypes: float64(1), object(14)
memory usage: 233.8+ KB


In [8]:
# Number of miRNAs that received an Rfams annotation (non-null entries).
mirna_rfams = luad_data.MicroRNA.annotations["Rfams"]
mirna_rfams.notnull().sum()

487

In [None]:
# Annotate the mRNA omic with RNAcentral and Ensembl attributes, then GENCODE sequences.
# The omic is registered above as MessengerRNA (build output prints "MessengerRNA"),
# and the sibling cells access luad_data.LncRNA / luad_data.MicroRNA, so it is
# accessed here as luad_data.MessengerRNA rather than the stale `GE` attribute.
luad_data.MessengerRNA.annotate_genomics(
    database=rnacentral,
    index="gene_name",
    columns=['gene_name', 'transcript_id', 'RNA type', 'go_id', 'Rfams'],
)
luad_data.MessengerRNA.annotate_genomics(
    database=ensembl,
    index='gene_name',
    columns=[
        'gene_id', 'transcript_id', 'transcript_name',
        'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
        'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype',
    ],
)
# NOTE(review): omic="GE" is kept as written; the LncRNA cell passes the class name
# (omic="LncRNA"), so this may need to be "MessengerRNA" — confirm against the
# installed openomics version.
luad_data.MessengerRNA.annotate_sequences(gencode, index="gene_name", omic="GE")
luad_data.MessengerRNA.annotations.info()

In [8]:
# Preview the mRNA annotations. The omic is registered as MessengerRNA (see the
# build output and the luad_data.LncRNA / luad_data.MicroRNA accessors), and only
# the first rows are shown instead of dumping the whole annotation table.
luad_data.MessengerRNA.annotations.head(10)

Unnamed: 0_level_0,transcript_id,RNA type,go_id,Rfams,gene_id,transcript_id_,transcript_name,chromosome_name,transcript_start,transcript_end,transcript_length,Rfams_,go_id_,gene_biotype,transcript_biotype,Transcript sequence
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1BG,ENST00000596924|ENST00000263100|ENST0000059501...,,GO:0005576|GO:0005615|GO:0070062|GO:0003674|GO...,,ENSG00000121410,ENST00000596924|ENST00000263100|ENST0000059501...,A1BG-203|A1BG-201|A1BG-202|A1BG-204|A1BG-205,19,58345178|58346850|58346858|58346860|58350594,58347634|58353499|58353491|58347657|58353129,2134|1722|2301|475|917,,GO:0005576|GO:0005615|GO:0070062|GO:0003674|GO...,protein_coding,lncRNA|protein_coding|retained_intron,[CTCCCAGGCCTCAGCTCCGGGCGACGTGGAGTGGGGCGGTCCTGG...
A1BG-AS1,ENST00000594950|ENST00000593374|ENST0000059972...,,,,ENSG00000268895,ENST00000594950|ENST00000593374|ENST0000059972...,A1BG-AS1-203|A1BG-AS1-201|A1BG-AS1-205|A1BG-AS...,19,58347751|58347752|58347753|58347787|58347789|5...,58355183|58354425|58354039|58354502|58354724|5...,1718|959|559|977|712|1056|2130|1869|2412,,,lncRNA,lncRNA,[TCATCTTCCCTGTCTTGTTCCTGGGCGCAGAGGGCTCCTCCGGCG...
A1CF,ENST00000374001|ENST00000373993|ENST0000037399...,,GO:0003676|GO:0003727|GO:0016556|GO:0030895|GO...,,ENSG00000148584,ENST00000374001|ENST00000373993|ENST0000037399...,A1CF-205|A1CF-202|A1CF-204|A1CF-203|A1CF-210|A...,10,50799409|50806562|50811044|50828131|50836074|5...,50885675|50859985|50885627|50828461,9221|1997|2211|926|813|885|9517|9350|9400,,GO:0003676|GO:0003727|GO:0016556|GO:0030895|GO...,protein_coding,protein_coding|lncRNA,[TTTGATATGACGATTAGAGCATAACCCGAGTGACACGTTGAATTC...
A2M,ENST00000543436|ENST00000495442|ENST0000049570...,,GO:0005576|GO:0005615|GO:0004866|GO:0010951|GO...,,ENSG00000175899,ENST00000543436|ENST00000495442|ENST0000049570...,A2M-211|A2M-206|A2M-207|A2M-201|A2M-212|A2M-21...,12,9067664|9067712|9072433|9077769|9079981|909139...,9091412|9068955|9069078|9116157|9106296|908039...,600|729|509|4844|471|563|546|593|692|590|623|533,,GO:0005576|GO:0005615|GO:0004866|GO:0010951|GO...,protein_coding,lncRNA|retained_intron|protein_coding|nonsense...,[AGGTAGGAGTAACAGTCCCTGACACCATCACCGAGTGGAAGGCAG...
A2ML1,ENST00000299698|ENST00000537546|ENST0000054145...,,GO:0005576|GO:0005615|GO:0004866|GO:0010951|GO...,,ENSG00000166535,ENST00000299698|ENST00000537546|ENST0000054145...,A2ML1-201|A2ML1-204|A2ML1-207|A2ML1-202|A2ML1-...,12,8822621|8823513|8843236|8844954|8845004|884528...,8876787|8834935|8875814|8846124|8876456|884890...,5127|560|3370|462|3460|570|592|464|1136,,GO:0005576|GO:0005615|GO:0004866|GO:0010951|GO...,protein_coding,protein_coding|retained_intron|lncRNA|nonsense...,[GTCCACCACCTAACCTGGTGGTTACAGAAGGAGGAACACAGTCCT...
A4GALT,ENST00000642412|ENST00000401850|ENST0000038127...,,GO:0016020|GO:0016021|GO:0016740|GO:0016757|GO...,,ENSG00000128274,ENST00000642412|ENST00000401850|ENST0000038127...,A4GALT-206|A4GALT-203|A4GALT-202|A4GALT-205|A4...,22,42692121|42692122|42693971|42695171|42718246,42720870|42694986|42720819|42720829|42721298|4...,2092|2321|1956|546|595|407|2019,,GO:0016020|GO:0016021|GO:0016740|GO:0016757|GO...,protein_coding,protein_coding|lncRNA,[GGGCCGGAGGGGCGGTGCTGCCTCCCGCCGGGCCCCAGGCACTGC...
A4GNT,ENST00000236709,,GO:0016020|GO:0016021|GO:0016740|GO:0016757|GO...,,ENSG00000118017,ENST00000236709,A4GNT-201,3,138123718,138132387,1771,,GO:0016020|GO:0016021|GO:0016740|GO:0016757|GO...,protein_coding,protein_coding,[ATTTCTAAGACCTCAAATACTGGTTAACTGCATTTGCAGCTAGAA...
AAAS,ENST00000209873|ENST00000552876|ENST0000039438...,,GO:0005515|GO:0051028|GO:0005634|GO:0005737|GO...,,ENSG00000094914,ENST00000209873|ENST00000552876|ENST0000039438...,AAAS-201|AAAS-220|AAAS-202|AAAS-217|AAAS-212|A...,12,53307460|53307457|53307459|53307490|53307738|5...,53321610|53321257|53321596|53324758|53314815|5...,1815|2016|1703|1652|1075|900|769|582|832|827|9...,,GO:0005515|GO:0051028|GO:0005634|GO:0005737|GO...,protein_coding,protein_coding|retained_intron|lncRNA,[TTTCCCGTTAGTCTTTTCTTCACTTCCGTTGAGTTCCGCCTCGCC...
AACS,ENST00000316519|ENST00000418937|ENST0000053747...,,GO:0003824|GO:0006629|GO:0030729|GO:0005737|GO...,,ENSG00000081760,ENST00000316519|ENST00000418937|ENST0000053747...,AACS-201|AACS-204|AACS-207|AACS-208|AACS-205|A...,12,125065435|125065434|125091013|125092480|125100...,125143316|125107162|125107154|125107268|125125...,3256|1235|429|573|2536|3115|555|1757|6646|2922...,,GO:0003824|GO:0006629|GO:0030729|GO:0005737|GO...,protein_coding,protein_coding|nonsense_mediated_decay|lncRNA|...,[GGCCTGCGGGGCGGGGCCTGGGCCAAGCGGCCCGCAGGAGGCGGC...
AACSP1,ENST00000503486|ENST00000521412,,,,ENSG00000250420,ENST00000503486|ENST00000521412,AACSP1-201|AACSP1-202,5,178764861|178767204,178818435|178797611,2822|1095,,,transcribed_unprocessed_pseudogene,lncRNA|transcribed_unprocessed_pseudogene,[GGAGATGGGATCAGCCTGCGTGCCACGCGCACTGTGAGCAGCAGG...
