In [None]:
####################################################################################################

#    Copyright 2019 Srijan Verma and EMBL-European Bioinformatics Institute

#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at

#        http://www.apache.org/licenses/LICENSE-2.0

#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

####################################################################################################

## Loading all gene IDs from the csv file

In [1]:
import csv 
import pandas as pd
import numpy as np
import json

In [2]:
path = '/all_gene_data/all_gene_ids.csv'
df = pd.read_csv(path)

# The 'All gene ID' column as a plain Python list, in file order.
gene_ids = df['All gene ID'].tolist()

# cleaned_ids refers to the same list object as gene_ids (no copy is made).
cleaned_ids = gene_ids

## Calculating the time taken to open the 1.05 GB JSON file

## The 'read_content' variable (assigned in the cell below) contains the JSON response that was received

In [3]:
%%time
with open("/merged_all_gene_data(dict).json") as access_json:
    read_content = json.load(access_json)

CPU times: user 14.2 s, sys: 5.61 s, total: 19.8 s
Wall time: 24.2 s


## Get all gene IDs having biotype as 'lincRNA'

In [4]:
lincrna_ids = []

In [5]:
def get_all_lincrna_ids():
    """Fill ``lincrna_ids`` with every ID in ``cleaned_ids`` whose biotype is 'lincRNA'.

    Reads the notebook-level ``cleaned_ids`` (the IDs to scan) and
    ``read_content`` (ID -> record mapping) and stores the matching IDs, in
    ``cleaned_ids`` order, in the notebook-level list ``lincrna_ids``.

    The result list is overwritten in place instead of appended to, so running
    the cell more than once does not duplicate every ID.  IDs whose record is
    ``None`` are skipped instead of raising ``TypeError``.

    Raises:
        KeyError: if an ID is absent from ``read_content`` or its record has
            no 'biotype' key.
    """
    matches = []
    for gene_id in cleaned_ids:
        record = read_content[gene_id]
        # A null record carries no biotype, so it can never be a match.
        if record is not None and record['biotype'] == 'lincRNA':
            matches.append(gene_id)
    # Slice-assign so every existing reference to the list sees the new content.
    lincrna_ids[:] = matches


In [6]:
get_all_lincrna_ids()
len(lincrna_ids)

7690

## Loading all lincRNA IDs from csv file to a list

In [7]:
cleaned_ids = []
path = '/all_lincRNA_data/all_lincrna_ids.csv'

In [8]:
df = pd.read_csv(path)
cleaned_ids = df['All lincRNA ID'].tolist()

## Total lincRNA gene IDs = 7690

In [9]:
len(cleaned_ids)

7690

In [10]:
from tqdm import tqdm
gene_ids = []
null_gene_ids = []

In [11]:
def get_gene_data():
    """Split the IDs in ``cleaned_ids`` by whether ``read_content`` holds data for them.

    IDs whose record is not ``None`` are stored, in order, in the
    notebook-level list ``gene_ids``.  IDs whose record is ``None`` are stored
    in ``null_gene_ids``, each wrapped in a one-element list (the shape the
    original code produced).

    Both result lists are overwritten in place instead of appended to, so
    running the cell again does not duplicate entries.

    Raises:
        KeyError: if an ID is absent from ``read_content``.
    """
    # Snapshot the input first: a later cell binds cleaned_ids to the very
    # same list object as gene_ids, and the results must not feed back into
    # the sequence being scanned.
    pending = list(cleaned_ids)

    present = []
    missing = []
    for gene_id in tqdm(pending):
        if read_content[gene_id] is None:
            missing.append([gene_id])
        else:
            present.append(gene_id)

    # Slice-assign so every existing reference to these lists sees the result.
    gene_ids[:] = present
    null_gene_ids[:] = missing

In [12]:
get_gene_data()

100%|██████████| 7690/7690 [00:00<00:00, 508124.17it/s]


## 'null_gene_ids' variable contains the IDs having null values

In [13]:
print('No. of Null IDs are {0}'.format(len(null_gene_ids)))
print('Null IDs are :')
# One entry per line; nothing further is printed when the list is empty.
for null_entry in null_gene_ids:
    print(null_entry)

No. of Null IDs are 0
Null IDs are :


In [14]:
# Continue with only the IDs that get_gene_data() found data for.  Copy the
# list so cleaned_ids and gene_ids are independent objects: with a plain
# assignment, anything appended to gene_ids afterwards (for example by running
# get_gene_data() again) would also grow cleaned_ids.
cleaned_ids = list(gene_ids)
print(len(cleaned_ids))

7690


In [17]:
transcript_strand = []
transcript_seq_region_name = []

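## The function below [get_transcript_data()] extracts 'transcript' data. Data extracted are:
1. transcript strand
2. transcript seq region name

(The transcript id, start, end and biotype are not extracted here; they are already columns of the transcript CSV loaded further below.)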

In [18]:
def get_transcript_data():
    """Collect the strand and seq_region_name of every transcript of every gene.

    Walks ``cleaned_ids`` in order and, for each gene, the entries of its
    'Transcript' list in ``read_content``.  The values are stored in the
    notebook-level lists ``transcript_strand`` and
    ``transcript_seq_region_name``, one element per transcript.

    Both lists are overwritten in place instead of appended to.  Later cells
    assign them as DataFrame columns, which requires the length to equal the
    number of rows; with appending, a second run of this cell would double
    the lists and break that assignment.

    Raises:
        KeyError: if a gene ID is absent from ``read_content`` or a record
            lacks 'Transcript', 'strand' or 'seq_region_name'.
    """
    strands = []
    regions = []
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            strands.append(transcript['strand'])
            regions.append(transcript['seq_region_name'])

    # Slice-assign so every existing reference to these lists sees the result.
    transcript_strand[:] = strands
    transcript_seq_region_name[:] = regions

In [19]:
get_transcript_data()

In [20]:
print(len(transcript_strand))
print(len(transcript_seq_region_name))

13666
13666


In [23]:
exon_strand = []
exon_seq_region_name = []

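## The function below [get_exon_data()] extracts 'exon' data. Data extracted are:
1. exon strand
2. exon seq region name

(The exon id, start and end are not extracted here; they are already columns of the exon CSV loaded further below.)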

In [24]:
def get_exon_data():
    """Collect the strand and seq_region_name of every exon of every transcript.

    Walks ``cleaned_ids`` in order, then each entry of the gene's 'Transcript'
    list in ``read_content``, then each entry of that transcript's "Exon"
    list.  The values are stored in the notebook-level lists ``exon_strand``
    and ``exon_seq_region_name``, one element per exon.

    Both lists are overwritten in place instead of appended to.  Later cells
    assign them as DataFrame columns, which requires the length to equal the
    number of rows; with appending, a second run of this cell would double
    the lists and break that assignment.

    Raises:
        KeyError: if a gene ID is absent from ``read_content`` or a record
            lacks 'Transcript', "Exon", 'strand' or 'seq_region_name'.
    """
    strands = []
    regions = []
    for gene_id in tqdm(cleaned_ids):
        for transcript in read_content[gene_id]['Transcript']:
            for exon in transcript["Exon"]:
                strands.append(exon['strand'])
                regions.append(exon['seq_region_name'])

    # Slice-assign so every existing reference to these lists sees the result.
    exon_strand[:] = strands
    exon_seq_region_name[:] = regions
    

In [25]:
get_exon_data()

100%|██████████| 7690/7690 [00:00<00:00, 55789.79it/s]


In [26]:
print(len(exon_seq_region_name))
print(len(exon_strand))

43789
43789


# Transcript data correction

In [27]:
import pandas as pd
df = pd.read_csv('/all_lincRNA_data/all_lincrna_transcript_data.csv', index_col=0)


In [28]:
df.head(3)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Transcript Start,Transcript End,Transcript Length,No. of Exons
0,1,ENSG00000115934,ENST00000538297,lincRNA,23181334,23251499,1329,2
1,2,ENSG00000122043,ENST00000400540,lincRNA,29935905,29950488,1031,6
2,3,ENSG00000122043,ENST00000481738,lincRNA,29937906,29942567,575,2


In [29]:
# The lists are matched to rows by position only, so they must be in the same
# order as the rows of the CSV.
# NOTE(review): nothing here checks that order against 'Transcript ID'.
df = df.assign(**{
    'Strand': transcript_strand,
    'Seq region Name': transcript_seq_region_name,
})

In [30]:
df.head(4)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Transcript Start,Transcript End,Transcript Length,No. of Exons,Strand,Seq region Name
0,1,ENSG00000115934,ENST00000538297,lincRNA,23181334,23251499,1329,2,-1,12
1,2,ENSG00000122043,ENST00000400540,lincRNA,29935905,29950488,1031,6,1,13
2,3,ENSG00000122043,ENST00000481738,lincRNA,29937906,29942567,575,2,1,13
3,4,ENSG00000122548,ENST00000242109,lincRNA,26533121,26538788,4283,2,-1,7


In [31]:
# Place the two new columns directly after 'Biotype'; the remaining columns
# keep their relative order.
df = df.loc[:, [
    'SNO',
    'Gene ID',
    'Transcript ID',
    'Biotype',
    'Strand',
    'Seq region Name',
    'Transcript Start',
    'Transcript End',
    'Transcript Length',
    'No. of Exons',
]]

In [32]:
df.head(3)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Strand,Seq region Name,Transcript Start,Transcript End,Transcript Length,No. of Exons
0,1,ENSG00000115934,ENST00000538297,lincRNA,-1,12,23181334,23251499,1329,2
1,2,ENSG00000122043,ENST00000400540,lincRNA,1,13,29935905,29950488,1031,6
2,3,ENSG00000122043,ENST00000481738,lincRNA,1,13,29937906,29942567,575,2


In [33]:
len(df['SNO'])

13666

In [34]:
df.to_csv('/all_lincRNA_data/all_lincrna_transcript_data.csv')

# Exon data correction below

In [35]:
import pandas as pd
df = pd.read_csv('/all_lincRNA_data/all_lincrna_exon_data.csv', index_col=0)

In [36]:
len(df)

43789

In [37]:
# The lists are matched to rows by position only, so they must be in the same
# order as the rows of the CSV.
# NOTE(review): nothing here checks that order against 'Exon ID'.
df = df.assign(**{
    'Strand': exon_strand,
    'Seq region Name': exon_seq_region_name,
})

In [38]:
df.head(3)

Unnamed: 0,SNO,Gene ID,Transcript ID,Exon ID,Exon Start,Exon End,Exon Length,Strand,Seq region Name
0,1,ENSG00000115934,ENST00000538297,ENSE00002222490,23251461,23251499,39,-1,12
1,2,ENSG00000115934,ENST00000538297,ENSE00002300099,23181334,23182623,1290,-1,12
2,3,ENSG00000122043,ENST00000400540,ENSE00001543395,29935905,29936224,320,1,13


In [39]:
# Place the two new columns directly after 'Exon ID'; the remaining columns
# keep their relative order.
df = df.loc[:, [
    'SNO',
    'Gene ID',
    'Transcript ID',
    'Exon ID',
    'Strand',
    'Seq region Name',
    'Exon Start',
    'Exon End',
    'Exon Length',
]]

In [40]:
df.head(2)

Unnamed: 0,SNO,Gene ID,Transcript ID,Exon ID,Strand,Seq region Name,Exon Start,Exon End,Exon Length
0,1,ENSG00000115934,ENST00000538297,ENSE00002222490,-1,12,23251461,23251499,39
1,2,ENSG00000115934,ENST00000538297,ENSE00002300099,-1,12,23181334,23182623,1290


In [41]:
df.to_csv('/all_lincRNA_data/all_lincrna_exon_data.csv')

# Correcting sequence data

In [42]:
import pandas as pd
df = pd.read_csv('/all_lincRNA_data/all_lincrna_transcript_data_with_sequences.csv', index_col=0)


In [43]:
df.head(3)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Transcript Start,Transcript End,Transcript Length,No. of Exons,Length of Sequences,| Seq_len - Trans_len |,Match ?,Sequences
0,1,ENSG00000115934,ENST00000538297,lincRNA,23181334,23251499,1329,2,1329,0,YES,CCCGCCATGATCGTGAGGCCTCCCCAGCGATGTGGAACTGCTGGAG...
1,2,ENSG00000122043,ENST00000400540,lincRNA,29935905,29950488,1031,6,1031,0,YES,ATAGAGGTGTTTGGGGTGTCAGCTCTGAAATGCTAAAAATAAAACC...
2,3,ENSG00000122043,ENST00000481738,lincRNA,29937906,29942567,575,2,575,0,YES,GCAGAGAACAAAGATTGGTGGCTTCCTCCTGAGCACACTGGGATGT...


In [44]:
# Reuses the per-transcript lists for the sequences CSV.  The lists are matched
# to rows by position only, so they must be in the same order as the rows.
# NOTE(review): nothing here checks that order against 'Transcript ID'.
df = df.assign(**{
    'Strand': transcript_strand,
    'Seq region Name': transcript_seq_region_name,
})

In [45]:
df.head(3)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Transcript Start,Transcript End,Transcript Length,No. of Exons,Length of Sequences,| Seq_len - Trans_len |,Match ?,Sequences,Strand,Seq region Name
0,1,ENSG00000115934,ENST00000538297,lincRNA,23181334,23251499,1329,2,1329,0,YES,CCCGCCATGATCGTGAGGCCTCCCCAGCGATGTGGAACTGCTGGAG...,-1,12
1,2,ENSG00000122043,ENST00000400540,lincRNA,29935905,29950488,1031,6,1031,0,YES,ATAGAGGTGTTTGGGGTGTCAGCTCTGAAATGCTAAAAATAAAACC...,1,13
2,3,ENSG00000122043,ENST00000481738,lincRNA,29937906,29942567,575,2,575,0,YES,GCAGAGAACAAAGATTGGTGGCTTCCTCCTGAGCACACTGGGATGT...,1,13


In [46]:
# Place the two new columns directly after 'Biotype'; the remaining columns
# keep their relative order.
df = df.loc[:, [
    'SNO',
    'Gene ID',
    'Transcript ID',
    'Biotype',
    'Strand',
    'Seq region Name',
    'Transcript Start',
    'Transcript End',
    'Transcript Length',
    'No. of Exons',
    'Length of Sequences',
    '| Seq_len - Trans_len |',
    'Match ?',
    'Sequences',
]]

In [47]:
df.head(2)

Unnamed: 0,SNO,Gene ID,Transcript ID,Biotype,Strand,Seq region Name,Transcript Start,Transcript End,Transcript Length,No. of Exons,Length of Sequences,| Seq_len - Trans_len |,Match ?,Sequences
0,1,ENSG00000115934,ENST00000538297,lincRNA,-1,12,23181334,23251499,1329,2,1329,0,YES,CCCGCCATGATCGTGAGGCCTCCCCAGCGATGTGGAACTGCTGGAG...
1,2,ENSG00000122043,ENST00000400540,lincRNA,1,13,29935905,29950488,1031,6,1031,0,YES,ATAGAGGTGTTTGGGGTGTCAGCTCTGAAATGCTAAAAATAAAACC...


In [48]:
df.to_csv('/all_lincRNA_data/all_lincrna_transcript_data_with_sequences.csv')