In [None]:
####################################################################################################

#    Copyright 2019 Srijan Verma and EMBL-European Bioinformatics Institute

#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at

#        http://www.apache.org/licenses/LICENSE-2.0

#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

####################################################################################################

## Loading all gene IDs from the csv file

In [1]:
import csv 
import pandas as pd
import numpy as np
import json

In [2]:
# Read the master gene-ID table and keep its 'All gene ID' column as a plain list.
path = '/all_gene_data/all_gene_ids.csv'
df = pd.read_csv(path)

# Both names are bound to the same list object (no copy is made).
cleaned_ids = gene_ids = df.loc[:, 'All gene ID'].tolist()

## Measuring the time taken to open the 1.05 GB JSON file

## The `read_content` variable (defined in the cell below) holds the JSON response loaded from the file

In [3]:
%%time
# Parse the merged gene dump into `read_content`.
# The encoding is pinned to UTF-8 (the interchange encoding for JSON) so the result
# does not depend on the platform's default text encoding.
with open("/merged_all_gene_data(dict).json", encoding="utf-8") as access_json:
    read_content = json.load(access_json)

CPU times: user 12.9 s, sys: 5.86 s, total: 18.8 s
Wall time: 21.7 s


## Get all gene IDs having biotype as 'lincRNA'

In [4]:
# Accumulator for the gene IDs whose biotype is 'lincRNA'; filled by get_all_lincrna_ids().
lincrna_ids = list()

In [5]:
def get_all_lincrna_ids():
    """Append every ID in ``cleaned_ids`` whose record has biotype 'lincRNA' to ``lincrna_ids``.

    Reads the module-level ``cleaned_ids`` (IDs to scan) and ``read_content``
    (mapping of gene ID -> record) and appends the matching IDs, in scan order,
    to the module-level ``lincrna_ids``. Returns nothing.

    IDs whose record is ``None`` are skipped instead of raising ``TypeError``;
    get_gene_data() already treats ``None`` records as an expected "null ID" case.

    Raises:
        KeyError: if an ID is missing from ``read_content`` or a record has no
            'biotype' key.

    Note:
        The function only appends, so calling it again without resetting
        ``lincrna_ids`` stores every match a second time.
    """
    for gene_id in cleaned_ids:
        record = read_content[gene_id]
        if record is None:
            # Null record: nothing to classify.
            continue
        if record['biotype'] == 'lincRNA':
            lincrna_ids.append(gene_id)


In [6]:
# Populate lincrna_ids, then show how many IDs were collected.
# NOTE: the function appends, so re-running this cell without first resetting
# lincrna_ids adds the same IDs again.
get_all_lincrna_ids()
len(lincrna_ids)

7690

## Saving lincRNA gene IDs in a csv file

In [32]:
import csv 
# Destination file and column titles for the lincRNA ID table.
path = '/all_lincRNA_data/all_lincrna_ids.csv'
header = ['SNO', 'All lincRNA ID']

# Create (or truncate) the file so that it holds only the header row.
with open(path, 'wt', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(header)


In [33]:
# 1-based serial numbers, one per collected lincRNA ID.
s_no = list(range(1, len(lincrna_ids) + 1))

In [34]:
import pandas as pd

# Re-read the header-only file so the frame carries the CSV's column names,
# then fill the columns by position.
df = pd.read_csv(path)

df[df.columns[0]] = s_no
df[df.columns[1]] = lincrna_ids

# index=False: 'SNO' already numbers the rows. Without it pandas writes an extra
# unnamed index column, which comes back as 'Unnamed: 0' when the file is re-read.
df.to_csv(path, index=False)

## Loading all lincRNA IDs from csv file to a list

In [7]:
# Point at the lincRNA ID file and start from an empty ID list.
path = '/all_lincRNA_data/all_lincrna_ids.csv'
cleaned_ids = list()

In [8]:
# Load the lincRNA table and take its 'All lincRNA ID' column as a plain list.
df = pd.read_csv(path)
cleaned_ids = df.loc[:, 'All lincRNA ID'].tolist()

## Total lincRNA gene IDs = 7690

In [9]:
# Number of lincRNA IDs read back from the CSV.
len(cleaned_ids)

7690

In [10]:
# Accumulators filled by get_gene_data(): one entry per gene with a non-null record,
# plus null_gene_ids for the IDs whose record is null. Each name gets its own list.
(gene_display_name, gene_start, gene_end, gene_strand,
 gene_seq_region_name, null_gene_ids, gene_ids, gene_biotype) = ([] for _ in range(8))

## The function below, `get_gene_data()`, extracts 'gene' data. The extracted fields are:
1. gene display_name
2. gene start
3. gene end
4. gene strand
5. gene seq_region_name
6. gene biotype

In [11]:
def get_gene_data():
    """Collect the gene-level fields for every ID in ``cleaned_ids``.

    For each ID the record is looked up in the module-level ``read_content``:

    * if the record is ``None`` the ID is appended to ``null_gene_ids`` wrapped
      in a single-element list (``[gene_id]``);
    * otherwise the ID is appended to ``gene_ids`` and the record's
      'display_name', 'start', 'end', 'strand', 'seq_region_name' and 'biotype'
      values are appended to the matching ``gene_*`` lists, so those lists stay
      index-aligned with ``gene_ids``.

    Returns nothing; all results are written to module-level lists.

    Raises:
        KeyError: if an ID is missing from ``read_content`` or a record lacks
            one of the fields listed above.
    """
    # Iterate over a snapshot: a later cell binds cleaned_ids to the gene_ids
    # list itself, and this loop appends to gene_ids.
    for gene_id in list(cleaned_ids):
        record = read_content[gene_id]
        if record is None:
            null_gene_ids.append([gene_id])
            continue

        gene_ids.append(gene_id)
        gene_display_name.append(record['display_name'])
        gene_start.append(record['start'])
        gene_end.append(record['end'])
        gene_strand.append(record['strand'])
        gene_seq_region_name.append(record['seq_region_name'])
        gene_biotype.append(record['biotype'])

In [12]:
# Fill the gene_* lists, gene_ids and null_gene_ids from read_content.
get_gene_data()

## 'null_gene_ids' variable contains the IDs having null values

In [13]:
# Report how many IDs had a null record, then list them one per line.
print('No. of Null IDs are {0}'.format(len(null_gene_ids)))
print('Null IDs are :')
for null_entry in null_gene_ids:
    print(null_entry)

No. of Null IDs are 0
Null IDs are :


In [10]:
# Print the size of each gene-level list; the sizes are expected to agree because
# get_gene_data() appends to all of them together.
for template, values in (
    ('No. of contents of gene_start is {0}', gene_start),
    ('No. of contents of gene_end is {0}', gene_end),
    ('No. of contents of gene_strand is {0}', gene_strand),
    ('No. of contents of gene_seq_region_name is {0}', gene_seq_region_name),
    ('No. of contents of gene_display_name is {0}', gene_display_name),
):
    print(template.format(len(values)))

In [14]:
# Continue with only the IDs that had a non-null record.
# Take a copy: binding cleaned_ids to the gene_ids object itself would make
# get_gene_data() append to the very list it iterates over if it were called again.
cleaned_ids = list(gene_ids)
print(len(cleaned_ids))

7690


In [15]:
# Filled by get_no_of_transcripts(): a transcript count per gene, and the owning
# gene ID repeated once per transcript.
no_of_transcripts, gene_ids_for_transcripts = [], []

## The function below, `get_no_of_transcripts()`, counts the number of transcripts of each gene

In [16]:
def get_no_of_transcripts():
    """Record, for every gene in ``cleaned_ids``, how many transcripts it has.

    Appends the length of each gene's 'Transcript' list (taken from the
    module-level ``read_content``) to ``no_of_transcripts``, and appends the
    gene ID to ``gene_ids_for_transcripts`` once per transcript, so the latter
    has one entry per transcript. Returns nothing.
    """
    for gene_id in cleaned_ids:
        transcript_total = len(read_content[gene_id]['Transcript'])
        no_of_transcripts.append(transcript_total)
        gene_ids_for_transcripts.extend([gene_id] * transcript_total)

In [17]:
# Fill no_of_transcripts and gene_ids_for_transcripts.
get_no_of_transcripts()

In [18]:
# Parallel per-transcript lists filled by get_transcript_data().
transcript_id, transcript_start, transcript_end = [], [], []
transcript_biotype, transcript_strand, transcript_seq_region_name = [], [], []

## The function below, `get_transcript_data()`, extracts 'transcript' data. The extracted fields are:
1. transcript id
2. transcript start
3. transcript end
4. transcript biotype
5. transcript strand
6. transcript seq_region_name

In [19]:
def get_transcript_data():
    """Collect the per-transcript fields of every gene in ``cleaned_ids``.

    Walks each gene's 'Transcript' list in the module-level ``read_content`` and
    appends the transcript's 'id', 'start', 'end', 'biotype', 'strand' and
    'seq_region_name' values to the matching ``transcript_*`` lists, producing
    one entry per transcript in gene order. Returns nothing.
    """
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            transcript_id.append(transcript['id'])
            transcript_start.append(transcript['start'])
            transcript_end.append(transcript['end'])
            transcript_biotype.append(transcript['biotype'])
            transcript_strand.append(transcript['strand'])
            transcript_seq_region_name.append(transcript['seq_region_name'])

In [20]:
# Fill the transcript_* lists.
get_transcript_data()

In [21]:
# Print the size of each transcript-level list, one per line; each list holds
# one entry per transcript, so the four numbers are expected to match.
for column in (transcript_id, transcript_start, transcript_end, gene_ids_for_transcripts):
    print(len(column))

In [None]:
# Exon count of the first transcript of the first gene in cleaned_ids.
len(read_content[cleaned_ids[0]]['Transcript'][0]["Exon"])

In [41]:
# Filled by get_no_of_exons(): an exon count per transcript, and the owning
# transcript ID repeated once per exon.
no_of_exons, transcript_ids_for_exons = list(), list()

## The function below, `get_no_of_exons()`, counts the number of exons of each transcript

In [42]:
def get_no_of_exons():
    """Record, for every transcript of every gene in ``cleaned_ids``, its exon count.

    Appends the length of each transcript's "Exon" list (from the module-level
    ``read_content``) to ``no_of_exons``, and appends the transcript's 'id' to
    ``transcript_ids_for_exons`` once per exon, so the latter has one entry per
    exon. Returns nothing.
    """
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            exons = transcript["Exon"]
            no_of_exons.append(len(exons))
            # Generator form: 'id' is only looked up when there is at least one exon.
            transcript_ids_for_exons.extend(transcript['id'] for _ in exons)

In [None]:
# Exon count of the first transcript of the first gene in cleaned_ids.
len(read_content[cleaned_ids[0]]['Transcript'][0]["Exon"])

In [43]:
# Fill no_of_exons and transcript_ids_for_exons.
get_no_of_exons()

In [24]:
# Total number of exons across all transcripts.
sum(no_of_exons)

43789

In [44]:
# One transcript ID is stored per exon, so this should equal sum(no_of_exons).
len(transcript_ids_for_exons)

43789

In [None]:
# Inspect the first exon record of the first transcript of the first gene.
read_content[cleaned_ids[0]]['Transcript'][0]["Exon"][0]

In [26]:
# Parallel per-exon lists filled by get_exon_data(). Each name gets its own list.
(exon_id, exon_start, exon_end,
 gene_ids_for_exons, exon_strand, exon_seq_region_name) = ([] for _ in range(6))

## The function below, `get_exon_data()`, extracts 'exon' data. The extracted fields are:
1. exon id
2. exon start
3. exon end
4. Exon strand
5. Exon seq region name

In [27]:
def get_exon_data():
    """Collect the per-exon fields of every transcript of every gene in ``cleaned_ids``.

    Walks gene -> 'Transcript' -> "Exon" in the module-level ``read_content`` and,
    for each exon, appends its 'id', 'start', 'end', 'strand' and
    'seq_region_name' values to the matching ``exon_*`` lists and the owning
    gene ID to ``gene_ids_for_exons``. Returns nothing.
    """
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            for exon in transcript["Exon"]:
                exon_id.append(exon['id'])
                exon_start.append(exon['start'])
                exon_end.append(exon['end'])
                exon_strand.append(exon['strand'])
                exon_seq_region_name.append(exon['seq_region_name'])
                gene_ids_for_exons.append(gene_id)
    

In [28]:
# Fill the exon_* lists and gene_ids_for_exons.
get_exon_data()

In [30]:
# Number of exon rows collected.
len(exon_id)

In [None]:
# One gene ID is stored per exon row, so this should match len(exon_id).
len(gene_ids_for_exons)

In [29]:
# Filled by get_transcript_length(): one summed exon length per transcript.
transcript_len = list()

## The function below, `get_transcript_length()`, calculates the length of each transcript

In [30]:
def get_transcript_length():
    """Append the length of every transcript to ``transcript_len``.

    A transcript's length is the sum over its exons of ``end - start + 1``
    (coordinates are treated as inclusive). Transcripts are visited gene by gene
    in ``cleaned_ids`` order using the module-level ``read_content``; a
    transcript without exons contributes 0. Returns nothing.
    """
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            exon_spans = [(exon['start'], exon['end']) for exon in transcript["Exon"]]
            transcript_len.append(sum(end - start + 1 for start, end in exon_spans))
                
        

In [None]:
# Number of transcript rows collected.
len(transcript_id)

In [31]:
# Fill transcript_len with one summed exon length per transcript.
get_transcript_length()

In [None]:
# One length is stored per transcript, so this should match len(transcript_id).
len(transcript_len)

In [None]:
# Length recorded for the last transcript.
transcript_len[-1]

In [None]:
# ID of the last transcript collected.
transcript_id[-1]

In [32]:
# Filled by get_exon_length(): one length per exon.
exon_len = list()

## The function below, `get_exon_length()`, calculates the length of each exon

In [33]:
def get_exon_length():
    """Append the length of every exon to ``exon_len``.

    Exons are visited gene -> transcript -> exon in ``cleaned_ids`` order using
    the module-level ``read_content``; each length is ``end - start + 1``
    (coordinates are treated as inclusive). Returns nothing.
    """
    for gene_id in cleaned_ids:
        for transcript in read_content[gene_id]['Transcript']:
            for exon in transcript["Exon"]:
                first_base, last_base = exon['start'], exon['end']
                exon_len.append(last_base - first_base + 1)
                
        

In [34]:
# Fill exon_len with one length per exon.
get_exon_length()

In [None]:
# Number of exon lengths collected.
len(exon_len)

In [48]:
# Number of exon IDs collected; one is stored per exon, so this should match len(exon_len).
len(exon_id)

1279073

In [46]:
# Spot-check: gene ID stored at position 904 of the exon-level lists.
gene_ids_for_exons[904]

'ENSG00000182586'

In [45]:
# Spot-check: transcript ID stored at position 904 of the exon-level lists.
transcript_ids_for_exons[904]

'ENST00000584169'

## Exporting gene data to all_lincrna_gene_data.csv file

In [63]:
import csv 
# Destination file and column titles for the gene-level table.
path = '/all_lincRNA_data/all_lincrna_gene_data.csv'
header = [
    'SNO', 'Gene ID', 'Display Name', 'Biotype', 'Start', 'End',
    'Strand', 'Seq region Name', 'No. of Transcripts',
]

# Create (or truncate) the file so that it holds only the header row.
with open(path, 'wt', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(header)


In [64]:
# 1-based serial numbers, one per gene ID in cleaned_ids.
s_no = list(range(1, len(cleaned_ids) + 1))

In [65]:
import pandas as pd

# Re-read the header-only file so the frame carries the CSV's column names,
# then fill the columns by position (same order as the header row).
df = pd.read_csv(path)

df[df.columns[0]] = s_no
df[df.columns[1]] = cleaned_ids
df[df.columns[2]] = gene_display_name
df[df.columns[3]] = gene_biotype
df[df.columns[4]] = gene_start
df[df.columns[5]] = gene_end
df[df.columns[6]] = gene_strand
df[df.columns[7]] = gene_seq_region_name
df[df.columns[8]] = no_of_transcripts

# index=False: 'SNO' already numbers the rows. Without it pandas writes an extra
# unnamed index column, which comes back as 'Unnamed: 0' when the file is re-read.
df.to_csv(path, index=False)

## Exporting transcript data to all_lincrna_transcript_data.csv file

In [66]:
import csv 
# Destination file and column titles for the transcript-level table.
path = '/all_lincRNA_data/all_lincrna_transcript_data.csv'
header = [
    'SNO', 'Gene ID', 'Transcript ID', 'Biotype', 'Strand', 'Seq region Name',
    'Transcript Start', 'Transcript End', 'Transcript Length', 'No. of Exons',
]

# Create (or truncate) the file so that it holds only the header row.
with open(path, 'wt', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(header)


In [67]:
# 1-based serial numbers, one per collected transcript.
s_no = list(range(1, len(transcript_id) + 1))

In [68]:
import pandas as pd

# Re-read the header-only file so the frame carries the CSV's column names,
# then fill the columns by position (same order as the header row).
df = pd.read_csv(path)

df[df.columns[0]] = s_no
df[df.columns[1]] = gene_ids_for_transcripts
df[df.columns[2]] = transcript_id
df[df.columns[3]] = transcript_biotype
df[df.columns[4]] = transcript_strand
df[df.columns[5]] = transcript_seq_region_name
df[df.columns[6]] = transcript_start
df[df.columns[7]] = transcript_end
df[df.columns[8]] = transcript_len
df[df.columns[9]] = no_of_exons

# index=False: 'SNO' already numbers the rows. Without it pandas writes an extra
# unnamed index column, which comes back as 'Unnamed: 0' when the file is re-read.
df.to_csv(path, index=False)

## Exporting exon data to all_lincrna_exon_data.csv file

In [47]:
import csv 
# Destination file and column titles for the exon-level table.
path = '/all_lincRNA_data/all_lincrna_exon_data.csv'
header = [
    'SNO', 'Gene ID', 'Transcript ID', 'Exon ID', 'Strand',
    'Seq region Name', 'Exon Start', 'Exon End', 'Exon Length',
]

# Create (or truncate) the file so that it holds only the header row.
with open(path, 'wt', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(header)


In [48]:
# 1-based serial numbers, one per collected exon.
s_no = list(range(1, len(exon_id) + 1))

In [49]:
import pandas as pd

# Re-read the header-only file so the frame carries the CSV's column names,
# then fill the columns by position (same order as the header row).
df = pd.read_csv(path)

df[df.columns[0]] = s_no
df[df.columns[1]] = gene_ids_for_exons
df[df.columns[2]] = transcript_ids_for_exons
df[df.columns[3]] = exon_id
df[df.columns[4]] = exon_strand
df[df.columns[5]] = exon_seq_region_name
df[df.columns[6]] = exon_start
df[df.columns[7]] = exon_end
df[df.columns[8]] = exon_len

# index=False: 'SNO' already numbers the rows. Without it pandas writes an extra
# unnamed index column, which comes back as 'Unnamed: 0' when the file is re-read.
df.to_csv(path, index=False)