In [None]:
####################################################################################################

#    Copyright 2019 Srijan Verma and EMBL-European Bioinformatics Institute

#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at

#        http://www.apache.org/licenses/LICENSE-2.0

#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

####################################################################################################

# Approach :
1. Keep only the rows of 'all_transcript_data.csv' whose gbkey is ncRNA. Save the RefSeq Gene ID and RefSeq Transcript ID columns of those rows.
2. Keep only the rows of 'all_gene_data.csv' whose gene ID appears among the RefSeq Gene IDs collected in step 1.
3. Keep only the rows of 'all_exon_data.csv' whose gbkey is ncRNA.

In [1]:
import pandas as pd

# Load the full transcript table; the first CSV column becomes the row index.
transcript_csv_path = '/all_gene_data/all_transcript_data.csv'
df = pd.read_csv(transcript_csv_path, index_col=0)

In [41]:
# Preview the top of the table (head() shows five rows by default).
df.head()

In [3]:
# Discard the old serial-number column; it is rebuilt after filtering.
df = df.drop(columns='SNO')

In [42]:
# Number of rows currently in the table.
len(df)

In [5]:
# df[df.columns[5]]
from tqdm import tqdm

## The code below keeps only the transcript rows whose gbkey is ncRNA

In [43]:
%%time
refseq_gene_ids = []
ncrna_transcript_index_to_drop = []
for i in tqdm(range(len(df[df.columns[1]]))):
    if df[df.columns[5]][i] != 'ncRNA':
        ncrna_transcript_index_to_drop.append(i)
    elif df[df.columns[5]][i] == 'ncRNA':
        refseq_gene_ids.append(df[df.columns[1]][i])
        

In [7]:
# Remove every row whose label was collected as "not ncRNA".
df = df.drop(index=ncrna_transcript_index_to_drop)

In [8]:
# Fresh 1-based serial numbers, one per remaining row.
s_no = list(range(1, len(df) + 1))

In [9]:
# Attach the serial numbers as the 'SNO' column.
df = df.assign(SNO=s_no)

In [10]:
# Put 'SNO' first, followed by the transcript fields in their output order.
transcript_column_order = [
    'SNO',
    'Index',
    'RefSeq Gene ID',
    'RefSeq Transcript ID',
    'Dbxref',
    'Parent',
    'gbkey',
    'Type',
    'Seq_id',
    'Strand',
    'Start',
    'End',
    'No. of Exons',
    'Transcript Length',
]
df = df[transcript_column_order]

In [12]:
# Renumber the row index from 0 now that rows have been removed.
df = df.reset_index(drop=True)

In [44]:
# Preview the filtered transcript table (head() shows five rows by default).
df.head()

In [14]:
# Write the ncRNA-only transcript table (row index included as the first column).
ncrna_transcript_csv_path = '/all_gbkey=ncRNA_data/all_ncrna_transcript_data.csv'
df.to_csv(ncrna_transcript_csv_path)

# Gene data: keep only the genes referenced by ncRNA transcripts

In [16]:
# Load the full gene table; the first CSV column becomes the row index.
gene_csv_path = '/refseq_numeric/all_gene_data/all_gene_data.csv'
df = pd.read_csv(gene_csv_path, index_col=0)

In [17]:
# Discard the old serial-number column; it is rebuilt after filtering.
df = df.drop(columns='SNO')

In [45]:
# df.head(5)

In [46]:
%%time
s = set(refseq_gene_ids)
drop_values = []
for i in tqdm(range(len(df[df.columns[1]]))):
    if df[df.columns[1]][i] in s:
        continue
    else:
        drop_values.append(i)

In [21]:
# Remove every gene row that has no ncRNA transcript.
df = df.drop(index=drop_values)

In [22]:
# Fresh 1-based serial numbers, one per remaining row, stored as 'SNO'.
s_no = list(range(1, len(df) + 1))

df = df.assign(SNO=s_no)

In [47]:
# df.head(2)

In [24]:
# Put 'SNO' first, followed by the gene fields in their output order.
gene_column_order = [
    'SNO',
    'Index',
    'RefSeq Gene ID',
    'Dbxref',
    'Name',
    'gbkey',
    'Biotype',
    'Type',
    'Seq_id',
    'Strand',
    'Start',
    'End',
    'No. of Transcripts',
]
df = df[gene_column_order]

In [25]:
# Renumber the row index from 0 now that rows have been removed.
df = df.reset_index(drop=True)

In [48]:
# df.head(5)

In [49]:
# len(df[df.columns[1]])

In [28]:
# Write the ncRNA-only gene table (row index included as the first column).
ncrna_gene_csv_path = '/all_gbkey=ncRNA_data/all_ncrna_gene_data.csv'
df.to_csv(ncrna_gene_csv_path)

## Exon data: keep only the exon rows whose gbkey is ncRNA

In [50]:
import pandas as pd
# Load the full exon table; the first CSV column becomes the row index.
exon_csv_path = '/all_gene_data/all_exon_data.csv'
df = pd.read_csv(exon_csv_path, index_col=0)

In [30]:
# Discard the old serial-number column; it is rebuilt after filtering.
df = df.drop(columns='SNO')

In [51]:
# df.head(5)

In [52]:
# Collect the row labels of every exon that is NOT ncRNA, using one vectorised
# comparison instead of a per-row Python loop.
#
# Position 6 is the column compared against 'ncRNA'
# (presumably 'gbkey' -- TODO confirm against the CSV layout).
# Missing values count as "not ncRNA", as they did in the loop.
is_not_ncrna = df[df.columns[6]] != 'ncRNA'

# The previous loop recorded positions 0..n-1 and used them as labels, which is only
# valid when the index read from the CSV is exactly 0..n-1; taking the labels from
# df.index yields the same list in that case and stays correct otherwise.
ncrna_exon_index_to_drop = df.index[is_not_ncrna].tolist()

In [33]:
# Remove every row whose label was collected as "not ncRNA".
df = df.drop(index=ncrna_exon_index_to_drop)

In [53]:
# len(df[df.columns[1]])

In [35]:
# Fresh 1-based serial numbers, one per remaining row.
s_no = list(range(1, len(df) + 1))

In [36]:
# Attach the serial numbers as the 'SNO' column.
df = df.assign(SNO=s_no)

In [37]:
# Put 'SNO' first, followed by the exon fields in their output order.
exon_column_order = [
    'SNO',
    'Index',
    'RefSeq Gene ID',
    'RefSeq Transcript ID',
    'RefSeq Exon ID',
    'Dbxref',
    'Parent',
    'gbkey',
    'Type',
    'Seq_id',
    'Strand',
    'Start',
    'End',
    'Exon Length',
]
df = df[exon_column_order]

In [38]:
# Renumber the row index from 0 now that rows have been removed.
df = df.reset_index(drop=True)

In [54]:
# Preview the filtered exon table (head() shows five rows by default).
df.head()

In [40]:
# Write the ncRNA-only exon table (row index included as the first column).
ncrna_exon_csv_path = '/all_gbkey=ncRNA_data/all_ncrna_exon_data.csv'
df.to_csv(ncrna_exon_csv_path)

# After loading the CSV files, remember to reset the index with `df.reset_index(drop=True, inplace=True)`. Rows were dropped during filtering, so the default index values may not be sequential.