In [1]:
import pandas as pd
import pysam

In [2]:
# Annotation columns used downstream: identifiers plus the three region lengths.
col_list = [
    "Transcript_ID",
    "Gene_ID",
    "Gene_name",
    "5UTR_length",
    "CDS_length",
    "3UTR_length",
]

# Parsed annotation table (tab-separated), restricted to the columns in col_list.
P_gtf = pd.read_csv(
    '/Data_2/Daehwa/Data_Library/GTF_parsed/v0.7.1/gencode.vM27.annotation.gtf/Processed_gtf.tsv',
    sep='\t',
    usecols=col_list,
)

# Representative-isoform table: read as five headerless columns, of which only
# the gene / transcript identifiers are kept (placeholders a, b, c are discarded).
repre = pd.read_csv(
    '/Data_2/Jun/Adipocytes/references/representative-isoforms.txt',
    sep='\t',
    names=['Gene_ID','Transcript_ID','a','b','c'],
)[['Gene_ID','Transcript_ID']]

# Single-column gene-id lists, one per library type.
# NOTE(review): file names suggest these are the top-100 most abundant genes — confirm.
RPF_abundant = pd.read_csv(
    '/Data_2/Daehwa/Adipocyte/Analysis/Gene_lists/Abundant_genes/v20230529/adi_RPF_top100_abundant_genelist.txt',
    names=['gene_id'],
)
RNA_abundant = pd.read_csv(
    '/Data_2/Daehwa/Adipocyte/Analysis/Gene_lists/Abundant_genes/v20230529/adi_RNA_top100_abundant_genelist.txt',
    names=['gene_id'],
)

In [3]:
def build_transcript_db(abundant_genes, annotation, min_cds_length=300):
    """Build a per-transcript lookup of region lengths for a gene list.

    Parameters
    ----------
    abundant_genes : pandas.DataFrame
        Frame with a ``gene_id`` column listing the genes to keep.
    annotation : pandas.DataFrame
        Frame with columns ``transcript_id``, ``5UTR_length``, ``CDS_length``,
        ``3UTR_length``, ``gene_id`` and ``gene_name``.
    min_cds_length : int, default 300
        Transcripts are kept only when ``CDS_length`` is strictly greater
        than this value.

    Returns
    -------
    dict
        ``{transcript_id: [5UTR_length, CDS_length, 3UTR_length, gene_id, gene_name]}``
        in the row order produced by merging ``abundant_genes`` with ``annotation``.
    """
    columns = ['transcript_id',"5UTR_length","CDS_length","3UTR_length","gene_id","gene_name"]
    table = pd.merge(abundant_genes, annotation, on='gene_id')[columns]
    table = table[ table['CDS_length'] > min_cds_length ]
    # Transpose so each transcript becomes a column, then dump every column as a list.
    return table.set_index('transcript_id').T.to_dict('list')


# Annotation restricted to the representative isoforms; computed once and
# shared by both libraries instead of being rebuilt for each of them.
representative_annotation = pd.merge(repre, P_gtf, on=['Gene_ID','Transcript_ID']).rename(
    columns = {'Gene_ID':'gene_id',
               'Gene_name':'gene_name',
               'Transcript_ID':'transcript_id'})

# One lookup per library type, keyed the same way as the loops further down.
DBs = {
    'RPF': build_transcript_db(RPF_abundant, representative_annotation),
    'RNA': build_transcript_db(RNA_abundant, representative_annotation),
}

# Keep `DB` bound to the RNA lookup, as it was after the original cell ran.
DB = DBs['RNA']

# Preview the first 20 entries of the RNA lookup.
display(dict(list(DB.items())[:20]))

{'ENSMUST00000082402.1': [0, 1545, 0, 'ENSMUSG00000064351.1', 'mt-Co1'],
 'ENSMUST00000041331.4': [300, 1068, 3464, 'ENSMUSG00000037071.4', 'Scd1'],
 'ENSMUST00000029041.6': [65, 399, 432, 'ENSMUSG00000062515.4', 'Fabp4'],
 'ENSMUST00000042235.15': [120, 1389, 285, 'ENSMUSG00000037742.15', 'Eef1a1'],
 'ENSMUST00000001547.8': [145, 4362, 1423, 'ENSMUSG00000001506.11', 'Col1a1'],
 'ENSMUST00000214685.2': [196, 909, 164, 'ENSMUSG00000018593.14', 'Sparc'],
 'ENSMUST00000033741.15': [172, 1110, 1137, 'ENSMUSG00000031375.18', 'Bgn'],
 'ENSMUST00000031668.10': [347, 4119, 882, 'ENSMUSG00000029661.17', 'Col1a2'],
 'ENSMUST00000087883.13': [210, 4395, 959, 'ENSMUSG00000026043.19', 'Col3a1'],
 'ENSMUST00000032934.12': [231, 1095, 185, 'ENSMUSG00000030695.17', 'Aldoa'],
 'ENSMUST00000047864.11': [96, 2577, 416, 'ENSMUSG00000034994.11', 'Eef2'],
 'ENSMUST00000082392.1': [0, 957, 0, 'ENSMUSG00000064341.1', 'mt-Nd1'],
 'ENSMUST00000082421.1': [0, 1144, 0, 'ENSMUSG00000064370.1', 'mt-Cytb'],
 'ENSMUS

In [4]:
# --- Run configuration -------------------------------------------------------
prefix = 'adi_'

# Root directory holding the alignment files, one sub-folder per library.
folder = '/Data_2/Jun/Adipocytes/tr-aln'

# Library label -> sub-folder name under `folder`.
dataset = {
    'RNA': 'RNA',
    'RPF': 'novaseq',
}

# A sample name is a day label followed by a replicate letter (e.g. D0a).
days = [
    'D0',
    'D4',
    'D8',
]
reps = [
    'a',
    'b',
    'c',
]

# Suffix appended to the sample name to form the alignment file name.
filetype = ".rep.bam"

In [5]:
# Inclusive window bounds, in nucleotides relative to the anchor position.
START_WINDOW = (-100, 200)
STOP_WINDOW = (-200, 100)


def count_metagene(bam_path, transcripts, start_window=START_WINDOW, stop_window=STOP_WINDOW):
    """Count alignment start positions around two anchors of each transcript.

    Parameters
    ----------
    bam_path : str
        Path of a BAM file whose reference names are transcript ids.
    transcripts : dict
        ``{transcript_id: [5UTR_length, CDS_length, ...]}``; only the first two
        entries of each value are read. Alignments to any other reference
        (or with no reference) are skipped.
    start_window, stop_window : tuple of int
        Inclusive ``(low, high)`` offsets counted around each anchor.

    Returns
    -------
    (dict, dict)
        Two dicts, one per anchor. Each has a ``'transcript_id'`` entry holding
        the window offsets as strings, plus one list of counts per transcript,
        aligned with those offsets.

    Notes
    -----
    Offset 0 of the first anchor is the first nucleotide after the 5'UTR.
    Offset 0 of the second anchor is 1-based position ``5UTR + CDS - 2``,
    i.e. the third-from-last CDS nucleotide.
    NOTE(review): that is the first base of the stop codon only if CDS_length
    includes the stop codon — confirm against the annotation parser.
    """
    start_lo, start_hi = start_window
    stop_lo, stop_hi = stop_window

    start_counts = {'transcript_id': [str(i) for i in range(start_lo, start_hi + 1)]}
    stop_counts = {'transcript_id': [str(i) for i in range(stop_lo, stop_hi + 1)]}
    for T_id in transcripts:
        start_counts[T_id] = [0] * (start_hi - start_lo + 1)
        stop_counts[T_id] = [0] * (stop_hi - stop_lo + 1)

    # The context manager closes the BAM handle even if iteration fails.
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        for read in bam:
            # Read the fields directly instead of formatting the record as a
            # SAM line and splitting it; reference_name is None when unset.
            T_id = read.reference_name
            P = transcripts.get(T_id)
            if P is None:
                continue

            # reference_start is 0-based; +1 gives the 1-based SAM POS value.
            pos = read.reference_start + 1
            start_pos = int( pos-(P[0]+1) )
            stop_pos = int( pos-(P[0]+P[1]-2) )
            if start_lo <= start_pos <= start_hi:
                start_counts[T_id][start_pos - start_lo] += 1
            if stop_lo <= stop_pos <= stop_hi:
                stop_counts[T_id][stop_pos - stop_lo] += 1

    return start_counts, stop_counts


# One workbook with two sheets (start / stop) per library and sample.
# The `with` block saves and closes the workbook even when a sample fails.
with pd.ExcelWriter('adi_abundant_metagene2.xlsx') as out_w:
    for LIB in ['RNA','RPF']:
        for day in days:
            for rep in reps:
                SP = day+rep
                result_start, result_stop = count_metagene(
                    f'{folder}/{dataset[LIB]}/{SP}{filetype}', DBs[LIB])

                print (f'{LIB} {SP} done')

                # Transpose so each transcript is a row; the offsets row comes first.
                result_start = pd.DataFrame(result_start).T
                result_start.to_excel(out_w, sheet_name=f'{LIB}_{SP}_start', header=False)
                result_stop = pd.DataFrame(result_stop).T
                result_stop.to_excel(out_w, sheet_name=f'{LIB}_{SP}_stop', header=False)

RNA D0a done
RNA D0b done
RNA D0c done
RNA D4a done
RNA D4b done
RNA D4c done
RNA D8a done
RNA D8b done
RNA D8c done
RPF D0a done
RPF D0b done
RPF D0c done
RPF D4a done
RPF D4b done
RPF D4c done
RPF D8a done
RPF D8b done
RPF D8c done
