In [4]:
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
# Standard genetic code, DNA alphabet (T instead of U); '*' marks a stop codon.
codons_table = {
        'TTT': 'F', 'CTT': 'L', 'ATT': 'I', 'GTT': 'V',
        'TTC': 'F', 'CTC': 'L', 'ATC': 'I', 'GTC': 'V',
        'TTA': 'L', 'CTA': 'L', 'ATA': 'I', 'GTA': 'V',
        'TTG': 'L', 'CTG': 'L', 'ATG': 'M', 'GTG': 'V',
        'TCT': 'S', 'CCT': 'P', 'ACT': 'T', 'GCT': 'A',
        'TCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
        'TCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
        'TCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
        'TAT': 'Y', 'CAT': 'H', 'AAT': 'N', 'GAT': 'D',
        'TAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
        'TAA': '*', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
        'TAG': '*', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
        'TGT': 'C', 'CGT': 'R', 'AGT': 'S', 'GGT': 'G',
        'TGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
        'TGA': '*', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
        'TGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
    }

# Codons that terminate an ORF.
stops = set(["TAA","TAG","TGA"])
# Codons accepted as ORF starts (ATG plus the near-cognate TTG/GTG/CTG).
starts = set(["TTG", "GTG","CTG", "ATG",])
def translate(seq, min_len=10):
    """Yield the coordinates of open reading frames found in frame 0 of ``seq``.

    The sequence is read codon by codon from index 0. An ORF opens at the first
    codon that is in ``starts`` while no ORF is open, and closes at the next
    codon that is in ``stops``. After a stop, scanning for a new start resumes
    with the following codon, so reported ORFs never overlap.

    Args:
        seq: Nucleotide string in the same alphabet as ``codons_table``
            (upper-case A/C/G/T).
        min_len: An ORF is reported only if it encodes MORE than ``min_len``
            amino acids (start codon included, stop codon excluded).
            Defaults to 10, the threshold previously hard-coded here.

    Yields:
        ``(start, end)`` tuples such that ``seq[start:end]`` is the ORF
        including its stop codon (``end`` is exclusive).

    Notes:
        * A trailing partial codon is ignored.
        * An ORF that reaches the end of ``seq`` without a stop codon is not
          reported.
        * A codon that is not in ``codons_table`` (e.g. one containing ``N`` or
          a lower-case base) cannot be translated; the ORF it interrupts is
          discarded and scanning continues. Previously this raised KeyError.
    """
    start = None  # index of the start codon of the ORF currently open, if any
    # Stopping at len(seq) - 2 skips a trailing partial codon.
    for i in range(0, len(seq) - 2, 3):
        codon = seq[i:i+3]
        if start is None:
            if codon in starts:
                start = i
            continue
        if codon in stops:
            # Number of translated codons between the start (inclusive) and
            # this stop (exclusive); equals the length of the peptide.
            n_aa = (i - start) // 3
            if n_aa > min_len:
                yield start, i+3
            start = None
        elif codon not in codons_table:
            # Untranslatable codon: abandon the current ORF instead of crashing.
            start = None

## Ensembl ncRNA

In [2]:
path_ensembl = "/home/chensa/gb_data/data_for_analysis/ncRNA"
# SeqIO.parse returns a one-shot iterator. The transcript map and the lncRNA
# subset are therefore produced in a single pass: a second loop over the same
# `records` object would see no records and leave the lncRNA FASTA empty.
records = SeqIO.parse(f"{path_ensembl}/Homo_sapiens.GRCh38.ncrna.fa", "fasta")
# Transcript id (version suffix stripped) -> nucleotide sequence.
enst2seq = {}
with open(f"{path_ensembl}/Homo_sapiens.GRCh38.lncrna.fa", 'w') as w:
    for rec in records:
        enst = str(rec.id).split('.')[0]
        enst2seq[enst] = str(rec.seq)
        # Keep the full description as the header so the biotype annotation
        # survives in the lncRNA-only file.
        if "gene_biotype:lncRNA" in str(rec.description):
            w.write(">"+str(rec.description)+"\n")
            w.write(str(rec.seq)+"\n")
# all_gene_types = []
# for rec in records:
#     gene_type = str(rec.description).split("gene_biotype:")[1].split()[0]
#     all_gene_types.append(gene_type)


In [5]:
id2all = {}
records = SeqIO.parse(f"{path_ensembl}/Homo_sapiens.GRCh38.lncrna.fa", "fasta")
# Transcript id (version suffix stripped) -> nucleotide sequence.
enst2seq = {str(rec.id).split('.')[0]: str(rec.seq) for rec in records}

for enst, rnaseq1 in enst2seq.items():
    # Transcripts containing an 'N' are skipped entirely.
    if 'N' in rnaseq1:
        continue
    rnaseq1_r = str(Seq(rnaseq1).reverse_complement())
    id2all[enst] = {"rna": rnaseq1, "orf": []}
    id2all[enst+'_r'] = {"rna": rnaseq1_r, "orf": []}
    # Forward strand first, then the reverse complement (stored under
    # "<id>_r"); coordinates are relative to the sequence kept in "rna".
    for key in (enst, enst+'_r'):
        entry = id2all[key]
        # Scan the three reading frames; translate() reports coordinates
        # relative to the shifted string, so the frame offset is added back.
        for offset in range(3):
            for start, end in translate(entry["rna"][offset:]):
                entry["orf"].append((start+offset, end+offset))

In [6]:
import pickle
# A context manager guarantees the pickle file is flushed and closed; the bare
# open() call used before left the handle open.
with open(f"{path_ensembl}/id2all.pkl", 'wb') as fh:
    pickle.dump(id2all, fh)

## Smprot

In [3]:
path_smprot = "/home/chensa/gb_data/data_for_analysis/smprot/"
df = pd.read_csv(f"{path_smprot}SmProt2_highConfidenceSet.txt", sep='\t')
# Restrict the table to human entries.
df = df.loc[df['Species'] == 'human']
# `s == s` is False only for NaN, so missing sequences are dropped before
# upper-casing.
RNAseqs = [s.upper() for s in df['RNAseq'].tolist() if s == s]
# Unique upper-cased peptide sequences, kept as a set.
AAseqs = {s.upper() for s in df['AAseq'].tolist() if s == s}

In [None]:
! grep lncRNA SmProt2_human_RiboResults.txt > lncRNA.SmProt2_human_RiboResults.txt

In [4]:
records = SeqIO.parse(f"{path_smprot}/SmProt2_human_Ribo.fa", "fasta")
# Ids of the Ribo peptides whose sequence is in AAseqs.
highcd = set()
with open(f"{path_smprot}/SmProt2_human_Ribo_highConfidence.fa", 'w') as w:
    for rec in records:
        pep_id, pep_seq = str(rec.id), str(rec.seq)
        if pep_seq not in AAseqs:
            continue
        # Write the record as a two-line FASTA entry and remember its id.
        w.write(f">{pep_id}\n{pep_seq}\n")
        highcd.add(pep_id)

In [5]:
df = pd.read_csv(f"{path_smprot}/lncRNA.SmProt2_human_RiboResults.txt", sep='\t')
# De-duplicated peptide ids, each prefixed with 'SPROHSA'.
orf_lncs = list({'SPROHSA'+str(pep) for pep in df['smPEP_ID'].tolist()})

In [6]:
# Peptide ids from orf_lncs that are also in the high-confidence id set.
orf_lnc_with_highcd = {pep_id for pep_id in orf_lncs if pep_id in highcd}
print(len(orf_lnc_with_highcd))

# Peptide id -> amino-acid sequence, restricted to the ids selected above.
records = SeqIO.parse(f"{path_smprot}/SmProt2_human_Ribo_highConfidence.fa", "fasta")
id2aaseq = {
    str(rec.id): str(rec.seq)
    for rec in records
    if str(rec.id) in orf_lnc_with_highcd
}

45


In [7]:
df = pd.read_csv(f"{path_smprot}/lncRNA.SmProt2_human_RiboResults.txt", sep='\t')
# Peptide id -> list of (GeneID, TranscriptID) pairs, one per table row, for
# the peptides in orf_lnc_with_highcd only.
id2info = {}
for pepid, geneid, tid in zip(df['smPEP_ID'], df['GeneID'], df['TranscriptID']):
    pep_key = 'SPROHSA'+str(pepid)
    if pep_key not in orf_lnc_with_highcd:
        continue
    id2info.setdefault(pep_key, []).append((geneid, tid))


In [8]:
# Write the selected peptides as two-line FASTA records.
with open(f"{path_smprot}/lncrna_aa_with_highcd.fa", 'w') as w:
    w.writelines(f">{pep_id}\n{aa}\n" for pep_id, aa in id2aaseq.items())

In [9]:
# NOTE(review): the ".dedup.fa" file is not produced anywhere in the visible
# cells; presumably it comes from an external de-duplication step - confirm.
records = SeqIO.parse(f"{path_smprot}/lncrna_aa_with_highcd.dedup.fa", "fasta")
keep_ids = []
for rec in records:
    keep_ids.append(str(rec.id))


In [13]:
# Locate each high-confidence peptide on its transcript.
# Result: peptide id -> {'rna': transcript sequence,
#                        'aa': nucleotide sequence of the ORF (despite the key name),
#                        'start': 0-based ORF start on the transcript}.
id2all = {}
keep_ids = id2info.keys()
for i in keep_ids:
    # Only the first (GeneID, TranscriptID) pair recorded for the peptide is
    # used; the transcript version suffix is stripped before the lookup.
    enst = id2info[i][0][1].split('.')[0]
    rnaseq = enst2seq[enst]
    seq = id2aaseq[i]
    # Try the three forward reading frames in order; the first frame whose
    # translation contains the peptide wins.
    for frame in range(3):
        frame_seq = rnaseq[frame:]
        # Drop a trailing partial codon so Biopython is handed whole codons only.
        frame_seq = frame_seq[:len(frame_seq) - len(frame_seq) % 3]
        idx = str(Seq(frame_seq).translate()).find(seq)
        # find() returns -1 when absent; 0 is a valid hit (peptide at the very
        # start of the frame) that the previous `idx > 0` test rejected.
        if idx >= 0:
            start = idx*3 + frame
            orf = rnaseq[start:start+len(seq)*3]
            id2all[i] = {'rna':rnaseq, 'aa':orf, 'start':start}
            break
    else:
        # Peptide not found in any forward frame of the transcript.
        print(i,"error")




In [15]:
import pickle
# A context manager guarantees the pickle file is flushed and closed; the bare
# open() call used before left the handle open.
with open(f"{path_smprot}/smprot_id2all.pkl", 'wb') as fh:
    pickle.dump(id2all, fh)

## SPENCER

In [121]:
path_spencer = "/home/chensa/gb_data/data_for_analysis/spencer/"
df = pd.read_csv(f"{path_spencer}SPENCER_Peptide_info.txt", sep='\t')

# Transcript id (as written in the table) -> list of peptide sequences; a row
# may list several comma-separated transcript ids.
tid2aaseqs = {}
for aaseq, tids in zip(df['sequence'], df['associated_transcript_id']):
    for tid in tids.split(','):
        tid2aaseqs.setdefault(tid, []).append(aaseq)

# Transcript id (text after the first '.' removed) -> nucleotide sequence.
records = SeqIO.parse(f"{path_spencer}SPENCER_Transcript_sequence.fasta", "fasta")
tid2rnaseq = {str(rec.id).split('.')[0]: str(rec.seq) for rec in records}

In [122]:
# Preview: print what translate() yields for the first transcript only.
# NOTE(review): the output file is opened for writing (and so truncated), but
# nothing is written to it in this cell.
with open(f"{path_spencer}SPENCER_Transcript_sequence_prediced_orfs.fasta",'w') as w:
    if tid2rnaseq:
        tid, rnaseq = next(iter(tid2rnaseq.items()))
        for i in translate(rnaseq):
            print(i)

LLSSRSGPLPAAPAPCSSILFSSPLWARMWSWN*
LLHSGLSGPIRAAASR*
VMAPPPLPLPLSLVTHNPKSQPRTPNPQYLKGFCFFTNDGFVGFF*


In [123]:
# Transcript id -> list of associated ORF ids (comma-separated in the table).
tid2orfid = {}
df = pd.read_csv(f"{path_spencer}SPENCER_Transcript_info.txt", sep='\t')
for tid, orfids in zip(df['transcript_id'],df['associated_orf_id']):
    tid2orfid.setdefault(tid, []).extend(orfids.split(','))

In [124]:
# ORF id -> sequence, read from the SPENCER ORF FASTA.
records = SeqIO.parse(f"{path_spencer}SPENCER_ORF_sequence.fasta", "fasta")
orfid2orfseq = {str(rec.id): str(rec.seq) for rec in records}

In [125]:
# Number of keys (transcript ids) in tid2orfid.
len(tid2orfid)

6842

In [126]:
# Transcript id -> {'rna': transcript sequence,
#                   'aa': list of ORF sequences found in it,
#                   'start': list of their 0-based start offsets}.
tid2all = {}
for tid, orfids in tid2orfid.items():
    rnaseq = tid2rnaseq[tid]
    n_found = 0
    for orfid in orfids:
        # ORF ids without a sequence in the ORF FASTA are skipped.
        if orfid not in orfid2orfseq:
            continue
        orfseq = orfid2orfseq[orfid]
        idx = rnaseq.find(orfseq)
        if idx == -1:
            continue
        entry = tid2all.setdefault(tid, {'rna':rnaseq, 'aa':[], 'start':[]})
        entry['aa'].append(orfseq)
        entry['start'].append(idx)
        n_found += 1
    # Report transcripts for which none of the listed ORFs could be located.
    if n_found == 0:
        print(tid)

In [127]:
import pickle
# A context manager guarantees the pickle file is flushed and closed; the bare
# open() call used before left the handle open.
with open(f"{path_spencer}/SPENCER_id2all.pkl", 'wb') as fh:
    pickle.dump(tid2all, fh)

## AHIGH

meta.csv comes from 1-s2.0-S1097276522006062-mmc3.xls

In [None]:
df = pd.read_csv("meta.csv")
# Each iORF_id reduced to the text before its first '_' and then before the
# first '.'.
df_iORF_id = [_.split('_')[0].split('.')[0] for _ in df['iORF_id'].tolist()]
# Set copy for O(1) membership tests; scanning the list once per BED row made
# the filter below cost rows x ids comparisons.
iorf_id_set = set(df_iORF_id)

df = pd.read_csv("FANTOM_CAT.lv1_raw.bed", sep='\t', header=None)
# Keep only the BED rows whose name column (index 3), reduced to the second
# '|' field with any '.suffix' removed, is one of the ids collected above.
with open("fantom_associated_gene.bed", 'w') as w:
    # itertuples avoids building a Series per row, which df.iloc[i, :] did.
    for row in df.itertuples(index=False):
        if row[3].split('|')[1].split('.')[0] in iorf_id_set:
            w.write('\t'.join(str(field) for field in row)+'\n')


In [None]:
# Extract strand-aware (-s) sequences for the selected BED records, using the BED name column in the FASTA headers (-name).
# The BED path is passed once, via -bed; the extra positional copy that preceded -bed was not a valid bedtools argument.
!bedtools getfasta -fi hg38.fa -bed /home/chensa/data_for_analysis/ahigh/fantom_associated_gene.bed -s -name -fo /home/chensa/data_for_analysis/ahigh/fantom_associated_gene.fa

In [95]:
path_ahigh = "/home/chensa/gb_data/data_for_analysis/ahigh/"
# Id -> upper-cased sequence, filled from two FASTA files in turn; an id
# present in both ends up with the sequence from the second file.
id2seq = {}
# First file: the key is the second '|' field of the record id, cut at the first '.'.
records = SeqIO.parse(f"{path_ahigh}fantom_associated_gene.fa", "fasta")
id2seq.update(
    (str(rec.id).split('|')[1].split('.')[0], str(rec.seq).upper())
    for rec in records
)
# Second file: the key is the record id cut at the first '.'.
records = SeqIO.parse(f"{path_ahigh}gencode.v46.transcripts.oneline.fa", "fasta")
id2seq.update(
    (str(rec.id).split('.')[0], str(rec.seq).upper())
    for rec in records
)

In [64]:
df = pd.read_csv(f"{path_ahigh}meta.csv")
# iORF ids of the rows whose 'ORF type' is exactly 'ncORF', in table order.
nc_list = [
    iORF_id
    for iORF_id, orf_type in zip(df['iORF_id'], df['ORF type'])
    if orf_type == 'ncORF'
]

In [144]:
df = pd.read_csv(f"{path_ahigh}sequence.csv")
# Id -> {'rna': reference sequence, 'aa': ORF nucleotide sequences found in it,
#        'start': their 0-based offsets}.
id2all = {}
# not_in_file: ids with no entry in id2seq.
# notin_dict: id -> ORF sequence still to be placed (one slot per id).
not_in_file = []; notin_dict = {}
allids = []   # every id considered, in table order
top3 = []     # first three nucleotides of each ORF that had a reference sequence
# Set copy of nc_list so the per-row membership test is O(1).
nc_ids = set(nc_list)
for orfid, seq, strand in zip(df['iORF_id.Genomic_coordinates'], df['Sequence'], df['Strand']):
    # The column holds "<iORF_id>;<...>"; only the id part is needed.
    orfid = orfid.split(';')[0]
    if orfid not in nc_ids:
        continue
    tx_id = orfid.split('_')[0]
    # Ids containing 'T0' are cut at the first '.'.
    if 'T0' in tx_id: tx_id = tx_id.split('.')[0]
    allids.append(tx_id)
    if tx_id not in id2seq:
        not_in_file.append(tx_id)
        if tx_id in notin_dict:print("rep "+tx_id)
        notin_dict[tx_id] = seq
        continue
    rnaseq = id2seq[tx_id]
    # Minus-strand ORFs are reverse-complemented before the search.
    if strand == '-':
        seq = str(Seq(seq).reverse_complement())
    top3.append(seq[:3])
    # Single search: find() returns -1 when the ORF is absent.
    idx = rnaseq.find(seq)
    if idx == -1:
        # notin_dict keeps one sequence per id; report the overwrite here too
        # (it used to be silent in this branch) so no ORF is lost unnoticed.
        if tx_id in notin_dict:print("rep "+tx_id)
        notin_dict[tx_id] = seq
    else:
        if tx_id not in id2all:
            id2all[tx_id] = {'rna':rnaseq, 'aa':[], 'start':[]}
        id2all[tx_id]['aa'].append(seq)
        id2all[tx_id]['start'].append(idx)


In [89]:
# Rewrite the GENCODE transcript FASTA with each sequence on a single line, so
# that a record is exactly one header line followed by one sequence line.
records = SeqIO.parse(f"{path_ahigh}gencode.v46.transcripts.fa", "fasta")
with open(f"{path_ahigh}gencode.v46.transcripts.oneline.fa", 'w') as w:
    w.writelines(f">{rec.id}\n{rec.seq}\n" for rec in records)

In [147]:
import subprocess
# Second pass: ORFs not found on their own transcript are searched for in every
# GENCODE transcript (one-line FASTA, so the header is the line just before
# the matching sequence line).
notin_dict2 = {}
fasta_oneline = f"{path_ahigh}gencode.v46.transcripts.oneline.fa"
for id,seq in notin_dict.items():
    # Argument list instead of shell=True: the sequence comes from a data file
    # and must not be interpreted by a shell. "-m 1" stops at the first hit
    # (only that one was used before) and "-F" matches the text literally.
    result = subprocess.run(
        ["grep", "-n", "-m", "1", "-F", "--", seq, fasta_oneline],
        stdout=subprocess.PIPE)
    if result.stdout:
        # grep -n prints "<line>:<text>"; the header is on the previous line.
        line = str(int(result.stdout.decode().split('\n')[0].split(':')[0])-1)
        result = subprocess.run(
            ["sed", "-n", f"{line}p", fasta_oneline],
            stdout=subprocess.PIPE)
        info = result.stdout.decode()
        # Header is ">ENST...|..."; keep the transcript id without '>' and
        # without the text after the first '.'.
        enstid = info[1:].split('|')[0].split('.')[0]
        rnaseq = id2seq[enstid]
        # NOTE(review): if `id` already has an entry in id2all, its 'rna' is
        # the sequence stored earlier, while `idx` below refers to the
        # transcript matched here - confirm the two cannot differ.
        if id not in id2all:
            id2all[id] = {'rna':rnaseq, 'aa':[], 'start':[]}
        idx = rnaseq.find(seq)
        id2all[id]['aa'].append(seq)
        id2all[id]['start'].append(idx)
    else:
        notin_dict2[id] = seq

In [150]:
# Third pass: ORFs still unplaced are searched for on the opposite strand, by
# looking for their reverse complement in the one-line GENCODE FASTA.
notin_dict3 = {}
fasta_oneline = f"{path_ahigh}gencode.v46.transcripts.oneline.fa"
for id, seq in notin_dict2.items():
    r_seq = str(Seq(seq).reverse_complement())
    # Argument list instead of shell=True: the sequence comes from a data file
    # and must not be interpreted by a shell. "-m 1" stops at the first hit
    # (only that one was used before) and "-F" matches the text literally.
    result = subprocess.run(
        ["grep", "-n", "-m", "1", "-F", "--", r_seq, fasta_oneline],
        stdout=subprocess.PIPE)
    if result.stdout:
        # grep -n prints "<line>:<text>"; the header is on the previous line.
        line = str(int(result.stdout.decode().split('\n')[0].split(':')[0])-1)
        result = subprocess.run(
            ["sed", "-n", f"{line}p", fasta_oneline],
            stdout=subprocess.PIPE)
        info = result.stdout.decode()
        # Header is ">ENST...|..."; keep the transcript id without '>' and
        # without the text after the first '.'.
        enstid = info[1:].split('|')[0].split('.')[0]
        rnaseq = id2seq[enstid]
        # The ORF lies on the reverse complement of the matched transcript, so
        # that strand is what gets stored and indexed.
        rnaseq_r = str(Seq(rnaseq).reverse_complement())
        # NOTE(review): if `id` already has an entry in id2all, its 'rna' is
        # the sequence stored earlier, while `idx` below refers to rnaseq_r -
        # confirm the two cannot differ.
        if id not in id2all:
            id2all[id] = {'rna':rnaseq_r, 'aa':[], 'start':[]}
        idx = rnaseq_r.find(seq)
        id2all[id]['aa'].append(seq)
        id2all[id]['start'].append(idx)
    else:
        notin_dict3[id] = seq

In [168]:
seq = "AGGCAGAACAGGTGGAGAGGTATGAAGAACGAGGAACACTGCCCGGGGAGTTTCTTCCTCTGCAAAATAAGGGAGTGTGTACTGAATTACCGTTTCCAGCTCCAACATCCAGGATTTCAGCACTACCTGCAGAGTAGTGGGAGGAGAGATAGAGGAAGAAGTGAAGATAAGAAACCACTCGAAGCCGGTGTCTGGTGCTGGGACCGTGGTGGCTGGGACGGCAGCAGTCGTGCTGTCCATCTGCTCTTCCGAGGGGTGGCACATCCTAGCCTCTATCTTTTCCCCAGAGAAGATCCCCCTCGTCTGCTCTTCCCCCGTCTCAGCCTTTTAGTGTGTGAACAGTTCTGGTGTTACTCAGCAACGCTTTTGTTAGCGCCTTTACCAGCAAGTACTTGTTAA"
seq_r = str(Seq(seq).reverse_complement())
notin_dict3
# The first four entries and ENST00000577841 did not align to any transcript; ENST00000554945 has an extra leading "ATGCTGC", and ENST00000608817 has an extra leading "AG".

{'ENST00000417576': 'CTGGCCGTTACTAACTTTCTGTTGTCAAGACCCAGGGGTACGTGCGAAGGATAG',
 'ENST00000488429': 'ATAGACAAAGAAAAAGGAAGAGAGGAGACCAAAGGAAGGAAAATGACACAACAGAGCTTCGGCTATGGGACTGGTTTAATCCAAACCCAGAGGTTGTGA',
 'ENST00000524793': 'TTGGAGTTCTTATAA',
 'ENST00000551203': 'CTGGACCCAGAACTCTGGGATCAGTTTGACAACTTAGAAAATCGATAG',
 'ENST00000554945': 'ATGCTGCGGCCGCTTCGCACTGTGGCTCCTGCCGACCGCCAGGCGCTGTTACAGTGGATGCGAACTAACCGGTAA',
 'ENST00000577841': 'ATTGTATCCCCCTCCAATTCAGTTATGGTGCTGCTTCAGAAAGAAAAGAGCAAAAACAACAGACCTTCATTCAGCTTAGGAACATAA',
 'ENST00000608817': 'AGGCAGAACAGGTGGAGAGGTATGAAGAACGAGGAACACTGCCCGGGGAGTTTCTTCCTCTGCAAAATAAGGGAGTGTGTACTGAATTACCGTTTCCAGCTCCAACATCCAGGATTTCAGCACTACCTGCAGAGTAGTGGGAGGAGAGATAGAGGAAGAAGTGAAGATAAGAAACCACTCGAAGCCGGTGTCTGGTGCTGGGACCGTGGTGGCTGGGACGGCAGCAGTCGTGCTGTCCATCTGCTCTTCCGAGGGGTGGCACATCCTAGCCTCTATCTTTTCCCCAGAGAAGATCCCCCTCGTCTGCTCTTCCCCCGTCTCAGCCTTTTAGTGTGTGAACAGTTCTGGTGTTACTCAGCAACGCTTTTGTTAGCGCCTTTACCAGCAAGTACTTGTTAA'}

In [171]:
# A context manager guarantees the pickle file is flushed and closed; the bare
# open() call used before left the handle open.
with open(f"{path_ahigh}/ahigh_id2all.pkl", 'wb') as fh:
    pickle.dump(id2all, fh)

## mRNA

In [16]:
path_mRNA = "/home/chensa/gb_data/data_for_analysis/mRNA/"
# Transcript id (text after the first '.' removed) -> upper-cased sequence.
records = SeqIO.parse(f"{path_mRNA}MANE.GRCh38.v1.3.ensembl_rna.fna", "fasta")
id2seq = {str(rec.id).split('.')[0]: str(rec.seq).upper() for rec in records}

In [None]:
!grep 'five_prime_UTR' "{path_mRNA}MANE.GRCh38.v1.3.ensembl_genomic.gff" > "{path_mRNA}five_prime_UTR.MANE.GRCh38.v1.3.ensembl_genomic.gff"

In [18]:
# Locate each protein on its transcript.
# id2all: transcript id -> {'rna': transcript sequence,
#                           'aa': nucleotide sequence of the coding region (despite the key name),
#                           'start': 0-based start of that region on the transcript}.
# notin_dict: transcript id -> protein sequence that matched no forward frame.
records = SeqIO.parse(f"{path_mRNA}MANE.GRCh38.v1.3.ensembl_protein.faa", "fasta")
notin_dict = {}
id2all = {}
for rec in records:
    # The transcript id follows "transcript:" in the description; the text
    # after the first '.' is dropped.
    tid = str(rec.description).split('transcript:')[1].split('.')[0]
    rnaseq = id2seq[tid]
    aaseq = str(rec.seq)
    # Try the three forward reading frames in order and stop at the first hit;
    # later frames are only translated when the earlier ones fail.
    for frame in range(3):
        frame_seq = rnaseq[frame:]
        # Drop a trailing partial codon so Biopython is handed whole codons only.
        frame_seq = frame_seq[:len(frame_seq) - len(frame_seq) % 3]
        # Single search: find() returns -1 when the protein is absent.
        idx = str(Seq(frame_seq).translate()).find(aaseq)
        if idx >= 0:
            start = idx*3 + frame
            orf = rnaseq[start:start+len(aaseq)*3]
            id2all[tid] = {'rna':rnaseq, 'aa':orf, 'start':start}
            break
    else:
        notin_dict[tid] = aaseq

In [178]:
# Proteins whose amino-acid sequence could not be matched to their transcript sequence
notin_dict

{'ENST00000361547': 'MGRARPGQRGPPSPGPAAQPPAPPRRRARSLALLGALLAAAAAAAVRVCARHAEAQAAARQELALKTLGTDGLFLFSSLDTDGDMYISPEEFKPIAEKLTGSCSVTQTGVQWCSHSSLQPQLPWLNUSSCLSLLRSTPAASCEEEELPPDPSEETLTIEARFQPLLPETMTKSKDGFLGVSRLALSGLRNWTAAASPSAVFATRHFQPFLPPPGQELGEPWWIIPSELSMFTGYLSNNRFYPPPPKGKEVIIHRLLSMFHPRPFVKTRFAPQGAVACLTAISDFYYTVMFRIHAEFQLSEPPDFPFWFSPAQFTGHIILSKDATHVRDFRLFVPNHRSLNVDMEWLYGASESSNMEVDIGYIPQMELEATGPSVPSVILDEDGSMIDSHLPSGEPLQFVFEEIKWQQELSWEEAARRLEVAMYPFKKVSYLPFTEAFDRAKAENKLVHSILLWGALDDQSCUGSGRTLRETVLESSPILTLLNESFISTWSLVKELEELQNNQENSSHQKLAGLHLEKYSFPVEMMICLPNGTVVHHINANYFLDITSVKPEEIESNLFSFSSTFEDPSTATYMQFLKEGLRRGLPLLQP',
 'ENST00000331835': 'MVAMAAGPSGCLVPAFGLRLLLATVLQAVSAFGAEFSSEACRELGFSSNLLCSSCDLLGQFNLLQLDPDCRGCCQEEAQFETKKLYAGAILEVCGUKLGRFPQVQAFVRSDKPKLFRGLQIKYVRGSDPVLKLLDDNGNIAEELSILKWNTDSVEEFLSEKLERI',
 'ENST00000305943': 'MALPAGPAEAACALCQRAPREPVRADCGHRFCRACVVRFWAEEDGPFPCPECADDCWQRAVEPGRPPLSRRLLALEEAAAAPARDGPASEAALQLLCRADAGPLCAACRMAAGPEPPEWEPRWRKALRGKENKGSVEIMRKDLNDARDLHGQAESAAAVWKGHVMDRRKKALTDYK

In [20]:
# A context manager guarantees the pickle file is flushed and closed; the bare
# open() call used before left the handle open.
with open(f"{path_mRNA}/mRNA_id2all.pkl", 'wb') as fh:
    pickle.dump(id2all, fh)