# In [90]:
from Bio import SeqIO

# Accumulators shared by every data source below: identifier -> sequence string.
rna_dict = {}
protein_dict = {}

# RefSeq
# https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/
# https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/
#
# Lookup table built from the tab-separated metadata file: the sixth column
# (the value later looked up with protein IDs) maps to the fifth column (used
# below as the transcript ID).
ref_seq_acc = {}
with open('hg19_refseq_metadata.txt', "r") as handle:
    next(handle, None)  # skip the header row without loading the whole file
    for line in handle:
        # Remove only the line terminator: a full strip() could eat leading
        # or trailing tabs and shift the columns, while leaving the newline
        # in place would corrupt the key whenever column six is the last one.
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines (e.g. at the end of the file)
        tokens = line.split('\t')
        ref_seq_acc[tokens[5]] = tokens[4]

# CDS sequences: keyed by the transcript ID with everything from the first
# '.' onwards removed.
with open('GRCh37_latest_cds.fna', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # The protein ID is the text between "protein_id=" and the next "]".
        _, tag, rest = record.description.partition("protein_id=")
        if not tag:
            # No protein_id tag in this header, so there is nothing to key
            # the sequence on; skip it instead of looking up an empty ID.
            continue
        protein_id = rest.partition("]")[0].strip()
        # An ID missing from the metadata table still raises KeyError on
        # purpose: that signals mismatched input files.
        transcript_id = ref_seq_acc[protein_id]
        rna_dict[transcript_id.split('.')[0]] = str(record.seq).strip()

# Protein sequences: the first word of the header is the protein ID.
with open('GRCh37_latest_protein.faa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        protein_id = record.description.split()[0].strip()
        transcript_id = ref_seq_acc[protein_id]
        protein_dict[transcript_id.split('.')[0]] = str(record.seq).strip()
        
# Ensembl
# ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/
#
# CDS records: the key is the first header word, cut at its first '.'.
with open('Homo_sapiens.GRCh37.cds.all.fa', "r") as cds_file:
    for entry in SeqIO.parse(cds_file, "fasta"):
        first_word = entry.description.split()[0]
        unversioned_id = first_word.partition('.')[0]
        rna_dict[unversioned_id] = str(entry.seq).strip()

# Peptide records: the key comes from the fifth whitespace-separated header
# field, taking the text after its first ':' and cutting at the first '.'.
# NOTE(review): presumably this is the "transcript:ENST..." field - confirm
# against the header layout of the downloaded file.
with open('Homo_sapiens.GRCh37.pep.all.fa', "r") as pep_file:
    for entry in SeqIO.parse(pep_file, "fasta"):
        header_fields = entry.description.split()
        protein = header_fields[4].split(':')[1]
        unversioned_id = protein.partition('.')[0]
        protein_dict[unversioned_id] = str(entry.seq).strip()
        
# UCSC
# https://genome-euro.ucsc.edu/cgi-bin/hgTables
#
# CDS records: the key is whatever follows the last '_' in the first header
# word.
with open('knownGene_cds.fna', "r") as cds_file:
    for entry in SeqIO.parse(cds_file, "fasta"):
        first_word = entry.description.split()[0]
        transcript_id = first_word.rsplit("_", 1)[-1]
        rna_dict[transcript_id] = str(entry.seq).strip()

# Protein records: the first header word is used as the key unchanged.
with open('knownGene_protein.faa', "r") as pep_file:
    for entry in SeqIO.parse(pep_file, "fasta"):
        transcript_id = entry.description.split()[0]
        protein_dict[transcript_id] = str(entry.seq).strip()
        
# Write dictionaries
# Each mapping is dumped to its own file as one "identifier:sequence" line
# per entry, in the mapping's iteration order.
for out_path, mapping in (
    ('hg19_cDNA_DICT.dict', rna_dict),
    ('hg19_PROTEIN_DICT.dict', protein_dict),
):
    with open(out_path, 'w') as out_file:
        out_file.writelines(
            "{}:{}\n".format(identifier, sequence)
            for identifier, sequence in mapping.items()
        )