In [1]:
import sys, re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import single_letter_alphabet

In [2]:
def read_fasta(fastaFile):
    """Parse a FASTA file into a list of records.

    Parameters
    ----------
    fastaFile : str
        Path of the FASTA file to read.

    Returns
    -------
    list
        Everything yielded by ``SeqIO.parse(handle, "fasta")``, in file order.
    """
    # Plain "r" replaces the legacy "rU" mode: "U" is deprecated in Python 3
    # and rejected outright by recent versions. The ``with`` block also closes
    # the handle when parsing raises part-way through the file.
    with open(fastaFile, "r") as handle:
        return list(SeqIO.parse(handle, "fasta"))

In [3]:
# Load every record from the transcript FASTA file into memory as a list.
fastaFile = "../data/sequence/hg38_CAR_transcripts.fa"
fastaRecords = read_fasta(fastaFile)

In [4]:
# Display the first parsed record as a quick sanity check of the load.
fastaRecords[0]

SeqRecord(seq=Seq('ATGCTGCTGCTGGTGACCAGCCTGCTGCTGTGCGAGCTGCCCCACCCCGCCTTT...TGA', SingleLetterAlphabet()), id='CAR-1', name='CAR-1', description='CAR-1 gene=CAR', dbxrefs=[])

In [5]:
# Gene symbols to keep; compared against the "gene=<symbol>" tag found in each
# record's description.
geneList = [
    "CAR",
    "CSF2",
    "CD28",
    "TNFRSF9",
    "CD247",
    "EGFR",
]

In [8]:
# Keep the records whose description carries a "gene=<symbol>" tag naming one
# of the genes in geneList and whose name starts with "NM".
# The pattern is a raw string so that "\w" reaches the regex engine instead of
# being read as a (invalid) string escape. Records with no "gene=" tag are
# skipped rather than raising AttributeError on a None match.
genePattern = re.compile(r'(?<=gene=)\w*')
fastaSubset = []
for r in fastaRecords:
    geneMatch = genePattern.search(r.description)
    if geneMatch is None:
        continue
    if geneMatch.group() in geneList and r.name.startswith('NM'):
        fastaSubset.append(r)

In [9]:
# Display the filtered records to check which transcripts were selected.
fastaSubset

[SeqRecord(seq=Seq('CAAGGAGGGATCCCACAGATGTCACAGGGCTGTCACAGAGCTGTGGTGGGAATT...AGA', SingleLetterAlphabet()), id='NM_001561', name='NM_001561', description='NM_001561 gene=TNFRSF9 CDS=262-1026', dbxrefs=[]),
 SeqRecord(seq=Seq('TGCTTTCTCAAAGGCCCCACAGTCCTCCACTTCCTGGGGAGGTAGCTGCAGAAT...GCA', SingleLetterAlphabet()), id='NM_000734', name='NM_000734', description='NM_000734 gene=CD247 CDS=146-634', dbxrefs=[]),
 SeqRecord(seq=Seq('TGCTTTCTCAAAGGCCCCACAGTCCTCCACTTCCTGGGGAGGTAGCTGCAGAAT...GCA', SingleLetterAlphabet()), id='NM_198053', name='NM_198053', description='NM_198053 gene=CD247 CDS=146-637', dbxrefs=[]),
 SeqRecord(seq=Seq('TAAAGTCATCAAAACAACGTTATATCCTGTGTGAAATGCTGCAGTCAGGATGCC...gaa', SingleLetterAlphabet()), id='NM_001243077', name='NM_001243077', description='NM_001243077 gene=CD28 CDS=223-592', dbxrefs=[]),
 SeqRecord(seq=Seq('TAAAGTCATCAAAACAACGTTATATCCTGTGTGAAATGCTGCAGTCAGGATGCC...gaa', SingleLetterAlphabet()), id='NM_001243078', name='NM_001243078', description='NM_001243078 gen

In [10]:
# Make each selected record's id match its name before writing.
# The per-record gene lookup that used to sit in this loop was never used; it
# only added a second copy of the regex (with an invalid "\w" string escape)
# and a possible AttributeError on records lacking a "gene=" tag.
for record in fastaSubset:
    record.id = record.name

In [20]:
def write_fasta(fastaRecords, fastaOutFile, wrap=70):
    """Write records to a FASTA file with wrapped sequence lines.

    Parameters
    ----------
    fastaRecords : iterable
        Records to write, passed unchanged to ``FastaWriter.write_file``.
    fastaOutFile : str
        Path of the output file; an existing file is overwritten.
    wrap : int, optional
        Sequence line width handed to ``FastaWriter`` (default 70, the value
        this function always used before the parameter existed).
    """
    # Text mode: the writer emits str, which a binary ('wb') handle rejects
    # with TypeError on Python 3.
    with open(fastaOutFile, 'w') as f:
        writer = SeqIO.FastaIO.FastaWriter(f, wrap=wrap)
        writer.write_file(fastaRecords)

In [21]:
# Write the filtered records to the output FASTA file.
write_fasta(fastaSubset, '../data/sequence/car_parts_overlap.fa')

In [11]:
# NOTE(review): this looks like a leftover scratch cell. write_fasta passes a
# file handle to FastaWriter, whereas this call passes no arguments and
# presumably raises TypeError -- confirm, then remove or replace the cell.
# The execution count is out of order and the output shown beneath is a
# SeqRecord, so that output was likely produced by different code.
SeqIO.FastaIO.FastaWriter()

SeqRecord(seq=Seq('TGCTTTCTCAAAGGCCCCACAGTCCTCCACTTCCTGGGGAGGTAGCTGCAGAAT...GCA', SingleLetterAlphabet()), id='NM_000734', name='NM_000734', description='NM_000734 gene=CD247 CDS=146-634', dbxrefs=[])

In [22]:
# Display the first record of the full (unfiltered) list again, after the
# subset has been relabelled and written out.
fastaRecords[0]

SeqRecord(seq=Seq('ATGCTGCTGCTGGTGACCAGCCTGCTGCTGTGCGAGCTGCCCCACCCCGCCTTT...TGA', SingleLetterAlphabet()), id='CAR-1', name='CAR-1', description='CAR-1 gene=CAR', dbxrefs=[])