In [10]:
# First time exploring Biopython. Will explore some of its features using both COX1 amino acid sequences 
# and Chlamydia trachomatis DNA sequences
# Starting by just printing the IDs and sequences separately and saving them into lists
from Bio import SeqIO

cox_id_list = []
cox_seq_list = []

aa_sequences = SeqIO.parse(open('cox1.fasta'),'fasta')
for fasta in aa_sequences:
    print(fasta.id)
    print(fasta.seq)
    
    cox_id_list.append(fasta.id)
    cox_seq_list.append(fasta.seq)

COX1_human
MFADRWLFSTNHKDIGtLyLLFGAWAGVLGtALsLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPtGVKVFSWLATLHGsNMKWSAAVLWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIGVNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSMNLEWLYGCPPPYHTFEEPVYMKS-
COX1_mouse
MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMVMPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTVGGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVGVNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLT

In [11]:
print(cox_id_list)
print(cox_seq_list)

['COX1_human', 'COX1_mouse', 'COX1_rabbit', 'COX1_cow']
[Seq('MFADRWLFSTNHKDIGtLyLLFGAWAGVLGtALsLLIRAELGQPGNLLGNDHIY...KS-'), Seq('MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIY...KVK'), Seq('MFVNRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIY...KA-'), Seq('MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIY...NLK')]


In [12]:
# They can also be saved into a dictionary
record_dict = SeqIO.to_dict(SeqIO.parse("cox1.fasta", "fasta"))
print(record_dict["COX1_human"])

ID: COX1_human
Name: COX1_human
Description: COX1_human
Number of features: 0
Seq('MFADRWLFSTNHKDIGtLyLLFGAWAGVLGtALsLLIRAELGQPGNLLGNDHIY...KS-')


In [13]:
# Using alignment features to align and compare protein sequences
from Bio import AlignIO

align = AlignIO.read("cox1.fasta", "fasta")

print(align)

Alignment with 4 rows and 514 columns
MFADRWLFSTNHKDIGtLyLLFGAWAGVLGtALsLLIRAELGQP...KS- COX1_human
MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQP...KVK COX1_mouse
MFVNRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQP...KA- COX1_rabbit
MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQP...NLK COX1_cow


In [14]:
from Bio.Align import PairwiseAligner

aligner = PairwiseAligner()

alignments = aligner.align(record_dict["COX1_human"], record_dict["COX1_mouse"])

alignment = alignments[0]

print(alignment)

COX1_huma         0 MFAD--RWLFSTNHKDIGt-Ly-LLFGAWAG-VLGt-ALs--LLIRAELGQPGN-LLGND
                  0 ||----||||||||||||--|--||||||||-|-|--||---|-|||||||||--|||-|
COX1_mous         0 MF--INRWLFSTNHKDIG-TL-YLLFGAWAGMV-G-TAL-SIL-IRAELGQPG-ALLG-D

COX1_huma        51 H--IYNVIVTAHAFVMIFFMVMPIM-IGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPS
                 60 ---||||||||||||||||||||-|-||||||||||||||||||||||||||||||||||
COX1_mous        50 -DQIYNVIVTAHAFVMIFFMVMP-MMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPS

COX1_huma       108 -LLLLLASA-MVEAGAGTGWTVYPPLAGNYS--HP-GASVDLTIFSLHLAGVSSILGAIN
                120 -||||-||--|||||||||||||||||||----|--||||||||||||||||||||||||
COX1_mous       108 FLLLL-AS-SMVEAGAGTGWTVYPPLAGN--LAH-AGASVDLTIFSLHLAGVSSILGAIN

COX1_huma       163 FITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPA
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
COX1_mous       163 FITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPA

COX1_huma       223 GGGD

In [15]:
# Using a list to look at individuals IDs and sequences of a Chlymadia DNA sequence
# Will split it into codons
records = list(SeqIO.parse("Chlamydia_trachomatis_sequences.fa", "fasta"))
print(records[0].id)
print(records[0].seq)

O169_04115-1
TTTTCTGAGAATTTGATCTTGGTTCAGATTGAACGCTGGCGGCGTGGATGAGGCATGCAAGTCGAACGGAGCAATTGTTTCGGCAATTGTTTAGTGGCGGAAGGGTTAGTAATGCATAGATAATTTGTCCTTAACTTGGGAATAACGGTTGGAAACGGCCGCTAATACCGAATGTGGCGATATTTGGGCATCCGAGTAACGTTAAAGAAGGGGATCTTAGGACCTTTCGGTTAAGGGAGAGTCTATGTGATATCAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCTATGACGTCTAGGCGGATTGAGAGATTGGCCGCCAACACTGGGACTGAGACACTGCCCAGACTCCTACGGGAGGCTGCAGTCGAGAATCTTTCGCAATGGACGGAAGTCTGACGAAGCGACGCCGCGTGTGTGATGAAGGCTCTAGGGTTGTAAAGCACTTTCGCTTGGGAATAAGAGAAGACGGTTAATACCCGCTGGATTTGAGCGTACCAGGTAAAGAAGCACCGGCTAACTCCGTGCCAGCAGCTGCGGTAATACGGAGGGTGCTAGCGTTAATCGGATTTATTGGGCGTAAAGGGCGTGTAGGCGGAAAGGTAAGTTAGTTGTCAAAGATCGGGGCTCAACCCCGAGTCGGCATCTAATACTATTTTTCTAGAGGATAGATGGAGAAAAGGGAATTTCACGTGTAGCGGTGAAATGCGTAGATATGTGGAAGAACACCAGTGGCGAAGGCGCTTTTCTAATTTATACCTGACGCTAAGGCGCGAAAGCAAGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCTTGCCGTAAACGATGCATACTTGATGTGGATGGTCTCAACCCCATCCGTGTCGGAGCTAACGCGTTAAGTATGCCGCCTGAGGAGTACACTCGCAAGGGTGAAACTCAAAAGAATTGACGGGGGCCCGCACAAGCAGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGG

In [16]:
# Transcribe it into RNA
from Bio.Seq import Seq 
from Bio.Seq import transcribe 

# Transcribing to RNA and saving into dictionary to make access easier

ct_id_list = []
ct_seq_list = []

dna_sequences = SeqIO.parse(open('Chlamydia_trachomatis_sequences.fa'),'fasta')
for fasta_ct in dna_sequences:
    ct_id_list.append(fasta_ct.id)
    ct_seq_list.append(transcribe(fasta_ct.seq))

seq_dict = dict(zip(ct_id_list, ct_seq_list))


In [17]:
print(seq_dict["O169_04115-1"])  

UUUUCUGAGAAUUUGAUCUUGGUUCAGAUUGAACGCUGGCGGCGUGGAUGAGGCAUGCAAGUCGAACGGAGCAAUUGUUUCGGCAAUUGUUUAGUGGCGGAAGGGUUAGUAAUGCAUAGAUAAUUUGUCCUUAACUUGGGAAUAACGGUUGGAAACGGCCGCUAAUACCGAAUGUGGCGAUAUUUGGGCAUCCGAGUAACGUUAAAGAAGGGGAUCUUAGGACCUUUCGGUUAAGGGAGAGUCUAUGUGAUAUCAGCUAGUUGGUGGGGUAAAGGCCUACCAAGGCUAUGACGUCUAGGCGGAUUGAGAGAUUGGCCGCCAACACUGGGACUGAGACACUGCCCAGACUCCUACGGGAGGCUGCAGUCGAGAAUCUUUCGCAAUGGACGGAAGUCUGACGAAGCGACGCCGCGUGUGUGAUGAAGGCUCUAGGGUUGUAAAGCACUUUCGCUUGGGAAUAAGAGAAGACGGUUAAUACCCGCUGGAUUUGAGCGUACCAGGUAAAGAAGCACCGGCUAACUCCGUGCCAGCAGCUGCGGUAAUACGGAGGGUGCUAGCGUUAAUCGGAUUUAUUGGGCGUAAAGGGCGUGUAGGCGGAAAGGUAAGUUAGUUGUCAAAGAUCGGGGCUCAACCCCGAGUCGGCAUCUAAUACUAUUUUUCUAGAGGAUAGAUGGAGAAAAGGGAAUUUCACGUGUAGCGGUGAAAUGCGUAGAUAUGUGGAAGAACACCAGUGGCGAAGGCGCUUUUCUAAUUUAUACCUGACGCUAAGGCGCGAAAGCAAGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCUUGCCGUAAACGAUGCAUACUUGAUGUGGAUGGUCUCAACCCCAUCCGUGUCGGAGCUAACGCGUUAAGUAUGCCGCCUGAGGAGUACACUCGCAAGGGUGAAACUCAAAAGAAUUGACGGGGGCCCGCACAAGCAGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGGACCUUACCUGGGU

In [18]:
# Can be translated into protein but we see that protein sequence isn't in perfect
# multiples of 3
# What's trailing?
print(seq_dict["O169_04115-1"].translate())  

FSENLILVQIERWRRG*GMQVERSNCFGNCLVAEGLVMHR*FVLNLGITVGNGR*YRMWRYLGIRVTLKKGILGPFG*GRVYVISASWWGKGLPRL*RLGGLRDWPPTLGLRHCPDSYGRLQSRIFRNGRKSDEATPRV**RL*GCKALSLGNKRRRLIPAGFERTR*RSTG*LRASSCGNTEGASVNRIYWA*RACRRKGKLVVKDRGSTPSRHLILFF*RIDGEKGISRVAVKCVDMWKNTSGEGAFLIYT*R*GAKARGANRIRYPGSPCRKRCILDVDGLNPIRVGANALSMPPEEYTRKGETQKN*RGPAQAVEHVV*FDATRRTLPGFDMYMTAAEMSFSARTYTQVLHGCRQLVP*GVGLSPATSATLIVSCQHLGWEL*RDCLG*PGGRRG*RQVSMALMPRATHVLQWPVQKVARS*DGANPQSWPQFGL*SATRLHEVGIASNGVSAITP*IRSRALYTPPVTSWELVLP*VVDSTRKGERRPR*G**LG*SRNKVALPEGGAGSPPF




In [19]:
# Creating the following function to split the Chlamydia DNA sequence into codons
#
def split_into_codons(sequence):
    lstring=str(sequence)
    begin_at_start_codon = 'AUG'+lstring.split("AUG", 1)[1]
    lenstr=int(len(begin_at_start_codon)/3)
    x=0
    y=3
    codon_list = []
    for codons in range (0,lenstr):
        newlist=list(begin_at_start_codon)
        evenewer=newlist[x:y]
        stringit=''.join(evenewer)
        x+=3
        y+=3
        codon_list.append(stringit)
    return codon_list
    #print(codon_list)

In [20]:
#print(lstring.split("AUG", 1)[1])
#print('AUG'+lstring.split("AUG", 1)[1])

In [21]:
# Running this function-using Biopython-created dictionary
# Will use this to make sure that sequences begin at a start codon and any trailing nucleotides are cut off
list_of_codons = split_into_codons(seq_dict["O169_04115-1"])


In [22]:
# Now we can translate
rna_str = Seq(str(''.join(list_of_codons)))
rna_str.translate()

# Alternatively, could use a loop to run all sequences rather than running them through the function
# one at a time, but that won't be part of this exploration

Seq('MRHASRTEQLFRQLFSGGRVSNA*IICP*LGNNGWKRPLIPNVAIFGHPSNVKE...SF*')