In [1]:
# 4.3.1 make_sequence: build a Biopython Seq object from a plain string.
from Bio.Seq import Seq

# The mixed-case text is stored as typed (the printed sequence keeps its case).
tatabox_seq = Seq('tataaaggcAATATGCAGTAG')
# First line: the sequence itself; second line: the class of the object holding it.
print(tatabox_seq, type(tatabox_seq), sep='\n')

tataaaggcAATATGCAGTAG
<class 'Bio.Seq.Seq'>


In [2]:
# Inspect the Seq class: list every attribute and method name it exposes.
seq_attribute_names = dir(Seq)
print('dir :', seq_attribute_names)
# help(Seq) shows the full class documentation; the call is left disabled here.
# print('help :', help(Seq))

dir : ['__abstractmethods__', '__add__', '__array_ufunc__', '__bytes__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_data', 'back_transcribe', 'complement', 'complement_rna', 'count', 'count_overlap', 'defined', 'defined_ranges', 'endswith', 'find', 'index', 'islower', 'isupper', 'join', 'lower', 'lstrip', 'replace', 'reverse_complement', 'reverse_complement_rna', 'rfind', 'rindex', 'rsplit', 'rstrip', 'split', 'startswith', 'strip', 'transcribe', 'translate', 'ungap', 'upper']


In [3]:
# 4.3.2 alphabet / IUPACData
# The book's original example attached an alphabet to the sequence:
#     from Bio.Alphabet import IUPAC
#     Seq('tataaaggcAATATGCAGTAG', IUPAC.unambiguous_dna)
# Bio.Alphabet has been removed from Biopython, so that form no longer works.
#
# Passing a letter string as Seq's second positional argument is NOT a
# replacement: that parameter is `length`, which is only consulted when the
# sequence data is None, so the value was silently ignored. The Seq is therefore
# built from the string alone and the letters are checked explicitly against the
# IUPAC unambiguous DNA letters.
from Bio.Seq import Seq
from Bio.Data import IUPACData

tatabox_seq = Seq('tataaaggcAATATGCAGTAG')

# IUPACData lists upper-case letters only, so compare case-insensitively.
unexpected_letters = set(str(tatabox_seq).upper()) - set(IUPACData.unambiguous_dna_letters)
if unexpected_letters:
    raise ValueError('not unambiguous DNA letters: ' + ''.join(sorted(unexpected_letters)))

print(tatabox_seq)
print(type(tatabox_seq))

tataaaggcAATATGCAGTAG
<class 'Bio.Seq.Seq'>


In [4]:
# 4.4.1 count: count how many times one base occurs in a Seq object.
exon_seq = Seq('ATGCAGTAG')
base_to_count = 'A'
count_a = exon_seq.count(base_to_count)
print(count_a)

3


GC-contents(%) = (C 염기수+G 염기수)/(전체 염기수)*100(%)

In [5]:
# 4.4.2 gc_contents: GC content (%) = (G count + C count) / sequence length * 100.
# exon_seq comes from the preceding count example.
g_count, c_count = (exon_seq.count(base) for base in ('G', 'C'))
gc_total = g_count + c_count
gc_contents = gc_total / len(exon_seq) * 100
print(gc_contents)

44.44444444444444


In [6]:
# 4.4.3 case: show a Seq object in all upper-case and in all lower-case letters.
upper_seq = tatabox_seq.upper()
lower_seq = tatabox_seq.lower()
print(upper_seq, lower_seq, sep='\n')

TATAAAGGCAATATGCAGTAG
tataaaggcaatatgcagtag


In [7]:
# 4.4.4 translate_transcribe: transcribe a DNA Seq to mRNA and translate it to protein.
dna = Seq('ATGCAGTAG')
# Both results are derived from the DNA object directly (translation does not go via mrna).
mrna, ptn = dna.transcribe(), dna.translate()
# In the protein line, '*' marks a stop codon.
print(mrna, ptn, sep='\n')

AUGCAGUAG
MQ*


In [8]:
# 4.4.5 translate_stop_1: translate an RNA sequence over its whole length.
mRNA = Seq('AUGAACUAAGUUUAGAAU')
# No to_stop argument is given, so stop codons show up as '*' inside the result.
ptn = Seq.translate(mRNA)
print(ptn)

MN*V*N


In [9]:
# 4.4.5 translate_stop_2: end translation at the first stop codon.
# to_stop=True keeps only the residues that precede the first stop codon.
translate_options = {'to_stop': True}
ptn = mRNA.translate(**translate_options)
print(ptn)

MN


In [10]:
# 4.4.6 split: cut the translated amino acid sequence at every stop codon ('*').
ptn = mRNA.translate()
print(ptn)
# Each piece between stop markers is printed on its own line.
peptides = ptn.split('*')
print(*peptides, sep='\n')

MN*V*N
MN
V
N


In [11]:
# 4.4.7 complement: complement and reverse complement of a DNA string in pure Python.
seq = 'TATAAAGGCAATATGCAGTAG'
# Lookup table pairing each base (key) with its complementary base (value).
comp_dic = dict(zip('ACGT', 'TGCA'))
# Swap every base for its partner, keeping the original left-to-right order.
comp_seq = ''.join(comp_dic[base] for base in seq)
# Reading the complement backwards gives the reverse complement.
revcomp_seq = ''.join(reversed(comp_seq))
print(comp_seq, revcomp_seq, sep='\n')

ATATTTCCGTTATACGTCATC
CTACTGCATATTGCCTTTATA


In [12]:
# 4.4.7 complement.bio: the same complement / reverse complement, using Biopython.
seq = Seq('TATAAAGGCAATATGCAGTAG')
comp_seq, rev_comp_seq = seq.complement(), seq.reverse_complement()
print(comp_seq, rev_comp_seq, sep='\n')

ATATTTCCGTTATACGTCATC
CTACTGCATATTGCCTTTATA


In [13]:
# 4.4.8 codonTable: print a codon table.
from Bio.Data import CodonTable

# Tables for unambiguous DNA are looked up by name.
dna_tables_by_name = CodonTable.unambiguous_dna_by_name
codon_table = dna_tables_by_name['Standard']
print(codon_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [14]:
# 4.4.8 codonTable.mitochondria: print the vertebrate mitochondrial codon table.
mito_table_name = 'Vertebrate Mitochondrial'
codon_table = CodonTable.unambiguous_dna_by_name[mito_table_name]
print(codon_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [15]:
# 4.4.9 orf_finder: find an ORF (start codon through the first in-frame stop codon).
#
# A plain find('TAG', start_idx) can match a TAG that straddles two codons and
# misses the other stop codons, and find() returns -1 when nothing matches, which
# would turn the slice below into nonsense. The scan therefore walks codon by
# codon from the ATG and handles the "not found" cases explicitly.
STOP_CODONS = ('TAA', 'TAG', 'TGA')

tatabox_seq = Seq('tataaaggcAATATGCAGTAG')
# Case-sensitive on purpose: only the upper-case part of the sequence is searched.
start_idx = tatabox_seq.find('ATG')
end_idx = -1  # index of the first base of the stop codon; -1 means "no ORF found"
if start_idx != -1:
    # Step through complete codons that follow the start codon, in its frame.
    for codon_start in range(start_idx + 3, len(tatabox_seq) - 2, 3):
        if str(tatabox_seq[codon_start:codon_start + 3]) in STOP_CODONS:
            end_idx = codon_start
            break

# Include the stop codon itself; fall back to an empty sequence when no ORF exists.
orf = tatabox_seq[start_idx:end_idx + 3] if end_idx != -1 else tatabox_seq[:0]
print(orf)

ATGCAGTAG


In [16]:
# 4.5.1 gc_contents: compute GC content (%) with Bio.SeqUtils.
# Bio.SeqUtils.GC is deprecated and no longer present in newer Biopython
# releases; gc_fraction is its replacement. gc_fraction returns a fraction
# between 0 and 1, so it is scaled by 100 to keep the result in percent.
from Bio.SeqUtils import gc_fraction

exon_seq = Seq('ATGCAGTAG')
gc_contents = gc_fraction(exon_seq) * 100
print(gc_contents)

44.44444444444444




In [17]:
# 4.5.2 calc_molecular_weight: compute sequence weights with Bio.SeqUtils.
# The book's original example tagged each Seq with an alphabet from Bio.Alphabet,
# which no longer exists. A second positional argument to Seq is not an alphabet
# (it is `length`, which has no effect when sequence data is supplied), so the
# sequences are built from the string alone and the molecule type is selected
# through molecular_weight's second argument instead.
from Bio.Data import IUPACData
from Bio.SeqUtils import molecular_weight

seq1 = Seq('ATGCAGTAG')
seq2 = Seq('ATGCAGTAG')
seq3 = Seq('ATGCAGTAG')

# The same letters are valid both as DNA bases and as one-letter amino acid
# codes, which is why seq3 can also be weighed as a protein below.
assert set(str(seq2)) <= set(IUPACData.unambiguous_dna_letters)
assert set(str(seq3)) <= set(IUPACData.protein_letters)

print(molecular_weight(seq1))             # no type given: same result as 'DNA'
print(molecular_weight(seq2, 'DNA'))
print(molecular_weight(seq3, 'protein'))  # letters are read as amino acids

2842.8206999999993
2842.8206999999993
707.7536


In [18]:
# 4.5.3 make_six_frame_translations: show every possible translation with Bio.SeqUtils.
from Bio.SeqUtils import six_frame_translations

seq1 = Seq('ATGCCTTGAAATGTATAG')
# The call returns a single formatted text report, which is printed as-is.
six_frame_report = six_frame_translations(seq1)
print(six_frame_report)

GC_Frame: a:6 t:6 g:4 c:2
Sequence: atgccttgaaatgtatag, 18 nt, 33.33 %GC


1/1
  A  L  K  C  I
 C  L  E  M  Y
M  P  *  N  V  *
atgccttgaaatgtatag   33 %
tacggaactttacatatc
G  Q  F  T  Y
 H  R  S  I  Y  L
  A  K  F  H  I




In [19]:
# 4.5.4 calc_melting_temperature: compute the Tm of a DNA sequence with Bio.SeqUtils.
from Bio.SeqUtils import MeltingTemp as mt

myseq = Seq('AGTCTGGGACGGCGCGGCAATCGCA')
wallace_tm = mt.Tm_Wallace(myseq)
print(wallace_tm)

84.0


In [21]:
# 4.5.5 convert_aminoacid_1to3 (book label): convert amino acid notation.
# Direction in this cell: three-letter abbreviations -> one-letter symbols
# (seq1 turns 'Leu' into 'L', and so on).
from Bio.SeqUtils import seq1

essential_amino_acid_3 = 'LeuLysMetValIleThrTrpPhe'
one_letter_codes = seq1(essential_amino_acid_3)
print(one_letter_codes)

LKMVITWF


In [22]:
# 4.5.5 convert_aminoacid_3to1 (book label): convert amino acid notation.
# Direction in this cell: one-letter symbols -> three-letter abbreviations
# (seq3 turns 'L' into 'Leu', and so on).
from Bio.SeqUtils import seq3

essential_amino_acid_1 = 'LKMVITWF'
three_letter_codes = seq3(essential_amino_acid_1)
print(three_letter_codes)

LeuLysMetValIleThrTrpPhe


In [23]:
# Exercise 1: turn the sequence into a Seq object and print it in upper case.
seq = Seq('aagtGACAGggatTG')
# Only the printed copy is upper-cased; seq itself still holds the text as typed.
upper_case_seq = seq.upper()
print(upper_case_seq)

AAGTGACAGGGATTG


In [24]:
# Exercise 2: translate up to (not including) the first stop codon.
# seq is rebound to its upper-case form before translating.
seq = seq.upper()
first_peptide = seq.translate(to_stop=True)
print(first_peptide)

K


In [27]:
# Exercise 3: GC content (%) and Tm of the reverse-complement sequence.
# gc_fraction is used here in place of the older GC helper; it is multiplied
# by 100 below to express the value in percent.
from Bio.SeqUtils import MeltingTemp as mt, gc_fraction

myseq = seq
revcomp_seq = myseq.reverse_complement()

gc_percent = gc_fraction(revcomp_seq) * 100
wallace_tm = mt.Tm_Wallace(revcomp_seq)
print(gc_percent, wallace_tm, sep='\n')

0.4666666666666667
44.0


In [31]:
# Exercise 4: use the codon tables to compare the stop codons of the standard
# (human) table and the vertebrate mitochondrial table.
from Bio.Data import CodonTable

dna_tables = CodonTable.unambiguous_dna_by_name
human_codon_table = dna_tables['Standard']
mitochondrial_codon_table = dna_tables['Vertebrate Mitochondrial']

# Full tables first, then only the stop codons of each.
for table in (human_codon_table, mitochondrial_codon_table):
    print(table)
print('human_stop_codons :', human_codon_table.stop_codons)
print('mitochondrial_stop_codons :', mitochondrial_codon_table.stop_codons)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------