In [1]:
import os
from Bio.Align.Applications import ClustalwCommandline
from Bio import Entrez 
from Bio import SeqIO 
from Bio.Align.Applications import MuscleCommandline
from Bio import Phylo

import pandas as pd 

def fetch_record(genbank_id):
    """Fetch the entry for *genbank_id* from NCBI and return it parsed.

    Args:
        genbank_id: Identifier passed as ``id`` to ``Entrez.efetch`` against
            the ``nucleotide`` database.

    Returns:
        The object produced by ``Entrez.read`` for the GenBank XML response.
    """
    # Contact address sent along with the E-utilities request.
    Entrez.email = "dnjs0148@naver.com"
    handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="xml")
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even when parsing the response fails.
        handle.close()

def get_sequence(records):
    """Return the sequence string stored in the first parsed record.

    Args:
        records: Indexable collection whose first element maps the key
            ``"GBSeq_sequence"`` to the sequence.

    Returns:
        The value stored under ``"GBSeq_sequence"`` in ``records[0]``.
    """
    first_record = records[0]
    return first_record["GBSeq_sequence"]

#=========================== Write every virus nucleotide sequence to the covid.fasta file. =============================
def write_sequence(genbank_id, records):
    """Append the first record in *records* to ``covid.fasta`` as a FASTA entry.

    Args:
        genbank_id: Identifier shown in the progress message; it is not
            written to the file.
        records: Indexable collection whose first element provides the keys
            ``"GBSeq_accession-version"``, ``"GBSeq_definition"`` and
            ``"GBSeq_sequence"``.
    """
    print("## processing: " + genbank_id)
    entry = records[0]
    accession = entry["GBSeq_accession-version"]
    definition = entry["GBSeq_definition"]
    sequence = entry["GBSeq_sequence"]

    # Append mode: every call adds one more entry to the same file.
    with open("covid.fasta", "a") as fasta_file:
        # Header line: ">" + accession, a space, then the definition.
        fasta_file.write("".join([">", accession, " ", definition, "\n"]))
        # The whole sequence goes on a single line.
        fasta_file.write("".join([sequence, "\n"]))
        
def read_table(file):
    """Load a tab-separated table into a pandas DataFrame.

    Args:
        file: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame parsed with ``"\\t"`` as the field separator.
    """
    return pd.read_csv(file, sep="\t")

#============================ Ask for `number` GenBank IDs and save them. ========================
# Gather every answer before opening the file, so that an invalid number or
# an interrupted prompt cannot leave an open or half-written list behind.
a = int(input('number : '))
genbank_id = [input('name : ').strip() for _ in range(a)]

# "w" instead of "a": the header row is written on every run, so appending
# would add a second header and keep the IDs of earlier runs in the table.
with open("covidlist.txt", 'w') as f:
    f.write("GenBank"+"\t"+"RefSeq"+"\t"+"Gene Region"+"\n")
    for accession in genbank_id:
        # The RefSeq column is left empty; each entry is marked "complete".
        f.write(accession+"\t"+"\t"+"complete"+"\n")

#========== Write the sequence of every GenBank ID listed in the txt file to covid.fasta. ===========
file = "covidlist.txt"
df = read_table(file)
# Only rows whose "Gene Region" column is exactly "complete" are used.
complete_df = df[df["Gene Region"]=="complete"]
# A blank ID is read back by pandas as NaN, and a repeated ID would be
# downloaded and written twice; drop both before fetching.
genbank_id_list = complete_df["GenBank"].dropna().drop_duplicates()

# write_sequence() appends to covid.fasta, so start from an empty file;
# otherwise each re-run would repeat earlier entries in the alignment input.
with open("covid.fasta", "w"):
    pass

for genbank_id in genbank_id_list:
    records = fetch_record(genbank_id)
    write_sequence(genbank_id, records)

# Print the generic form of the ClustalW command line for reference.
cline = ClustalwCommandline("clustalw2", infile="covid.fasta")
print(cline)

# Location of the ClustalW executable on this machine.
clustalw_exe = r"C:\Users\dnjs0\clustalw2.exe"
# Alternative install location:
# clustalw_exe = r"C:\Program Files (x86)\ClustalW2\clustalw2.exe"

# Check the path before building the command. An explicit raise is used
# because assert statements are skipped when Python runs with -O.
if not os.path.isfile(clustalw_exe):
    raise FileNotFoundError("Clustal W executable missing: " + clustalw_exe)
clustalw_cline = ClustalwCommandline(clustalw_exe, infile="covid.fasta")

# Run the alignment; the captured console output is kept but not used below.
stdout, stderr = clustalw_cline()

# Print the contents of covid.aln; the with-block closes the file afterwards.
with open('covid.aln', 'r') as f:
    for line in f:
        # Each line already ends with its own newline, so suppress the one
        # print() would add; otherwise every row is followed by a blank line.
        print(line, end="")

# Parse covid.dnd as a Newick tree and render it as ASCII art.
tree = Phylo.read(file="covid.dnd", format="newick")
Phylo.draw_ascii(tree)

number : 9
name : NC_045512
name : NC_004718
name : NC_019843
name : KU291448
name : KF530114
name : KU131570
name : MH940245
name : NC_001451
name : NC_010646
## processing: NC_045512
## processing: NC_004718
## processing: NC_019843
## processing: KU291448
## processing: KF530114
## processing: KU131570
## processing: MH940245
## processing: NC_001451
## processing: NC_010646
clustalw2 -infile=covid.fasta
CLUSTAL 2.1 multiple sequence alignment





NC_045512.2      ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTT--CGATCTCTTGT-AGA

NC_004718.3      ATATTAGGTTTTTACCTACCCAGGAAA--AGCCAACCAACCT---CGATCTCTTGT-AGA

KU131570.1       -----CTCTTCCGATCTAATTGCGTGC--GTGCAACCCGCTTCACTGATCTCTTGTTAGA

MH940245.1       ------------GATTGACGTTCGTAC--CGTCTATCAGCTT--ACGATCTCTTGTCAGA

KU291448.1       ------------------------------------------------------------

KF530114.1       ------------------------------------------------------------

NC_019843.3      -----------------------------------------------------

NC_045512.2      G----TAGCGTCACTTATCAACACACTTAACGATCTA--AATGAAACTCTTGTTACAATG

NC_004718.3      G----TAGCTTCTATTATTACGAAGCTGAACTCTCTA--AATGAGCCGCTTGTCACAATG

KU131570.1       GATCTTATTTCTAAGTGTCAGATAACTGCTGTTGAGGGCACTAAGAAATTGGCAGCGCGT

MH940245.1       GATATTATTCAAAAATGTCAAATTACTTCAGTTGTTGGTACTAAAGCATTGGCTGTTAGA

KU291448.1       GA---TGGTTTTGGTGTTAACAAGATTCAGCCCTGGACAAATGATGCACTTAATACTATT

KF530114.1       GA---TGGTTTTGGTGTCTTTGACATTAA---------AAATAGTAGTGTTAATACTGTT

NC_019843.3      CGT-CTAAGGACACTTTAGATGATATCTTACAACAGGCTAATAAGTCTGTTGGTATTATA

NC_001451.1      GA----AGTTGCAA-----AAGAAACAGCAGAAGAGGTTGATGAGTTTATTCTCATTTTT

NC_010646.1      ACTGGTCCTTATAATTGTAGTATGTATGGAGTAAAGGCTATTAGAAATGCTGTTGCACCC

                                                          *                  



NC_045512.2      CCACTTGGCTATGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCT

NC_004718.3      CCAATTGGTTATGTGACACATGGTTTTAATCTTGAAGAGGCTGCGCGCTGTATGCGTTCT

KU131570.1       CTTTCTTTTAACGTTGGACGTTCCATTGTTTAT


NC_004718.3      TATAACAGGTATCTTGCTCTATATAACAAGTACAAGTATTTCAGTGGAGCCTTAGATACT

KU131570.1       TTTAATAGATATTTGAGTTTGTATAATAAATATAGGTATTACAGCGGTAAAATGGACACT

MH940245.1       TACAATAGATATTTGAGTTTGTATAATAAGTATCGTTACTATAGTGGTAAAATGGATACT

KU291448.1       TTGAAAAGTTATGCTGCTAGCTATAATAGATACAAGTACTATAGTGGTAATGCAAATGAA

KF530114.1       CTTAAGAATTATGCTGCAAGTTATAATAAATATAAATATTATAGTGGTAGTGCTAGTGAG

NC_019843.3      TATTCACGATTTTTGGGGTTGTTTAACAAGTATAAGTACTTCTCTGGTGCTATGGAAACA

NC_001451.1      TTTGAGGCCTACCTTTCAGCGTATGCTAGATTAAAGTACTATTCAGGCACTGGCAGTGAA

NC_010646.1      TTCCAAAAGTATCTTTCTGGTTATGCTAGGCTGAAGTATTATTCAGGTTCTGGTGGTGAC

                          *           * *   *        ** *     **             



NC_045512.2      ACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCA---GT

NC_004718.3      ACCAGCTATCGTGAAGCAGCTTGCTGCCACTTAGCAAAGGCTCTAAATGACTTTA---GC

KU131570.1       GCTGCATATAGGGAGGCTGCTTGCTCTCAGTTGGCTAAGGCAATGGACACATTTACCAAT

MH940245.1       GCTGCCTATAGAGAAGCGGCGTGTTCTCAGTT

NC_045512.2      TTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGA------------AAAGGACG

NC_004718.3      TTGCAAAGTTCCTAAAAACTAATTGCTGTCGCTTCCAGGA------------GAAGGATG

KU131570.1       TTGGTTTACATTTAAAAGTTAATTGTTGCCGTTTTCAGCG------------TGTTGATG

MH940245.1       TAGGTTTATATTATAAAGTGAATTGTTGCCGTTTTCAGCG------------TATAGATG

KU291448.1       TCGGAAAAAATCTGAAGTCCAATTGTGTGCGCTTCAAGAA------------TGTAGATA

KF530114.1       TGGGTAAGTGTTTGAAGATGAACTGTGTTCGTTTTAAAAA------------CGCTGATC

NC_019843.3      TTGGAAAATACTACAAGACTAATACTTGTAGGTTTGTAGA------------ATTAGATG

NC_001451.1      TGTTTCAAAATTTGAAGCGTAACTGCGCTAGATTCCAGGAACTACGCGATACTGAAGATG

NC_010646.1      TGTATGCTTCTTTAAAACATAATTGTGCCCGCTTTCAAGA------------GCTGGACG

                 *             **    **        * **                      **  



NC_045512.2      AAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTACC

NC_004718.3      AGGAAGGCAATTTATTAGACTCTTACTTTGTAGTTAAGAGGCATACTATGTCTAACTACC

KU131570.1       AGAACGGTGATAAATTAGATCAGTTCTTTGTTG

KU291448.1       TGGTAACACTGATAATTTAGTCTTTGTTAACAAAACATCACTGCCTACAAATATAGCATT

KF530114.1       TGGCGATGTTGACAACTTGGTTTTTACAAATAAAACAACATTGCCTACTAATGTTGCTTT

NC_019843.3      TGGCGTTAATGACATTTGTATGTTTGAGAATAAAACCACTTTGCCTACTAATATAGCTTT

NC_001451.1      TCAAGGTGTAGAAAAGGCAGTTTTTGTTAATCAAACAACTCTACCTACATCTGTGGCGTT

NC_010646.1      GGGGCTTCAAGATGTTTTGATTTTTGAGAACAAGACTAGTATGGCTACTAGTATTGCTTT

                           **        * ***   **  * **        **     * * **  *



NC_045512.2      TGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTT

NC_004718.3      TGAGCTTTGGGCTAAGCGTAACATTAAACCAGTGCCAGAGATTAAGATACTCAATAATTT

KU131570.1       TGAATTATTTGCCAAGCGCAGTGTTCGACACCACCCAGAGCTTAAGCTCTTTAGAAATTT

MH940245.1       TGAATTATTTACAAAACGTAGTATTCGCCATCACCCTGAACTTAAGATTCTTAGAAATTT

KU291448.1       TGAACTTTTTGCTAAGAGGAAGGTTGGTTTAACACCACCCCTCAGTATTCTCAAAAACCT

KF530114.1       TGAATTGTTTGCAAAACGAAAAATGGGTTTAACACCACCATTGTCTATTCTCAAAAATCT

NC_019843.3      TGAACTCTATGCTAAGCGTGCTGTACGCTCGCA

KU291448.1       TAAGACGCTGCAAGAATTAAGTTACAAATTGCCAAATTACACTGTTCCAGACCTAGTTGT

KF530114.1       TAAAACATTACAAGAGTTTGCACAAAACTTACCAAAGTATGTTAAGCCTAATTTTGACTT

NC_019843.3      --ACTTCCAAGATGAGTTGG---ATGAGTTTTTCAAAAATGTTAGCACCAGTATACCTAA

NC_001451.1      TGACGAATTGTCAAAATGG---TGGAATGATACTAAGCATGAGCTACCAGACTTTGACAA

NC_010646.1      TAAAGAGTTTGATAAATTG---TATGAGAAATGGAACTATACGCTCGAGGAGTTGGAGAA

                               * *                  *                 *      



NC_045512.2      CATTAATGCTTCAGTTGTAAACA----TTCAAAAAGAAAT--TGACCGCCTCAATGAGGT

NC_004718.3      CATTAACGCTTCTGTCGTCAACA----TTCAAAAAGAAAT--TGACCGCCTCAATGAGGT

KU131570.1       TTTTAGAGACTACATAACAAACAGAACTTTTATGATTCGTAGTTGCTATAGCGGTCGTGT

MH940245.1       TTTTAAAGATTTTTTGACTAATAAAACTTACACTATACTTCCTTGTTATTCTGGTAGAGT

KU291448.1       TGAACAGTACAACCAGACTATTTTGAATTTGACCAGTGAAATTAGCACCCTTGAAAATAA

KF530114.1       GACTCCTTTTAATTTAACATATCTTAATTTGAGTTCTGAGTTGAAGCAACTCGAAGCTAA

NC_019843.3      TTTTGGTTCC-CTAACACAGATTAATACTACAT

NC_004718.3      ------------------------------------------------------------

KU131570.1       ------------------------------------------------------------

MH940245.1       ------------------------------------------------------------

KU291448.1       ------------------------------------------------------------

KF530114.1       ------------------------------------------------------------

NC_019843.3      ------------------------------------------------------------

NC_001451.1      ------------------------------------------------------------

NC_010646.1      CATATGTTACCTGGGTGAAAGGAGATGGAGCGAAGACCAGTCAGTTGTCTGAAGCAGGTT

                                                                             



NC_045512.2      ------------------------------------------------------------

NC_004718.3      ------------------------------------------------------------

KU131570.1       ------------------------------------------------------------

MH940245.1       ---------------------------------