In [1]:
!pip install Bio

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.3

In [3]:
# Mount Google Drive at /content/drive; the later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Smoke test: try writing a file to the mounted Drive folder
import time

file_path = "/content/drive/MyDrive/DataHuynhKom/test_write.txt"

# Append mode: re-running the cell adds rows instead of truncating the file.
with open(file_path, "a") as f:
    for i in range(10):
        row = f"Row {i + 1}: {time.ctime()}\n"
        f.write(row)
        # Flush after every row so each one is handed to the OS right away.
        f.flush()
        print(f"Line recorded {i + 1}")
        # Two-second pause between rows.
        time.sleep(2)
print(f"The file has been written to {file_path}")

In [None]:
import getpass
import os
import random
import time

from Bio import Entrez, SeqIO

# NCBI asks every E-utilities client to identify itself with a contact e-mail.
Entrez.email = "anhkhoa.24052003@gmail.com"
# Do not store the API key in the notebook: take it from the NCBI_API_KEY
# environment variable, otherwise prompt for it without echoing. A blank answer
# leaves the key unset (None), i.e. requests are sent without an API key.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and regenerated in the NCBI account settings.
Entrez.api_key = (
    os.environ.get("NCBI_API_KEY")
    or getpass.getpass("NCBI API key (leave blank to run without one): ").strip()
    or None
)

def fetch_sequences(amp_count=1000000, namp_count=1000000, output_file='/content/drive/MyDrive/DataHuynhKom/1_13AM_27_3_peptides.fasta'):
    """Download short peptide sequences from the NCBI "protein" database into a FASTA file.

    Two Entrez searches are run: one for entries matching "antimicrobial
    peptide" (labelled ``AMP``) and one for entries with "peptide" in the title
    that do not match it (labelled ``nAMP``). The hits are fetched in batches of
    200 IDs, and every sequence whose length is between 10 and 50 (inclusive)
    is written as a ``>{record.id} {label}`` header line followed by the sequence.

    Args:
        amp_count: Maximum number of AMP IDs to collect from the search.
        namp_count: Maximum number of nAMP IDs to collect from the search.
        output_file: Path of the FASTA file to write; an existing file is overwritten.

    Returns:
        None. Progress messages and a final summary are printed.
    """
    max_attempts = 5
    # A single ESearch response carries at most 10,000 IDs, so larger result
    # sets have to be paged through with retstart instead of one huge retmax.
    search_page_size = 10000
    fetch_batch_size = 200

    def backoff(attempt):
        """Sleep with exponential backoff plus random jitter before the next retry."""
        time.sleep(2 ** attempt + random.uniform(0, 1))

    def robust_esearch(db, term, retmax):
        """Return up to ``retmax`` IDs matching ``term``, paging through the result set.

        Each page is retried up to ``max_attempts`` times. If a page keeps
        failing, the IDs collected so far are returned (an empty list when the
        very first page fails).
        """
        ids = []
        while len(ids) < retmax:
            wanted = min(search_page_size, retmax - len(ids))
            page = None
            for attempt in range(max_attempts):
                try:
                    handle = Entrez.esearch(db=db, term=term, retstart=len(ids), retmax=wanted)
                    try:
                        record = Entrez.read(handle)
                    finally:
                        # Close the handle even when parsing the response fails.
                        handle.close()
                    page = list(record['IdList'])
                    break
                except Exception as e:
                    print(f"Esearch error: {e}. Retry ({attempt + 1}/{max_attempts})...")
                    backoff(attempt)
            if page is None:
                print(f"Esearch gave up after {max_attempts} attempts; keeping the {len(ids)} IDs found so far.")
                break
            ids.extend(page)
            if len(page) < wanted:
                # A short page means the result set is exhausted.
                break
        return ids

    def robust_efetch(db, id_list):
        """Fetch one batch as FASTA and return the parsed records, or None if every attempt fails.

        Parsing happens inside the retry loop because the response is read
        lazily: a connection error can surface while parsing, not only when
        the request is opened.
        """
        for attempt in range(max_attempts):
            try:
                fetch_handle = Entrez.efetch(db=db, id=','.join(id_list), rettype="fasta", retmode="text")
                try:
                    return list(SeqIO.parse(fetch_handle, "fasta"))
                finally:
                    fetch_handle.close()
            except Exception as e:
                print(f"Efetch error: {e}. Retry ({attempt + 1}/{max_attempts})...")
                backoff(attempt)
        return None

    print("Looking for AMP...")
    search_term_amp = "antimicrobial peptide[Title] OR antimicrobial peptide[All Fields]"
    amp_ids = robust_esearch(db="protein", term=search_term_amp, retmax=amp_count)
    print("Looking for nAMP...")
    search_term_namp = "peptide[Title] NOT (antimicrobial peptide[Title] OR antimicrobial peptide[All Fields])"
    namp_ids = robust_esearch(db="protein", term=search_term_namp, retmax=namp_count)
    all_ids = {'AMP': amp_ids, 'nAMP': namp_ids}

    # Only a running count is kept; the entries themselves go straight to disk.
    saved = 0
    with open(output_file, 'w', encoding='utf-8') as fasta_file:
        for label, id_list in all_ids.items():
            print(f"\nLoading {label} ({len(id_list)} peptides)...")
            for start in range(0, len(id_list), fetch_batch_size):
                batch_ids = id_list[start:start + fetch_batch_size]
                records = robust_efetch("protein", batch_ids)
                if records is None:
                    # Report the loss instead of dropping the batch silently.
                    print(f"Skipping {len(batch_ids)} {label} IDs after {max_attempts} failed attempts.")
                else:
                    for record in records:
                        if 10 <= len(record.seq) <= 50:
                            fasta_file.write(f">{record.id} {label}\n{record.seq}\n")
                            saved += 1
                # Short pause between batches.
                time.sleep(0.2)

    print(f"\nTook {saved} valid peptides (AMP: {len(amp_ids)}, nAMP: {len(namp_ids)}).")
    print(f"Data saved in {output_file}")
# Run the download with the default ID limits and the default output path.
fetch_sequences()

Looking for AMP...
Looking for nAMP...
