In [5]:

# pip install selenium webdriver-manager biopython requests

import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from Bio.PDB import PDBParser, PPBuilder
from Bio import pairwise2

# Fetch PDB IDs from OPM
def get_pdb_ids_from_opm(url="https://opm.phar.umich.edu/protein_classes/1", count=50):
    """Scrape up to ``count`` PDB IDs from an OPM listing page.

    A headless Chrome session loads ``url`` and repeatedly reads the text of
    the third column of every table row currently in the DOM, scrolling the
    window between passes, until ``count`` distinct IDs were seen or 100
    passes have been made.

    Args:
        url: Page to scrape.
        count: Maximum number of IDs to return.

    Returns:
        A list of at most ``count`` lower-cased 4-character alphanumeric IDs,
        in the order they were first seen on the page.
    """
    # Local import: narrows the per-row error handling to Selenium failures.
    from selenium.common.exceptions import WebDriverException

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    # A dict is used as an insertion-ordered set so the result is stable
    # (a plain set would return the IDs in arbitrary order).
    pdb_ids = {}
    try:
        driver.get(url)
        time.sleep(3)  # fixed wait before the first read of the table

        retries = 0
        while len(pdb_ids) < count and retries < 100:
            rows = driver.find_elements(By.CLASS_NAME, "ReactVirtualized__Table__row")
            for row in rows:
                try:
                    cells = row.find_elements(By.CLASS_NAME, "ReactVirtualized__Table__rowColumn")
                    if len(cells) >= 3:
                        pdb_id = cells[2].text.strip().lower()
                        if len(pdb_id) == 4 and pdb_id.isalnum():
                            pdb_ids[pdb_id] = None
                except WebDriverException:
                    # NOTE(review): presumably rows can go stale while the
                    # table re-renders during scrolling; skip such rows.
                    continue
            driver.execute_script("window.scrollBy(0, 200);")
            time.sleep(0.3)
            retries += 1
    finally:
        # Always shut the browser down, even if loading or scraping fails.
        driver.quit()

    return list(pdb_ids)[:count]

# Download .pdb files
def download_pdb_file(pdb_id, destination_folder, source="rcsb"):
    """Download ``<pdb_id>.pdb`` into ``destination_folder`` unless it exists.

    Args:
        pdb_id: PDB identifier used to build both the URL and the file name.
        destination_folder: Target directory; created if missing.
        source: ``"rcsb"`` downloads from files.rcsb.org; any other value
            downloads from the OPM asset bucket.

    Returns:
        None. The outcome (already present / downloaded / failed) is printed.
    """
    os.makedirs(destination_folder, exist_ok=True)
    path = os.path.join(destination_folder, f"{pdb_id}.pdb")
    if os.path.exists(path):
        print(f"[{source.upper()}] Već preuzeto: {pdb_id}")
        return

    if source == "rcsb":
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    else:
        url = f"https://opm-assets.storage.googleapis.com/pdb/{pdb_id}.pdb"

    try:
        # Without a timeout a stalled server would block the batch forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Report a network failure like any other failed download instead of
        # aborting the remaining downloads.
        print(f"[{source.upper()}] Neuspješno: {pdb_id}")
        return

    if response.status_code == 200:
        # Write the raw bytes so the file does not depend on the guessed
        # response charset or on the platform's default text encoding.
        with open(path, "wb") as f:
            f.write(response.content)
        print(f"[{source.upper()}] Preuzeto: {pdb_id}")
    else:
        print(f"[{source.upper()}] Neuspješno: {pdb_id}")

# Parse the sequence from a PDB file
def extract_sequence_from_pdb(file_path):
    """Return the concatenated polypeptide sequences found in a PDB file.

    The structure is parsed with ``PDBParser`` and every peptide produced by
    ``PPBuilder.build_peptides`` contributes its sequence, in order.

    Args:
        file_path: Path of the PDB file to parse.

    Returns:
        The joined sequence string, or ``""`` if parsing raised (the error is
        printed).
    """
    pdb_parser = PDBParser(QUIET=True)
    try:
        parsed = pdb_parser.get_structure("X", file_path)
        builder = PPBuilder()
        fragments = [str(peptide.get_sequence()) for peptide in builder.build_peptides(parsed)]
        return "".join(fragments)
    except Exception as err:
        print(f"[Parse Error] {file_path}: {err}")
        return ""

# Fetch the FASTA sequence from RCSB
def fetch_fasta_sequence(pdb_id):
    """Fetch the FASTA text for ``pdb_id`` from RCSB and return the residues.

    All lines that do not start with ``>`` are stripped and concatenated, so
    if the response holds several records their sequences are joined into
    one string.

    Args:
        pdb_id: PDB identifier; upper-cased when building the URL.

    Returns:
        The concatenated sequence, or ``""`` when the request fails or the
        server answers with a non-200 status (a message is printed).
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}/display"
    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f"[FASTA]: Neuspješno dohvaćanje za {pdb_id}")
        return ""

    if response.status_code != 200:
        print(f"[FASTA]: Neuspješno dohvaćanje za {pdb_id}")
        return ""

    return "".join(
        line.strip()
        for line in response.text.splitlines()
        if not line.startswith(">")
    )

# Get the starting residue index from the PDB structure
def get_pdb_starting_index(file_path):
    """Return the residue number of the first residue with a blank hetero flag.

    Models and chains are walked in order; the first residue whose
    ``id[0]`` is ``" "`` determines the result.

    Args:
        file_path: Path of the PDB file to parse.

    Returns:
        ``id[1]`` of that residue, or ``1`` if no such residue exists or the
        file could not be parsed (the parse error is printed).
    """
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure("X", file_path)
        for model in structure:
            for chain in model:
                # Stop at the first match instead of materialising the
                # whole residue list of the chain.
                for res in chain.get_residues():
                    if res.id[0] == " ":
                        return res.id[1]
    except Exception as e:
        # Report the failure (same format as the sequence parser) rather
        # than hiding it; a bare ``except`` would also trap KeyboardInterrupt.
        print(f"[Parse Error] {file_path}: {e}")
    return 1

# Local alignment
def sequences_similar(seq1, seq2, threshold=0.9):
    """Tell whether two sequences are similar according to a local alignment.

    The best ``localxx`` score (1 per identical position, no mismatch or gap
    penalties) is divided by the length of the longer sequence and compared
    with ``threshold``.

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        threshold: Minimum score / max-length ratio to count as similar.

    Returns:
        True if the ratio reaches ``threshold``; False otherwise, including
        when either sequence is empty or nothing aligns.
    """
    # Nothing can align against an empty sequence; this also keeps the
    # division below away from a zero denominator.
    if not seq1 or not seq2:
        return False

    # Only the best score is needed, so skip building the tracebacks for
    # every optimal alignment.
    score = pairwise2.align.localxx(seq1, seq2, score_only=True)
    if not score:
        return False

    identity = score / max(len(seq1), len(seq2))
    return identity >= threshold

# Main comparison function
def compare_sequences_and_save(pdb_ids, rcsb_dir="rcsb_structures", opm_dir="opm_structures", out_file="matched_sequences.txt"):
    """Compare RCSB, OPM and FASTA sequences per PDB ID and save the matches.

    For every ID whose ``.pdb`` file exists in both directories, the sequence
    is extracted from each file and the FASTA sequence is fetched from RCSB.
    When the RCSB sequence is similar to both the OPM and the FASTA sequence,
    all three are written to the output file.

    The FASTA sequence is left-padded with ``_`` up to the first residue
    number of the RCSB structure for printing and for the output file only.
    The similarity check uses the unpadded sequence: padding characters can
    never match, so including them would reject any structure that starts
    well past residue 1 (e.g. start 50 on a 200-residue chain caps the ratio
    at 200/249, below the default 0.9 threshold).

    Args:
        pdb_ids: Iterable of PDB identifiers.
        rcsb_dir: Directory holding the RCSB ``.pdb`` files.
        opm_dir: Directory holding the OPM ``.pdb`` files.
        out_file: Output file name, created in the current working directory.
    """
    out_path = os.path.join(os.getcwd(), out_file)

    with open(out_path, "w", encoding="utf-8") as f_out:
        for pdb_id in pdb_ids:
            rcsb_file = os.path.join(rcsb_dir, f"{pdb_id}.pdb")
            opm_file = os.path.join(opm_dir, f"{pdb_id}.pdb")

            if not (os.path.exists(rcsb_file) and os.path.exists(opm_file)):
                print(f"[!] Nedostaje file za {pdb_id}")
                continue

            seq_rcsb = extract_sequence_from_pdb(rcsb_file)
            seq_opm = extract_sequence_from_pdb(opm_file)
            fasta_raw = fetch_fasta_sequence(pdb_id)

            if not fasta_raw:
                continue

            # Padded copy is for display/output; fasta_raw is what gets compared.
            seq_fasta = fasta_raw
            starting_index = get_pdb_starting_index(rcsb_file)
            if starting_index > 1:
                seq_fasta = "_" * (starting_index - 1) + fasta_raw

            print(f"{pdb_id}: RCSB({len(seq_rcsb)}), OPM({len(seq_opm)}), FASTA({len(seq_fasta)})")
            print(f"RCSB Start: {seq_rcsb[:30]}")
            print(f"OPM  Start: {seq_opm[:30]}")
            print(f"FASTAStart: {seq_fasta[:30]}")

            if (seq_rcsb and seq_opm and
                    sequences_similar(seq_rcsb, seq_opm) and
                    sequences_similar(seq_rcsb, fasta_raw)):
                f_out.write(f">{pdb_id} (RCSB)\n{seq_rcsb}\n")
                f_out.write(f">{pdb_id} (OPM)\n{seq_opm}\n")
                f_out.write(f">{pdb_id} (FASTA)\n{seq_fasta}\n")
                f_out.write("-" * 30 + '\n')
                print(f"[✓] Podudarne sekvence: {pdb_id}")
            else:
                print(f"[≠] Različite sekvence: {pdb_id}")

def main():
    """Run the pipeline: scrape IDs, download structures, compare sequences.

    Returns:
        The list of PDB IDs scraped from OPM.
    """
    print("\nScraping OPM za PDB ID-eve...")
    pdb_ids = get_pdb_ids_from_opm()
    print(f"\nNađeno {len(pdb_ids)} PDB ID-eva:", pdb_ids)

    print("\nPreuzimanje .pdb datoteka...")
    for pdb_id in pdb_ids:
        download_pdb_file(pdb_id, destination_folder="rcsb_structures", source="rcsb")
        download_pdb_file(pdb_id, destination_folder="opm_structures", source="opm")

    print("\nUspoređivanje sekvenci...")
    compare_sequences_and_save(pdb_ids)
    print("\nPodudarne sekvence su spremljene u 'matched_sequences.txt' datoteku.")
    return pdb_ids


# The original called an undefined main() after running everything at module
# level, which ended the run with a NameError. pdb_ids is kept as a
# module-level name so later code can still use it.
if __name__ == "__main__":
    pdb_ids = main()





Scraping OPM za PDB ID-eve...

Nađeno 50 PDB ID-eva: ['6k6k', '7vgv', '2l6x', '4yzi', '7l1e', '7bmh', '7w9w', '8anq', '5jsi', '6csm', '2ei4', '7w74', '3vvk', '1ap9', '6gux', '4qi1', '4wav', '1iw6', '7shs', '6k6i', '6nwf', '6lm1', '4knf', '6lm0', '6gyh', '1py6', '5azd', '7sfk', '7sfj', '4jr8', '7zou', '4fbz', '7zow', '7e6z', '3a7k', '3ug9', '5b2n', '7avp', '2zzl', '1fbk', '1e12', '1uaz', '7zov', '1vgo', '4pxk', '7zoy', '6eyu', '6eid', '5vn7', '7q37']

Preuzimanje .pdb datoteke...
[RCSB] Preuzeto: 6k6k
[OPM] Preuzeto: 6k6k
[RCSB] Preuzeto: 7vgv
[OPM] Preuzeto: 7vgv
[RCSB] Preuzeto: 2l6x
[OPM] Preuzeto: 2l6x
[RCSB] Preuzeto: 4yzi
[OPM] Preuzeto: 4yzi
[RCSB] Preuzeto: 7l1e
[OPM] Preuzeto: 7l1e
[RCSB] Preuzeto: 7bmh
[OPM] Preuzeto: 7bmh
[RCSB] Preuzeto: 7w9w
[OPM] Preuzeto: 7w9w
[RCSB] Preuzeto: 8anq
[OPM] Preuzeto: 8anq
[RCSB] Preuzeto: 5jsi
[OPM] Preuzeto: 5jsi
[RCSB] Preuzeto: 6csm
[OPM] Preuzeto: 6csm
[RCSB] Preuzeto: 2ei4
[OPM] Preuzeto: 2ei4
[RCSB] Preuzeto: 7w74
[OPM] Preuzeto: 7w74