In [1]:
#PREPARAÇÃO DE DADOS
import re

import pandas as pd
from lxml import etree

# Access the DrugBank full-database XML export.
xml_path = r"C:\Users\filip\Bioinformática\Semestre 2\Projeto\Algoritmo\Datasets\Drugbank\drugbank_all_full_database.xml\full database.xml"
ns = {'db': 'http://www.drugbank.ca'}


def _first_text(node, xpath):
    """Return the stripped text of the first `xpath` match under `node`, or "" if none."""
    found = node.find(xpath, namespaces=ns)
    if found is None or not found.text:
        return ""
    return found.text.strip()


def _joined_text(node, xpath):
    """Return the non-empty stripped texts of all `xpath` matches under `node`, joined by '|'."""
    stripped = (found.text.strip() for found in node.findall(xpath, namespaces=ns) if found.text)
    # Drop whitespace-only entries so the result never contains empty '||' segments.
    return '|'.join(text for text in stripped if text)


drugs_data = []
# Stream the file: one 'end' event per <drug> element, so the whole XML never has to fit in memory.
context = etree.iterparse(xml_path, events=('end',), tag='{http://www.drugbank.ca}drug')

for event, elem in context:
    # The tag filter matches <drug> at any depth, so nested <drug> entries
    # (DrugBank lists them inside pathway drug lists) would otherwise be
    # appended as extra rows with mostly empty fields. Only direct children
    # of the document root are full drug records.
    parent = elem.getparent()
    if parent is None or parent.getparent() is not None:
        continue

    # Extract the fields used downstream.
    drug = {
        # Prefer the ID flagged primary="true" when present; otherwise fall
        # back to the first <drugbank-id> element.
        'drugbank_id': (
            _first_text(elem, "db:drugbank-id[@primary='true']")
            or _first_text(elem, 'db:drugbank-id')
        ),
        'name': _first_text(elem, 'db:name'),
        'indication': _first_text(elem, 'db:indication'),
        'mechanism_of_action': _first_text(elem, 'db:mechanism-of-action'),
        'targets': _joined_text(elem, 'db:targets/db:target/db:name'),
    }
    drugs_data.append(drug)

    # Release the processed record and every already-processed sibling before it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

# Fail early with a clear message instead of a KeyError on the first column access.
if not drugs_data:
    raise ValueError(
        f"No top-level <drug> records found in {xml_path!r}; "
        "check the file path and the XML namespace."
    )

# Build the DataFrame (one row per drug record).
df = pd.DataFrame(drugs_data)

# Normalisation: add lower-cased copies of the free-text columns so the
# keyword filters can match case-insensitively.
df = df.assign(
    targets_lower=df["targets"].str.lower(),
    indication_lower=df["indication"].str.lower(),
    moa_lower=df["mechanism_of_action"].str.lower(),
)

# Antimicrobial filter: keep drugs whose lower-cased target names contain any
# of these keywords. Matching is substring containment, so stems such as
# "bacter" or "mur" also hit longer words.
keywords_micro = [
    "dna gyrase", "topoisomerase iv", "penicillin-binding protein",
    "30s", "50s", "ribosomal protein", "beta-lactamase",
    "pbp", "mur", "cell wall", "bacterial", "dihydrofolate reductase",
    "folate synthase", "bacter", "e.coli", "mycobacterium",
    "pseudomonas", "efflux pump"
]

# The keywords are literal text, but str.contains treats the pattern as a
# regular expression; escape each one so metacharacters (the "." in "e.coli")
# are matched literally instead of acting as wildcards.
micro_pattern = '|'.join(re.escape(keyword) for keyword in keywords_micro)

df_micro = df[df["targets_lower"].str.contains(micro_pattern, na=False)]

# Antitumour filter: keep drugs whose lower-cased indication or mechanism of
# action mentions any of these keywords.
# NOTE(review): matching is plain substring containment, so short symbols such
# as "alk" or "kit" also match inside longer words, and "topoisomerase i" also
# matches "topoisomerase iv" — confirm this is acceptable for the dataset.
keywords_tumor = [
    "egfr", "her2", "braf", "alk", "vegfr", "pdgfr",
    "kit", "parp", "cdk", "mtor", "hdac", "topoisomerase i",
    "topoisomerase ii", "aurora kinase", "mdm2", "checkpoint kinase",
    "tyrosine kinase", "oncogene", "tumor suppressor", "cancer", "tumor",
    "neoplasm", "leukemia", "lymphoma", "carcinoma", "sarcoma"
]

# Build the alternation once and reuse it for both text columns.
tumor_pattern = '|'.join(keywords_tumor)
in_indication = df["indication_lower"].str.contains(tumor_pattern, na=False)
in_moa = df["moa_lower"].str.contains(tumor_pattern, na=False)

df_tumor = df[in_indication | in_moa]

# Save both filtered tables as CSV files in the current working directory,
# without the DataFrame index.
for frame, csv_name in (
    (df_micro, "drugbank_antimicrobianos.csv"),
    (df_tumor, "drugbank_antitumorais.csv"),
):
    frame.to_csv(csv_name, index=False)


In [8]:
#CRIAR CONJUNTOS POSITIVOS E NEGATIVOS
import pandas as pd

# Load the antimicrobial drug table and collect the distinct, non-missing
# DrugBank IDs it contains.
antimicrobial_csv = r"C:\Users\filip\Bioinformática\Semestre 2\Projeto\Re_ cnt maquina virtual\drugbank_antimicrobianos.csv"
df = pd.read_csv(antimicrobial_csv)
drug_ids = set(df["drugbank_id"].dropna())

# Read a FASTA file
def read_fasta(filepath):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        filepath: Path of the FASTA file to read (opened in text mode).

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the header line without the leading ">" and with surrounding
        whitespace stripped; ``sequence`` is the concatenation of the stripped
        lines that follow it, up to the next header. Lines that appear before
        the first header are ignored.
    """
    sequences = []
    header = None
    chunks = []
    with open(filepath, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith(">"):
                # Compare against None rather than truthiness: a record whose
                # header line is just ">" has an empty header and must not be
                # dropped together with its sequence.
                if header is not None:
                    sequences.append((header, ''.join(chunks)))
                header = line[1:]
                chunks = []
            elif header is not None:
                chunks.append(line)
    # Flush the final record, which has no following header to trigger it.
    if header is not None:
        sequences.append((header, ''.join(chunks)))
    return sequences

def write_fasta(sequences, filename, line_width=70):
    """Write ``(header, sequence)`` pairs to a FASTA file.

    Args:
        sequences: Iterable of ``(header, sequence)`` tuples.
        filename: Path of the output file; it is created or overwritten.
        line_width: Maximum number of residues per sequence line
            (default 70).

    Raises:
        ValueError: If ``line_width`` is smaller than 1.
    """
    if line_width < 1:
        raise ValueError("line_width must be a positive integer")

    with open(filename, "w") as f:
        for header, sequence in sequences:
            f.write(f">{header}\n")
            # Wrap the sequence into fixed-width lines; an empty sequence
            # produces a header line only.
            for start in range(0, len(sequence), line_width):
                f.write(sequence[start:start + line_width] + "\n")

# One-letter codes accepted as valid residues.
valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
def is_valid(seq):
    """Return True if `seq` is non-empty and made only of letters in `valid_aa`.

    The check is case-insensitive. Empty or missing input yields ``False``,
    so the result is always a real ``bool`` rather than the falsy input itself.
    """
    return bool(seq) and valid_aa.issuperset(seq.upper())

# Split the target protein sequences into a positive set (header mentions an
# antimicrobial DrugBank ID) and a negative set (all other valid sequences).
fasta = read_fasta(r"C:\Users\filip\Bioinformática\Semestre 2\Projeto\drugbank_all_target_polypeptide_sequences.fasta\protein.fasta")
positivos = []
negativos = []

for header, sequence in fasta:
    # Sequences rejected by is_valid go into neither set.
    if not is_valid(sequence):
        continue
    # A record is positive when any collected drug ID occurs as a substring
    # of its header.
    linked = any(drug_id in header for drug_id in drug_ids)
    bucket = positivos if linked else negativos
    bucket.append((header, sequence))

write_fasta(positivos, "conjunto_positivos_am.fasta")
write_fasta(negativos, "conjunto_negativos_am.fasta")

