In [None]:
import esm
import torch
import pandas as pd
import numpy as np
import os


def extract_and_save_embeddings_to_csv(df_fasta, model, alphabet, device, batch_size=32, save_every=1000, output_prefix="esm2_embeddings", repr_layer=30):
    """Embed each sequence of ``df_fasta`` and write mean-pooled vectors to a CSV file.

    Sequences are sent through ``model`` in batches. For every sequence the
    token representations of layer ``repr_layer`` are averaged over the
    positions between the first and the last non-padding token, and one row
    ``[header, f1, f2, ..., fN]`` is produced.

    Parameters
    ----------
    df_fasta : pandas.DataFrame
        Must expose the columns ``header`` and ``sequence``.
    model : callable
        Called as ``model(tokens, repr_layers=[...], return_contacts=False)``
        and expected to return a mapping with a ``"representations"`` entry.
    alphabet : object
        Provides ``get_batch_converter()`` and ``padding_idx``.
    device : torch.device or str
        Device the token tensor is moved to before the forward pass.
    batch_size : int
        Number of sequences per forward pass.
    save_every : int
        Rows are flushed to disk once at least this many are pending, and
        always after the last sequence.
    output_prefix : str
        Path of the CSV file (used verbatim). The file has no header row and
        no index column.
    repr_layer : int
        Layer whose representations are pooled (defaults to 30, the value
        that was previously hard-coded).

    Notes
    -----
    * A batch that raises ``RuntimeError`` is reported and dropped; processing
      continues with the next batch and rows already computed are still saved.
    * The first flush truncates ``output_prefix``; later flushes append, so
      every embedding ends up in the file regardless of ``save_every``.
    """
    print("Dispositivo:", device)

    batch_converter = alphabet.get_batch_converter()
    total_sequences = len(df_fasta)

    pending_rows = []     # rows not yet written: [header, f1, f2, ..., fN]
    saved_batches = 0     # number of flushes already written to disk
    batch_sequences = []  # (header, sequence) pairs of the batch being filled

    for i, row in enumerate(df_fasta.itertuples(index=False)):
        batch_sequences.append((row.header, row.sequence))
        is_last = i == total_sequences - 1

        # Keep filling until the batch is full or the input is exhausted.
        if len(batch_sequences) < batch_size and not is_last:
            continue

        batch_headers = [header for header, _ in batch_sequences]
        try:
            _, _, batch_tokens = batch_converter(batch_sequences)
            batch_tokens = batch_tokens.to(device)

            with torch.no_grad():
                results = model(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
                token_representations = results["representations"][repr_layer]

                # Collect into a temporary list so a failure midway through
                # the batch does not leave a partial batch in pending_rows.
                batch_rows = []
                for j, tokens_len in enumerate((batch_tokens != alphabet.padding_idx).sum(1)):
                    # Skip the first and last non-padding positions
                    # (presumably the BOS/EOS tokens -- confirm for the alphabet in use).
                    seq_embedding = token_representations[j, 1:tokens_len - 1].mean(0).cpu().numpy()
                    batch_rows.append([batch_headers[j]] + seq_embedding.tolist())

            pending_rows.extend(batch_rows)
        except RuntimeError as e:
            print(f"Erro de memória em {batch_headers}: {e}")
        finally:
            # Always start a fresh batch, even after a failure; otherwise the
            # failed sequences would be retried forever inside a growing batch.
            batch_tokens = results = token_representations = None
            batch_sequences = []
            torch.cuda.empty_cache()

        if pending_rows and (len(pending_rows) >= save_every or is_last):
            save_path = f"{output_prefix}"
            print(f"Salvando {len(pending_rows)} embeddings em {save_path}...")

            df_out = pd.DataFrame(pending_rows)
            # Truncate on the first flush, append afterwards, so earlier
            # flushes are not overwritten.
            write_mode = "w" if saved_batches == 0 else "a"
            df_out.to_csv(save_path, index=False, header=False, mode=write_mode)
            saved_batches += 1
            pending_rows = []

    print("Extração finalizada.")

def make_fasta(fasta_path):
    """Parse a FASTA file into a DataFrame and load the ESM-2 model used to embed it.

    The file is parsed first so that a missing or malformed FASTA fails
    before the model is loaded.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file to read.

    Returns
    -------
    tuple
        ``(df_fasta, device, model, alphabet)`` where ``df_fasta`` has the
        columns ``header`` (first whitespace-delimited word after ``>``) and
        ``sequence`` (all sequence lines of the record joined together),
        ``device`` is CUDA when available and CPU otherwise, and ``model`` /
        ``alphabet`` come from ``esm.pretrained.esm2_t30_150M_UR50D()`` with
        the model moved to ``device`` and set to eval mode.

    Raises
    ------
    ValueError
        If a header line contains nothing after ``>``.
    """
    headers = []
    seqs = []

    with open(fasta_path, 'r') as f:
        current_header = None
        current_seq = []

        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue
            if line.startswith(">"):
                # A new record starts: store the one being accumulated.
                if current_header is not None:
                    headers.append(current_header)
                    seqs.append(''.join(current_seq))
                header_fields = line[1:].split()
                if not header_fields:
                    raise ValueError(
                        f"Cabeçalho FASTA vazio na linha {line_number} de {fasta_path}"
                    )
                current_header = header_fields[0]
                current_seq = []
            else:
                current_seq.append(line)
        # Store the last record of the file.
        if current_header is not None:
            headers.append(current_header)
            seqs.append(''.join(current_seq))

    df_fasta = pd.DataFrame({"header": headers, "sequence": seqs})
    print(f"Total de sequências: {len(df_fasta)}")
    print(df_fasta.head())

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, alphabet = esm.pretrained.esm2_t30_150M_UR50D()
    model = model.to(device)
    model.eval()
    return df_fasta, device, model, alphabet

# Input FASTA and the CSV file that receives one embedding row per sequence.
fasta_path = "jabuticaba/jabuticaba.fasta"
output = "jabuticaba/jabuticaba_feats.csv"

# Parse the sequences and load the model/alphabet pair.
df_fasta, device, model, alphabet = make_fasta(fasta_path)

# One sequence per forward pass; save_every is large enough that the CSV is
# written a single time, after the last sequence.
extract_and_save_embeddings_to_csv(
    df_fasta,
    model,
    alphabet,
    device,
    batch_size=1,
    save_every=10000000,
    output_prefix=output,
)

Total de sequências: 83
       header                                           sequence
0  A0A384TSM3  MTAILERRESESLWGRFCNWITSTENRLYIGWFGVLMIPTLLTATS...
1  A0A384SYQ7  MKTLYSLRRFYPVETLFNGTLALAGRDQETTGFAWWAGNARLINLS...
2  A0A384SYR4  MSPQTETKASVGFKAGVKDYKLNYYTPDYETKDTDILAAFRVTPQP...
3  A0A384SYS1            MTIDRTYPIFTVRWLAVHGLAVPTVSFLGSISAMQFIQR
4  A0A384SYS4  MQGRLSAWLVKHGLVHRSLGFDYQGIETLQIKPEDWHSIAVILYVY...
Dispositivo: cpu


In [None]:
import pandas as pd

feats_path = "jabuticaba/jabuticaba_feats.csv"
ints_path = "jabuticaba/jabuticaba_interactions.csv"

# The embeddings CSV has no header row; column 0 holds the protein identifier.
df_feats = pd.read_csv(feats_path, header=None).rename(columns={0: "protein1"})

# Only the two identifier columns of the interaction table are needed.
df_combined = pd.read_csv(ints_path).loc[:, ['protein1', 'protein2']]

# Attach the features of protein1, then relabel the key column of the same
# feature table so it can be joined a second time on protein2.
df_merged = pd.merge(df_combined, df_feats, how="left", on="protein1")
df_feats = df_feats.rename(columns={'protein1': "protein2"})
df_merged = pd.merge(df_merged, df_feats, how="left", on="protein2")

In [17]:
# Every column after the two identifier columns is a model input feature.
feature_cols = df_merged.columns[2:]
X = df_merged.loc[:, feature_cols]

# Drop the merged table; only X is kept from here on.
del df_merged

In [21]:
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.neural_network import MLPClassifier


scaler_path = "modelos/scaler_planta_900sc.pkl"
# Keep the path in its own variable so `mlp_loaded` always refers to the model
# (re-running the load line no longer depends on what the name held before).
mlp_path = "modelos/MLP_model_planta(512, 256, 128, 64)_e25_920sc.pkl"
# Alternative model: MLP_model_planta(256, 128, 64, 16)_e10_900sc.pkl
# NOTE(review): the scaler file is tagged "900sc" while the model file is
# tagged "920sc" -- confirm that this scaler is the one fitted for this model.

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts produced by a trusted source.
# NOTE(review): the recorded output of this cell is
# "TypeError: __randomstate_ctor() takes from 0 to 1 positional arguments but
# 2 were given", which looks like the pickle was written with a different
# NumPy version than the one installed here -- load it in the environment that
# created it, or re-export the model.
scaler = joblib.load(scaler_path)
mlp_loaded = joblib.load(mlp_path)
X_scaled = scaler.transform(X)

# Probability of the positive class (column 1 of predict_proba).
y_proba = mlp_loaded.predict_proba(X_scaled)[:, 1]

# Disabled evaluation block: it needs ground-truth labels `y`, which this
# notebook section does not define. Kept as comments so the cell does not
# echo a large string as its output.
# y_pred = mlp_loaded.predict(X_scaled)
#
# tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
# especificidade = tn / (tn + fp)
# sensibilidade = tp / (tp + fn)
# acuracia = (tp + tn) / (tp + tn + fp + fn)
# precisao = tp / (tp + fp)
# f1 = 2 * (precisao * sensibilidade) / (precisao + sensibilidade)
#
# print("Acurácia:", acuracia)
# print("Precisão:", precisao)
# print("Sensibilidade:", sensibilidade)
# print("Especificidade:", especificidade)
# print("F1:", f1)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


TypeError: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given

In [None]:
import subprocess

# Path of the makeblastdb executable. The separator between "Downloads" and
# "ncbi-blast-2.16.0+" was missing, so the executable could never be found.
# NOTE(review): machine-specific absolute path; consider making it configurable.
makeblastdb_path = r"C:\Users\Bruno\Downloads\ncbi-blast-2.16.0+\bin\makeblastdb.exe"
# NOTE(review): the blastp cell points its -db at a "db_920" subfolder, while
# this command builds the database next to the input FASTA -- confirm the
# database files end up where blastp looks for them.
input_fasta = r"Metazoa\seg_ali\fasta\plantas_0_50_920sc_max1000.fasta"
cmd = [
    makeblastdb_path,
    "-in", input_fasta,
    "-dbtype", "prot"  # protein database
]
try:
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    print("Banco de dados criado com sucesso!")
    print(result.stdout)
except FileNotFoundError as e:
    # Raised when the executable itself cannot be found.
    print("Erro ao criar o banco de dados:")
    print(e)
except subprocess.CalledProcessError as e:
    # makeblastdb ran but exited with a non-zero status.
    print("Erro ao criar o banco de dados:")
    print(e.stderr)

In [None]:
import subprocess

# NOTE(review): machine-specific absolute path; consider making it configurable.
blastp_path = r"C:\Users\Bruno\Downloads\ncbi-blast-2.16.0+\bin\blastp.exe"
query_fasta = "Metazoa/jabuticaba/jabuticaba.fasta"
output_file = "Metazoa/jabuticaba/alinhamento_planta920_0-50sp_e_jabuticaba.txt"
db_path = r"Metazoa\seg_ali\fasta\db_920\plantas_0_50_920sc_max1000.fasta"  #r"modelo_200sp\0-200sp_db\string_200sp_max1000.fasta"

cmd = [
    blastp_path,
    "-query", query_fasta,
    "-db", db_path,
    "-out", output_file,
    # Tabular output; the field order must match `colunas` below.
    "-outfmt",  "6 qseqid sseqid pident ppos nident mismatch gapopen qlen slen length qstart qend sstart send evalue bitscore",
    "-max_target_seqs", "5",
    "-num_threads", "16"
]

import pandas as pd

# Column names for the tabular output, in the same order as the -outfmt fields.
colunas = [
    "qseqid", "sseqid", "pident", "ppos", "nident", "mismatch", "gapopen",
    "qlen", "slen", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore"
]

# Run BLAST. The header is added only after a successful run: on failure the
# output file may be missing or left over from an earlier run that already has
# a header, and re-reading it with `names=` would turn that header into data.
try:
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
except FileNotFoundError as e:
    # Raised when the blastp executable itself cannot be found.
    print("Erro ao executar o BLAST:")
    print(e)
except subprocess.CalledProcessError as e:
    # blastp ran but exited with a non-zero status.
    print("Erro ao executar o BLAST:")
    print(e.stderr)
else:
    print("BLAST concluído com sucesso!")
    print(f"Resultados salvos em: {output_file}")

    # Rewrite the headerless BLAST table in place with a header row.
    df = pd.read_csv(output_file, sep="\t", names=colunas)
    df.to_csv(output_file, sep="\t", index=False)
    print("Arquivo salvo com cabeçalho.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

colunas = [
    "qseqid", "sseqid", "pident", "ppos", "nident", "mismatch", "gapopen",
    "qlen", "slen", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore"
]

# The BLAST cell rewrites `output_file` with a header row. Reading it with
# header=None would load that row as data and make every column non-numeric,
# so check the first line and skip the header when it is present.
with open(output_file) as fh:
    has_header = fh.readline().startswith("qseqid\t")

df = pd.read_csv(
    output_file,
    sep="\t",
    header=0 if has_header else None,
    names=colunas
)

# Best hit per query: sort by pident (descending) within each qseqid and keep
# the first row. Despite its name, df_top5 holds a single row per query.
df_top5 = df.sort_values(by=["qseqid", "pident"], ascending=[True, False])
df_top5 = df_top5.groupby("qseqid").head(1)

# One row per qseqid with its highest pident (the mean of a single value).
df_max = df_top5.groupby("qseqid")["pident"].mean().reset_index()

print(len(df_max))

plt.hist(df_max["pident"], bins=50, edgecolor='black')
plt.xlabel("Identidade (%)")
plt.ylabel("Frequência")
plt.title("Histograma do maior pident por qseqid")
plt.show()

In [None]:
import pandas as pd

def filtrar_interacoes_por_dois_pidents(df_primeiro, df_combined, intervalo_max, intervalo_min):
    """Keep the interactions whose two proteins fall in the requested identity ranges.

    A row of ``df_combined`` is kept when:

    - both proteins appear in ``df_primeiro``;
    - the LARGER pident of the two proteins lies in ``intervalo_max``;
    - the SMALLER pident of the two proteins lies in ``intervalo_min``.

    Both intervals are open on the left and closed on the right, i.e.
    ``inicio < value <= fim``.

    Parameters
    ----------
    df_primeiro : pandas.DataFrame
        Columns ``['qseqid', 'pident']``.
    df_combined : pandas.DataFrame
        Columns ``['protein1', 'protein2', 'Label']`` (only the two protein
        columns are read here).
    intervalo_max : str
        Range in the form ``"inicio-fim"``, e.g. ``"40-60"``.
    intervalo_min : str
        Range in the form ``"inicio-fim"``, e.g. ``"80-100"``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_combined`` (original index preserved) plus
        the columns ``pident1``, ``pident2``, ``pident_max`` and ``pident_min``.

    Raises
    ------
    ValueError
        If either interval cannot be parsed as ``"inicio-fim"``.
    """
    try:
        inicio_max, fim_max = map(float, intervalo_max.split("-"))
        inicio_min, fim_min = map(float, intervalo_min.split("-"))
    except (ValueError, AttributeError, TypeError) as exc:
        # Catch only parsing failures (not KeyboardInterrupt/SystemExit) and
        # keep the original error attached as the cause.
        raise ValueError("Os intervalos devem estar no formato 'inicio-fim', por exemplo '40-60'.") from exc

    # Proteins for which a pident is known.
    proteinas_validas = set(df_primeiro["qseqid"])

    # Keep only interactions where both proteins have a pident.
    df_filtrado = df_combined[
        (df_combined["protein1"].isin(proteinas_validas)) &
        (df_combined["protein2"].isin(proteinas_validas))
    ].copy()

    # pident lookup by protein identifier.
    pident_dict = df_primeiro.set_index("qseqid")["pident"].to_dict()

    # Attach the pident of each side of the interaction.
    df_filtrado["pident1"] = df_filtrado["protein1"].map(pident_dict)
    df_filtrado["pident2"] = df_filtrado["protein2"].map(pident_dict)

    # Larger and smaller pident of each pair.
    df_filtrado["pident_max"] = df_filtrado[["pident1", "pident2"]].max(axis=1)
    df_filtrado["pident_min"] = df_filtrado[["pident1", "pident2"]].min(axis=1)

    # Apply both range filters: (inicio, fim] on each of the two values.
    df_resultado = df_filtrado[
        (df_filtrado["pident_max"] > inicio_max) & (df_filtrado["pident_max"] <= fim_max) &
        (df_filtrado["pident_min"] > inicio_min) & (df_filtrado["pident_min"] <= fim_min)
    ]

    return df_resultado

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def avalia_df(df_resultado, y, y_pred, y_proba, faixa1, faixa2, plot_hist=True, thresholds=(0.5,)):
    """Evaluate predictions on a class-balanced subsample of the filtered interactions.

    Parameters
    ----------
    df_resultado : pandas.DataFrame
        Filtered frame whose index selects the samples to evaluate.
    y, y_pred, y_proba : pandas.Series or array-like
        True labels, predicted labels and positive-class probabilities.
        Objects exposing ``.loc`` are indexed by label, anything else by
        plain ``[...]`` indexing with the same index values.
    faixa1, faixa2 : str
        Names of the source ranges (e.g. ``"0-40"``, ``"40-60"``); used only
        in the printed summary, the result table and the plot title.
    plot_hist : bool
        When True, draw the probability histograms of both classes. The
        figure is not shown here (``plt.show()`` is left to the caller).
    thresholds : iterable of float
        Probability cut-offs to evaluate; defaults to the single value 0.5.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``Faixa1``, ``Faixa2``,
        ``Threshold``, ``Precisão``, ``Recall``, ``Especificidade`` and
        ``Acurácia_Padrao`` (accuracy of ``y_pred``, identical on every row).
        Empty (columns only) when one of the classes is absent from the
        selection, because no balanced sample can be drawn then.
    """
    colunas_resultado = [
        'Faixa1', 'Faixa2', 'Threshold', 'Precisão', 'Recall',
        'Especificidade', 'Acurácia_Padrao'
    ]
    indices_filtrados = df_resultado.index

    def _seleciona(valores):
        # Label-based selection for pandas objects, plain indexing otherwise.
        return valores.loc[indices_filtrados] if hasattr(valores, 'loc') else valores[indices_filtrados]

    df = pd.DataFrame({
        'y_true': _seleciona(y),
        'y_pred': _seleciona(y_pred),
        'y_proba': _seleciona(y_proba)
    })

    # Balance the classes by down-sampling both to the size of the smaller one.
    df_pos = df[df['y_true'] == 1]
    df_neg = df[df['y_true'] == 0]
    print(len(df_pos), len(df_neg))
    min_count = min(len(df_pos), len(df_neg))
    df_balanced = pd.concat([
        df_pos.sample(n=min_count, random_state=42),
        df_neg.sample(n=min_count, random_state=42)
    ]).sample(frac=1, random_state=42)
    print(len(df_balanced), faixa1, faixa2)

    # With a class missing the balanced sample is empty and the metrics below
    # cannot be computed; return an empty table instead of failing.
    if df_balanced.empty:
        return pd.DataFrame(columns=colunas_resultado)

    # Accuracy of the classifier's own label predictions.
    acc = accuracy_score(df_balanced['y_true'], df_balanced['y_pred'])

    resultados = []
    for thresh in thresholds:
        y_pred_thresh = (df_balanced['y_proba'] >= thresh).astype(int)
        prec = precision_score(df_balanced['y_true'], y_pred_thresh, zero_division=0)
        rec = recall_score(df_balanced['y_true'], y_pred_thresh, zero_division=0)
        tn, fp, fn, tp = confusion_matrix(df_balanced['y_true'], y_pred_thresh).ravel()
        espec = tn / (tn + fp) if (tn + fp) > 0 else 0.0

        resultados.append({
            'Faixa1': faixa1,
            'Faixa2': faixa2,
            'Threshold': thresh,
            'Precisão': prec,
            'Recall': rec,
            'Especificidade': espec,
            'Acurácia_Padrao': acc
        })

    df_resultado_final = pd.DataFrame(resultados, columns=colunas_resultado)

    # Optional histogram of the positive-class probability, per true class.
    if plot_hist:
        probas_pos = df_balanced[df_balanced['y_true'] == 1]['y_proba']
        probas_neg = df_balanced[df_balanced['y_true'] == 0]['y_proba']

        plt.hist(probas_pos, bins=20, edgecolor='black', density=True, alpha=0.5, label='Positivas (y_true = 1)', color='blue')
        plt.hist(probas_neg, bins=20, edgecolor='black', density=True, alpha=0.5, label='Negativas (y_true = 0)', color='red')

        plt.xlabel("Probabilidade da Classe Positiva")
        plt.ylabel("Densidade")
        plt.title(f"Histograma - {faixa1} x {faixa2} - {len(df_balanced)} Amostras Balanceadas")
        plt.legend()
        #plt.show()

    return df_resultado_final