In [None]:
import os
import sys

"""
Prediction Script for the proteusAI MLDE package.
This script is based on the proteusAI MLDE demo (mlde_demo); it therefore uses the proteusAI library, and the proteusAI environment must be activated before it is run.
To install and create the proteusAI environment, please follow the instructions in the proteusAI documentation at:
    https://github.com/jonfunk21/ProteusAI

The script is designed to be run in a Jupyter Notebook or as a standalone Python script.
It takes a labeled CSV file as input that contains at least a sequence column and a label column. A name or description column for each amino-acid sequence should be provided as well, since it is passed to proteusAI as the "names_col" of the library.

The output is a dataframe with the training results and the predicted labels for the input data."""

# NOTE: the proteusAI conda environment has to be active *before* the kernel or
# interpreter is started. Calling os.system("conda activate proteusAI") from
# here would only run in a subshell and would not change this process, so it
# is intentionally not done.

# Make a local ./src directory importable BEFORE proteusAI is imported;
# appending it after the import cannot influence that import. Because the
# entry is appended (not inserted at the front), an installed proteusAI still
# takes precedence over a copy in ./src.
sys.path.append("src/")

import proteusAI as pai  # noqa: E402  (must follow the sys.path adjustment)

# Show the working directory, since the relative paths used in this script
# ("src/", the dataset file, the output folder) are resolved against it.
print(os.getcwd())

In [None]:
# Input table and the name of its label column; both are handed to
# pai.Library when the library is created.
dataset, y_column = "NOD_AT_edit.csv", "Data"

In [None]:
# Build the proteusAI library from the labeled table (CSV or Excel):
# sequences are the inputs (x), labels are the targets (y).
library = pai.Library(
    source=dataset,
    names_col="Description",  # column with a name/description per sequence
    seqs_col="Sequence",  # column with the sequences (x)
    y_col=y_column,  # column with the labels (y)
    y_type="num",  # label type: "num" here, "class" is the alternative
)

In [None]:
# Encode the sequences of the library.
# proteus includes the following encoding methods: "esm1v", "esm2", "esm2_650M", "esm2_150M", "esm2_35M", "esm2_8M", "ohe", "blosum62", "blosum50" and "vae"
# The encoding computed here matches the representation the model is trained
# on (x="blosum62"); computing a different encoding would not be used by it.
# NOTE(review): if one-hot encoding ("ohe") was the intended input, change
# both this call and the model's `x` argument together.
library.compute(method="blosum62", batch_size=10)


In [None]:

# Model types offered by proteus: "rf", "knn", "svm", "ffnn", "ridge", "gp";
# "hdbscan" is available for clustering.
# Representations usable in memory as x: "ohe", "blosum50", "blosum62".
model = pai.Model(
    library=library,
    x="blosum62",
    model_type="rf",
    k_folds=5,
)

# Fit the model on the library.
model.train()

In [None]:
# Take the results table that the model exposes as `out_df` and show it.
# NOTE(review): presumably a pandas DataFrame (it is later written with
# .to_csv()) — confirm against the proteusAI Model API.
training_df = model.out_df
print(training_df)

In [None]:
# Write the training results to <outpath>/training_test_results.csv.
outpath = "test/demo_results/"
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists() check is redundant (and would add a
# check-then-create race).
os.makedirs(outpath, exist_ok=True)
training_df.to_csv(os.path.join(outpath, "training_test_results.csv"))

In [None]:
# Optional: search for new mutants with the trained model (uncomment to run)
# out_search = model.search(optim_problem="max")

In [None]:
# Optional: save the search results (uncomment to run; needs `out_search` from the mutant search)
# outpath = "test/results/"
# if not os.path.exists(outpath):
#     os.makedirs(outpath, exist_ok=True)
#
# out_search.to_csv(os.path.join(outpath, "results.csv"))