In [2]:
import os
import sys

import numpy as np

"""
Prediction Script for the proteusAI MLDE package.
This script is based on the proteus mlde_demo-file and therefore uses proteusAI libraries and requires the proteusAI environment to be activated.
To install and create the proteusAI environment, please follow the instructions in the proteusAI documentation at:
    https://github.com/jonfunk21/ProteusAI

The script is designed to be run in a Jupyter Notebook or as a standalone Python script.
It uses a labeled csv-file as input, including at least the sequence and the label columns. A name or description column for the provided AA-sequence is not mandatory but highly recommended, since proteus requires a "names_column" for a description.

The output is a dataframe with the training results and the predicted labels for the input data."""

# os.system("conda init bash")
# os.system("conda activate proteusAI")
import proteusAI as pai

print(os.getcwd())
sys.path.append("src/")

  alphabet = torch.load(os.path.join(Path(__file__).parent, "alphabet.pt"))


/home/iwe80/Documents/Enzyme_Activity_Prediction/02_Playground/MAP


In [3]:
# will initiate storage space - else in memory
dataset = "data/NOD_AT_edit.csv"
y_column = "Data"

In [4]:
# load data from csv or excel: x should be sequences, y should be labels, y_type class or num
library = pai.Library(
    source=dataset,
    seqs_col="Sequence",
    y_col=y_column,
    y_type="num",
    names_col="Description",
)

In [5]:
# proteus includes the following encoding methods: "esm1v", "esm2", "esm2_650M", "esm2_150M", "esm2_35M", "esm2_8M", "ohe", "blosum62", "blosum50" and "vae (variational autoencoder)"
# compute and save ESM-2 representations at example_lib/representations/esm2
library.compute(method="ohe", batch_size=10)


tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0., 

In [6]:

# proteus includes the following model types: "rf", "knn", "svm", "ffnn", "ridge", "gp" and for clustering: "hdbscan"
# possible in memory representations (x): "ohe", "blosum50", "blosum62"
model = pai.Model(library=library, k_folds=5, model_type="rf", x="blosum62")

# train model
model.train()


KeyboardInterrupt



KeyboardInterrupt: 

In [29]:
training_df = model.out_df
print(training_df)

                                   name  \
0    Y32G+F46S+L56S+V97G+P49P+R51R+I53L   
1    Y32G+F46S+L56S+V97G+P49P+R51G+I53L   
2    Y32G+F46S+L56S+V97G+P49P+R51F+I53L   
3    Y32G+F46S+L56S+V97G+P49P+R51G+I53F   
4    Y32G+F46S+L56S+V97G+P49P+R51F+I53F   
..                                  ...   
561  Y32V+F46C+L56H+V97V+P49H+R51L+I53C   
562  Y32V+F46C+L56H+V97V+P49I+R51F+I53L   
563  Y32V+F46C+L56H+V97V+P49N+R51R+I53I   
564  Y32V+F46C+L56H+V97V+P49N+R51V+I53V   
565  Y32V+F46C+L56H+V97V+P49Y+R51V+I53F   

                                              sequence  y_true  y_predicted  \
0    MAPTLSEQTRQLVRASVPALQKHSVAISATMGRLLFERYPETRSLS...   0.873     0.793416   
1    MAPTLSEQTRQLVRASVPALQKHSVAISATMGRLLFERYPETRSLS...   0.822     0.781641   
2    MAPTLSEQTRQLVRASVPALQKHSVAISATMGRLLFERYPETRSLS...   0.789     0.771142   
3    MAPTLSEQTRQLVRASVPALQKHSVAISATMGRLLFERYPETRSLS...   0.841     0.768365   
4    MAPTLSEQTRQLVRASVPALQKHSVAISATMGRLLFERYPETRSLS...   0.809     0.765243   
..       

In [30]:
outpath = "test/demo_results/"
if not os.path.exists(outpath):
    os.makedirs(outpath, exist_ok=True)
training_df.to_csv(os.path.join(outpath, "training_test_results.csv"))

In [31]:
# predict binding affinity of new sequences with the trained model
infile_to_predict = "data/to_predict.csv"
new_sequences = dict()
with open(infile_to_predict, "r") as infile:
    for line in infile.readlines()[1:]:
        new_sequences.update({line.split(",")[0]: line.split(",")[1][:-1]})

proteins = []
for entry in new_sequences:
    #     if no encodings are provided, protein model.score will determine the proper encoding of the sequences before automatically
    proteins.append(pai.Protein(name=entry, seq=new_sequences[entry]))

# for entry in new_sequences:
#     print(new_sequences[entry])


In [32]:
scores = model.predict(proteins)
print(scores)

([proteusAI.Protein():
____________________
name	: Y32G+F46S+L56S+V97G+P49P+R51N+I53T+G32P
seq	: MAPTLSEQTRQLVRASVPALQKHSVAISATMPRLLFERYPETRSLSELPENQTHKSASALLAYARSIDNPSALQAAIRRMVLSHARAGVQAVHYPLGWECLRDAIKEVLGPDATETLLQAWKEAYDFLAHLLSTKEAQVYAVLAE
rep	: []
y:	None
y_pred:	0.46240821428571444
y_sig:	0.08809159419870555
struc:	None
, proteusAI.Protein():
____________________
name	: Y32G+F46S+L56S+V97G+P49P+R51R+I53A+L60E
seq	: MAPTLSEQTRQLVRASVPALQKHSVAISDDAEGFERYPETRSLSELPERQAHKSASAELAYARSIDNPSALQAAIRRMVLSHARAGVQAVHYPLGWECLRDAIKEVLGPDATETLLQAWKEAYDFLAHLLSTKEAQVYAVLAE
rep	: []
y:	None
y_pred:	0.06011
y_sig:	0.04630254291073008
struc:	None
, proteusAI.Protein():
____________________
name	: Y32G+F46S+L56S+V97G+S46T
seq	: MAPTLSEQTRQLVRASVPALQKHSVAISATAAAFERYPETRSLTELPERQIHKSASALLAYARSIDNPSALQAAIRRMVLSHARAGVQAVHYPLGWECLRDAIKEVLGPDATETLLQAWKEAYDFLAHLLSTKEAQVYAVLAE
rep	: []
y:	None
y_pred:	0.03443999999999999
y_sig:	0.07944710970198977
struc:	None
, proteusAI.Protein():
____________________
name	:

In [33]:
# search for new mutants
# out_search = model.search(optim_problem="max")

In [34]:
# save searching results
# outpath = "test/results/"
# if not os.path.exists(outpath):
#     os.makedirs(outpath, exist_ok=True)
#
# out_search.to_csv(os.path.join(outpath, "results.csv"))

In [14]:
import numpy as np
tensor = [[[1, 2, 3], [2, 2, 3]], [[1, 2, 3], [4, 3, 2]], [[4, 2, 4], [3, 4, 2]]]
np_tensor = np.array(tensor)
print(np_tensor.shape)

(3, 2, 3)
