In [None]:
import sys
import os
# Locate the directory this notebook/script lives in. `__file__` is not defined
# inside a Jupyter kernel, so fall back to the current working directory there.
try:
    current_script_directory = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_script_directory = os.getcwd()

# Construct the paths to the package sources relative to that directory
src_directory = os.path.join(current_script_directory, "..", "src")
srcpro_directory = os.path.join(current_script_directory, "..", "src", "propythia")

# Make the sources importable. Entries already present are skipped so that
# re-running this cell does not keep growing sys.path.
for _path in (src_directory, srcpro_directory):
    if _path not in sys.path:
        sys.path.append(_path)



# Quickstart building and using Word Embeddings with Propythia and application to ML and DL models

This notebook intends to go over the building and application of Word embedding vectors to describe biological sequences and their use with ML and DL. The notebook uses protein sequences but the same principle may be used for DNA sequences. 

The Python module Bumblebee was developed for processing biological sequences, aiming to search for semantic
meaning in sequence "words" (such as nucleotides and amino
acids). This module was then integrated in ProPythia. It is organized in sub-modules so that
users can apply them to different specific tasks and adapt
them to fit the problem they are working on. The user can set
specific values for the majority of the parameters, but default
values are established. 


The sub-modules include: 

    1) Read sequence sub-module: To read and/or change sequences. This is especially important to replace nonrelevant/not-common AAs simplifying the vocabulary.
    
    2) Sequence processing sub-module: To generate subsequences; Implements the segmentation of sequences by grams of size n and overlapping (or not) method.
    
    3) Create vocabulary list sub-module: To get all the vocabulary in the dataset, necessary to train the WE.
    It allows fetching a list of n-grams from a pre-existing JSON file, or creating the list if the file is not present.
    
    4) Training word embedding models sub-module: To train and save WE models; It is possible to train W2V
    and FastText models with both CBOW or SG algorithms (based on gensim library). 
    
    5) Load models list sub-module: To load a pre-trained embedding model;
    
    6) Protein Vector representation sub-module: To get a vector representation of a sequence or the matrix of
    vectors accordingly to a model. It obtains a vector for a given n-gram and the number of occurrences of that
    n-gram. Three methods of representing sequences as vectors are implemented, as described below (section 4).
    
    7) Interpretability sub-module: To visualize WE in space and get similarities between vectors. It uses t-SNE to
    create plots related to physicochemical properties of individual AAs, including charge, volume, mass, Van der Waals volume, polarity and hydrophobicity. For n-grams larger than 1, mean values of these properties are presented, as described by Asgari et al. The sub-module also includes binding free energy values for trigrams, based on experimental data. If needed, users can define additional characteristics. Additionally, the models can also retrieve similarity scores and neighborhoods of the n-grams to aid in understanding vector similarities.


Important decisions to take are: 

    Either use a pretrained WE model or train your own model. 
    
    Choose the size of the biological 'words' and the way to represent the final sequence
    
    
For a more detailed explanation of the several modes please check the Quickstart_WordEmbedding jupyter.
We will use the pretrained protvec model in this tutorial. But, as explained in the Quickstart_WordEmbedding jupyter one can train a model with different data and parameters.

In [2]:
import pandas as pd
import numpy as np

# 1. Getting the data

First, let's get the data. We will use an enzyme dataset as an example.
For simplicity, we will use only the first level of the EC number and remove enzymes that have more than one EC number.


In [3]:
data = pd.read_csv('./enzyme/datasets/ecpred_uniprot_uniref_90.csv')
# Drop entries without a sequence
data = data.dropna(subset=['sequence'])
# Drop entries whose sequence contains a '!' character (literal match, not a regex)
data = data[~data['sequence'].str.contains('!', regex=False)]

# Remove promiscuous enzymes, i.e. entries whose ec_number lists several values separated by ';'.
# .copy() makes the filtered frame independent, so adding a column below
# does not trigger pandas' SettingWithCopyWarning.
data = data[~data['ec_number'].str.contains(';', regex=False)].copy()
# Keep only the first level of the EC number
data['ec_1'] = data['ec_number'].str.split('.').str[0]

# Shuffle the rows; the fixed seed makes the order (and therefore every
# downstream result) identical on each run
data = data.sample(frac=1, random_state=42)

print(data.shape)
# Count how many entries fall into each first-level EC class
value_counts = data['ec_1'].value_counts()
print(value_counts)


(169497, 9)
2    51877
3    32939
0    22708
1    18343
6    16158
4    12639
5     8006
7     6827
Name: ec_1, dtype: int64


# 2 Preprocess sequences
Next, we will replace uncommon amino acids. Furthermore, protein sequences have different lengths. Depending on the WE method you are using, you may need all sequences to have the same length. For comparison purposes, we truncate all sequences to a maximum length of 500. 

In [4]:
def transform_seq(seq, max_len):
    """Replace ambiguous/rare amino-acid codes and truncate the sequence.

    Substitutions applied:
        B (Asx: asparagine N / aspartic acid D)  -> N
        Z (Glx: glutamine Q / glutamic acid E)   -> Q
        U (selenocysteine)                       -> C  (closest residue, but a
                                                        different amino acid: take care)
        O (pyrrolysine)                          -> K
        X (unknown)                              -> removed

    Args:
        seq: Protein sequence as a string.
        max_len: Maximum number of residues kept after the substitutions.
            A falsy value (``None`` or ``0``) disables truncation.

    Returns:
        The cleaned, possibly truncated, sequence string.
    """
    # A single translation table is equivalent to the chained replacements
    # because no replacement character is itself a key of the table.
    substitutions = str.maketrans({'B': 'N', 'Z': 'Q', 'U': 'C', 'O': 'K', 'X': None})
    cleaned = seq.translate(substitutions)
    return cleaned[0:max_len] if max_len else cleaned


# Clean every sequence and cap its length at max_len residues
max_len = 500
seqs = data['sequence']
seqs_new = [transform_seq(seq, max_len) for seq in seqs]


# 3 load the WE model
Here we will use the ProtVec model. This means that we will instantiate the WordEmbedding class with a matrix file (ProtVec). The n-gram length will be 3, as used in ProtVec, and the vector dimension is 100. 

protvec file can be obtained at https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JMFHTN


In [5]:
import os

from propythia.wordembedding.word_embedding import WordEmbedding as wv

# Location of the ProtVec embedding matrix. Set the PROTVEC_FILE environment
# variable to point at your own copy of the file; when it is not set, the
# original hard-coded path is used.
protvec_file = os.environ.get(
    'PROTVEC_FILE',
    '/home/martinha/propythia/propythia/src/propythia/wordembedding/protVec_100d_3grams.csv',
)

# ProtVec settings used here: 3-grams and 100-dimensional vectors
w2v = wv(emb_matrix_file=protvec_file,
         ngram_len=3, sequence_max_len=max_len, vectordim=100)

2023-08-15 10:34:45.778005: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


WordEmbedding is running..
--MATRIX LOADED--


# 4 get protein vectors representations
With the WE model loaded now we will transform the sequences into vectors. 
Three methods can be used: 

    • Method 1: Directly substitute the n-grams present in the sequence by their WE vectors. With k the size of the word (n-gram) and N the dimension of the WE vector, a sequence of length L will be represented by a final vector of
    (L − k + 1) ∗ N elements (e.g. 500 − 3 + 1 = 498 trigrams of 100 values each). 
    This method preserves the spatial information of the location of biological words.
    
    • Method 2: k-mer word frequencies are calculated and multiplied by the corresponding WE vectors. A sequence,
    independent of the size, will be represented by a matrix of dimensions Number of words ∗ N.
    
    • Method 3: All the vectors of Method 2 are summed to reproduce a single vector of dimension N.
    
The method chosen will also impact the choice of model. Deep learning architectures such as LSTMs make more sense with method 1, where the sequence order is maintained. Method 3, being simpler, is suitable for ML models. 

# 4.1 Protein representations with method 3 for ML

Method 3: each sequence will be represented by a 100-dimensional vector. 

In [6]:
seqs_to_consider = seqs_new

# Pre-allocate one row per sequence and one column per embedding dimension
# (100, the vectordim the model was loaded with)
num_sequences = len(seqs_to_consider)  # Number of sequences
result_array = np.zeros((num_sequences, 100))

# Fill each row with the method-3 vector of the matching sequence. Iterate over
# seqs_to_consider, the same collection that sized the array, so the two can
# never get out of step if a subset is selected above.
for idx, sequence in enumerate(seqs_to_consider):
    result_array[idx] = w2v.convert_seq2vec(method=3, sequence=sequence, padding=True)

print(result_array.shape)

(169497, 100)


In [7]:
# Target labels: the first-level EC class of every entry
y = data.loc[:, 'ec_1']

In [8]:
import os

from propythia.wordembedding.word_embedding import WordEmbedding as wv

# Location of the ProtVec embedding matrix. Set the PROTVEC_FILE environment
# variable to point at your own copy of the file; when it is not set, the
# original hard-coded path is used.
protvec_file = os.environ.get(
    'PROTVEC_FILE',
    '/home/martinha/propythia/propythia/src/propythia/wordembedding/protVec_100d_3grams.csv',
)

# ProtVec settings used here: 3-grams and 100-dimensional vectors
w2v = wv(emb_matrix_file=protvec_file,
         ngram_len=3, sequence_max_len=max_len, vectordim=100)

WordEmbedding is running..
--MATRIX LOADED--


In [9]:
# Pre-allocate the feature matrix: one 100-long row per sequence
num_sequences = len(seqs_to_consider)
result_array = np.zeros(shape=(num_sequences, 100))

# Write the method-3 representation of every sequence into its own row
for row, sequence in enumerate(seqs_to_consider):
    result_array[row] = w2v.convert_seq2vec(method=3, sequence=sequence, padding=True)

print(result_array.shape)

(169497, 100)


splitting the data

In [10]:
from sklearn.model_selection import train_test_split

print('splitting')
# Features are the sequence vectors built above; targets are the first-level EC classes
X, y = result_array, data['ec_1']

# Hold out 33% of the entries for testing; stratify=y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable
split = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
df_x_train, df_x_test, y_train, y_test = split
for label, part in (('train_x', df_x_train), ('test_x', df_x_test)):
    print(label, part.shape)


splitting
train_x (113562, 100)
test_x (55935, 100)


Import and open Shallow ML propythia class

In [11]:
from propythia.ml.shallow_ml import ShallowML
from sklearn.metrics import make_scorer, matthews_corrcoef

# Create the ProPythia ShallowML object around the train/test split.
# columns_names=None: the features are a plain numpy array without column names.
report = 'ml_svm_mth3_3ngram_100dim_protvec'
ml = ShallowML(
    x_train=df_x_train,
    x_test=df_x_test,
    y_train=y_train,
    y_test=y_test,
    report_name=report,
    columns_names=None,
)


'numpy.ndarray' object has no attribute 'columns'
no features names listed




Try a RF with no param grid (a default one will be used).

In [12]:
report = 'ml_rf_mth3_3ngram_100dim_protvec'
ml = ShallowML(
    x_train=df_x_train,
    x_test=df_x_test,
    y_train=y_train,
    y_test=y_test,
    report_name=report,
    columns_names=None,
)

# TRAIN BEST MODEL
# Grid search over a random forest, scored with the Matthews correlation
# coefficient on 3 cross-validation folds. param_grid=None leaves the choice
# of grid to ShallowML.
mcc_scorer = make_scorer(matthews_corrcoef)
best_model = ml.train_best_model(
    model='rf',
    scaler=None,
    score=mcc_scorer,
    cv=3,
    optType='gridSearch',
    param_grid=None,
    n_jobs=40,
    random_state=1,
    refit=True,
)

# scores, report, cm, cm2 = ml.score_testset(classifier=best_model)
# print(report)
# print(cm)
# print(scores)

'numpy.ndarray' object has no attribute 'columns'
no features names listed
performing gridSearch...
GridSearchCV took 1009.68 seconds for 6 candidate parameter settings.
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scl', None),
                                       ('clf',
                                        RandomForestClassifier(random_state=1))]),
             n_jobs=40,
             param_grid=[{'clf__bootstrap': [True], 'clf__criterion': ['gini'],
                          'clf__max_features': ['sqrt', 'log2'],
                          'clf__n_estimators': [10, 100, 500]}],
             scoring=make_scorer(matthews_corrcoef))
Model with rank: 1
 Mean validation score: 0.531 (std: 0.002)
 Parameters: {'clf__bootstrap': True, 'clf__criterion': 'gini', 'clf__max_features': 'sqrt', 'clf__n_estimators': 500}
 

Model with rank: 2
 Mean validation score: 0.524 (std: 0.002)
 Parameters: {'clf__bootstrap': True, 'clf__criterion': 'gini', 'clf__max_features': 'log2', '

In [13]:
# Evaluate the tuned model on the held-out test set
scores, report, cm, cm2 = ml.score_testset(classifier=best_model)
# Print, in order: the per-class report, the confusion matrix and the scores
for result in (report, cm, scores):
    print(result)

              precision    recall  f1-score   support

           0       0.68      0.69      0.68      7494
           1       0.88      0.33      0.49      6053
           2       0.54      0.87      0.66     17120
           3       0.66      0.56      0.61     10870
           4       0.97      0.40      0.57      4171
           5       0.98      0.26      0.41      2642
           6       0.73      0.69      0.71      5332
           7       0.95      0.63      0.76      2253

    accuracy                           0.64     55935
   macro avg       0.80      0.56      0.61     55935
weighted avg       0.71      0.64      0.63     55935

[[ 5186    22  1381   817     5     4    41    38]
 [  190  2024  2916   630    12     0   271    10]
 [ 1021    71 14883   756     8     1   359    21]
 [  922    56  3463  6137     6     2   277     7]
 [  121    46  1755   433  1660     4   152     0]
 [   83    33  1342   267    11   684   222     0]
 [   65    37  1383   170     0     0  3677

<Figure size 640x480 with 0 Axes>

TODO: state which models can be used with ProPythia.

Use scikit learn instead of Propythia

In [14]:
from sklearn import svm
# RBF-kernel support vector classifier with fixed hyperparameters
clf = svm.SVC(kernel='rbf', C=10, gamma=0.001)

# fit() returns the estimator itself, so best_model and clf are the same object
best_model = clf.fit(df_x_train, y_train)
y_pred = best_model.predict(df_x_test)

# Matthews correlation coefficient on the held-out test set
score = matthews_corrcoef(y_test, y_pred)
print(score)


0.7664658695164139


In [15]:

# # SVC
# param_grid = {'clf__C': [0.1, 1.0, 10],
#                         'clf__kernel': ['rbf'],
#                         'clf__gamma': [0.001,0.0001]}
# # TRAIN BEST MODEL

# # we will use as score the MCC
# best_model = ml.train_best_model(model_name= None , model = 'svc', scaler=None,
#                                      score=make_scorer(matthews_corrcoef),
#                                      cv=3, optType='gridSearch',
#                                      param_grid=param_grid,
#                                      # the default param grids are defined in propythia/src/propythia/adjuv_functions/ml_deep/parameters_shallow.py; you can replace them with a smaller one
#                                      n_jobs=40, random_state=1, n_iter=15, refit=True)

# ##########################################
# # scores = ml.cross_val_score_model(model_name = None,model='svm',
# #                               score='accuracy',
# #                               cv=3,
# #                               n_jobs=10,
# #                               random_state=1)



# scores, report, cm, cm2 = ml.score_testset(classifier=best_model)



In [16]:
# # SVC
# param_grid = {'clf__C': [0.1, 1.0],
#                         'clf__kernel': ['rbf'],
#                         'clf__gamma': [0.001,0.0001]}
# # TRAIN BEST MODEL
# best_model = ml.train_best_model(model = 'svc', scaler=None,
#                                      score=make_scorer(matthews_corrcoef),
#                                      cv=3, optType='gridSearch',
#                                      param_grid=param_grid,
#                                      n_jobs=40, random_state=1, refit=True)

Run SVC algorithm. 

    We will run an hyperparameter search with a defined param grid. 
    We will use a cross validation with 3 folds (for simplicity). 
    Use MCC as the score for search for best model
    Calculate test scores 

Let's start with method 1. Each sequence will be represented by 498 trigram vectors of length 100, that is, a 498 × 100 matrix. 
This matrix can then be flattened if necessary. 
We will use just 200 sequences for simplicity.

In [17]:
# Work on a small subset (first 200 sequences) to keep this example fast
seqs_to_consider = seqs_new[:200]

# Pre-allocate one (498, 100) matrix per sequence:
# 498 = 500 - 3 + 1 trigrams for max_len = 500, and 100 is the vector dimension
num_sequences = len(seqs_to_consider)  # Number of sequences
result_array = np.zeros((num_sequences, 498, 100))

# Fill each slot with the method-1 matrix of the matching sequence. Iterate over
# seqs_to_consider, the same subset that sized the array, instead of repeating
# the slice, so changing the subset size only has to be done in one place.
for idx, sequence in enumerate(seqs_to_consider):
    result_array[idx] = w2v.convert_seq2vec(method=1, sequence=sequence, padding=True)

print(result_array.shape)

(200, 498, 100)
