# Load a fasta file containing protein sequences


In [15]:
import numpy as np
import pandas as pd

from my_library import read_fasta

# example dataset from the paper
fasta_file = './datasets/Cas_1_2/cas_1_2_db.fasta'



# show the headers and sequences in the context of a dataframe
label_df = pd.read_csv('./datasets/Cas_1_2/cas_1_2_db_sample.csv').sort_values(by='Protein accession').reset_index(drop=True)



# Generate fixed-size embeddings for each protein sequence

In [18]:
import torch
import esm 

# specify the device for running the model
# if you have a good enough gpu, you can try "cuda" or "cuda:0"
device = 'cuda' 

# load the esm-1b protein language model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()

model.eval() # disable dropout for deterministic results
model = model.to(device)

In [20]:
import sys

embeddings = []
sequences = label_df['seq'].values

with torch.no_grad():
    for n, s in enumerate(sequences):
        sys.stderr.write(f'Progress : {n+1} / {len(sequences)}\r')
        
        batch_labels, batch_strs, batch_tokens = batch_converter([[None, s]])
        batch_tokens = batch_tokens.to(device)
        
        # generate the full size embedding vector
        # this can be kinda big, so we won't save it for this example
        result = model(batch_tokens, repr_layers=[33], return_contacts=False)
        full_size = result["representations"][33].to('cpu')[0]
        
        # derive a fixed size embedding vector
        # there are many ways to do this, but for this example we take the mean of all residue tokens
        fixed_size = full_size[1:-1].mean(0).numpy()
        
        # save the fixed size embedding to a list
        embeddings.append(fixed_size)

# format the embeddings as a single numpy array
embeddings = np.array(embeddings)
embeddings

Progress : 1587 / 1587

array([[ 0.01961797, -0.04840439,  0.0356046 , ..., -0.05662034,
        -0.10600322,  0.08624989],
       [ 0.04697178, -0.06621468,  0.00049452, ..., -0.04547768,
        -0.01199891, -0.1073371 ],
       [ 0.03388705, -0.04778203,  0.00604392, ..., -0.05398354,
        -0.10566252,  0.08738875],
       ...,
       [ 0.06615708, -0.06479326,  0.03419146, ..., -0.09508316,
        -0.08889318, -0.01814755],
       [ 0.04937788, -0.0562344 ,  0.03136954, ..., -0.1028733 ,
        -0.06802339,  0.00472262],
       [ 0.03422253, -0.06352205,  0.02624848, ..., -0.04406855,
        -0.10359808,  0.091972  ]], dtype=float32)

In [21]:
label_df['embeddings'] = embeddings.tolist()

In [22]:
label_df.to_csv('./datasets/Cas_1_2/cas_1_2_db_sample.csv', index=False)

make labels

In [38]:
def make_labels(meta_df,type,id_key,type_key):
    """
    accession: the accession number of the protein
    label: the label of the protein
    """
    tem_label_df = pd.DataFrame({'accession':[],'label':[]})
    for i in range(len(meta_df)):
        tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
    return tem_label_df

cas_protein_df = make_labels(label_df,'cas_protein','Protein accession','cas_protein')
cas_type_df = make_labels(label_df,'cas_type','Protein accession','type')
cas_subtype_df = make_labels(label_df,'cas_subtype','Protein accession','subtype')

cas_protein_df.to_csv('./datasets/Cas_1_2/cas_1_2_protein_labels.csv', index=False)
cas_type_df.to_csv('./datasets/Cas_1_2/cas_1_2_type_labels.csv', index=False)
cas_subtype_df.to_csv('./datasets/Cas_1_2/cas_1_2_subtype_labels.csv', index=False)

  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_key]},ignore_index=True)
  tem_label_df = tem_label_df.append({'accession':meta_df.iloc[i][id_key],'label':meta_df.iloc[i][type_k

# Calculate the all-vs-all distance matrix

In [6]:
from scipy.spatial.distance import cdist

# calculate a distance matrix for the embeddings
# for this example, we will use cosine distance
distmat_embeddings = cdist(embeddings, embeddings, metric='cosine')

ValueError: XA must be a 2-dimensional array.

# Calculate an NJ Tree

In [5]:
from my_library import neighbor_joining

# neighbor joining algorithm
nj_newick_tree = neighbor_joining(distmat_embeddings, label_df['headers'].values)

NameError: name 'distmat_embeddings' is not defined

NameError: name 'nj_newick_tree' is not defined

In [138]:
from ete3 import Tree, TextFace, TreeStyle, NodeStyle

# plot the tree using ete3
t = Tree(nj_newick_tree) # initialize from newick string
t.ladderize() # sort the tree

ts = TreeStyle() # set up the visual style
ts.mode = "c"
t.show(tree_style=ts)

: 

: 