<a href="https://colab.research.google.com/github/klanita/PoincareMSA/blob/master/PoincareMSA_colab_MMseqs2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://github.com/klanita/PoincareMSA/blob/master/.github/PoincareMSA_small_logo.png?raw=true" height="100" style="height:100px;margin-left: 0px;">

# Poincaré maps for visualization of large protein families

**Authors**: Anna Klimovskaia-Susmelj, Yani Ren, Yann Vander Meersche, Jean-Christophe Gelly and Tatiana Galochkina

PoincaréMSA builds an interactive projection of an input protein multiple sequence alignment (MSA) using a method based on Poincaré maps described by Klimovskaia et al. [1]. It reproduces both the local proximities of protein sequences and the hierarchy contained in the given data. Thus, sequences located closer to the center of the projection correspond to the proteins sharing the most general functional properties and/or appearing at the earlier stages of evolution. Source code is available at https://github.com/klanita/PoincareMSA.

[1] Klimovskaia, A., Lopez-Paz, D., Bottou, L. et al. Poincaré maps for analyzing complex hierarchies in single-cell data. Nat Commun 11, 2966 (2020).

# Notebook initialization

In [None]:
#@title ### Load PoincaréMSA Github repository & install dependencies
# Setup cell: clones the PoincaréMSA repository, installs the Python packages
# and the HH-suite binaries used later, then imports everything the notebook needs.
print("1. Load PoincaréMSA Github repository")
import os
# Clone only when the kernel is still in /content, then move into the clone.
# When the working directory is anything else, both the clone and the %cd are skipped.
if os.getcwd() == "/content":
    !git clone https://github.com/klanita/PoincareMSA.git
    %cd PoincareMSA

# Check if the GPU is activated
import torch
# Selects CUDA when available, CPU otherwise. In the visible cells `device` is
# only printed below; it is not passed to any later command.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('\nUsing device:', device)

# Install missing modules (no versions are pinned, so the latest releases are installed)
print("\n2. Install dependencies")
!pip install adjustText
!pip install -U kaleido
!pip install ncbi-taxonomist

# Install HH-suite (provides hhfilter and reformat.pl used by the MSA cell).
# Skipped entirely when /content/hh-suite/ already exists.
if not os.path.isdir("/content/hh-suite/"):
    !mkdir /content/hh-suite
    %cd /content/hh-suite
    !wget https://github.com/soedinglab/hh-suite/releases/download/v3.3.0/hhsuite-3.3.0-AVX2-Linux.tar.gz
    !tar xfz hhsuite-3.3.0-AVX2-Linux.tar.gz
    # Return to the repository directory after working in /content/hh-suite.
    %cd /content/PoincareMSA/
    # NOTE(review): this message is printed unconditionally; the download and
    # extraction results are not checked.
    print("HH-suite correctly installed.")

import numpy as np
import pandas as pd
import subprocess
import json
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

#File import
from google.colab import files
import io

#Import visualization functions
# NOTE(review): these `scripts.*` imports presumably rely on the working
# directory being the repository root set above — confirm.
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
from scripts.prepare_data.mmseqs2_api import run_mmseqs2
from scripts.prepare_data.uniprot_idmapping_api import submit_id_mapping, check_id_mapping_results_ready, get_id_mapping_results_link, get_id_mapping_results_search
%matplotlib inline

In [None]:
#@title # Compute MSA with MMseqs2
sequence = "GGTLAIQAQGDLTLAQKKIVRKTWHQLMRNKTSFVTDVFIRIFAYDPSAQNKFPQMAGMSASQLRSSRQMQAHAIRVSSIMSEYVEELDSDILPELLATLARTHDLNKVGADHYNLFAKVLMEALQAELGSDFNEKTRDAWAKAFSVVQAVLLVKHGN" #@param {type:"string"}
#@markdown Maximum pairwise sequence identity (%) [0-100].
seq_identity = 50 #@param {type:"number"}
#@markdown Minimum coverage with query (%) [0-100].
cov = 75 #@param {type:"number"}
#@markdown Target diversity of alignment [1,inf].
neff = 7 #@param {type:"number"}

if os.path.isdir("_env"):
    !rm -rf _env
msa = run_mmseqs2(sequence, "./")

nb_seq = 0
mfasta = "_env/uniref.a3m"
with open(mfasta, "r") as f_in:
    for line in f_in:
        if line[0] == ">":
            nb_seq += 1

#Use HHFilter to reduce sequence identity
!/content/hh-suite/bin/hhfilter -i $mfasta -o _env/uniref_filtered.a3m -id $seq_identity -cov $cov -neff $neff -v 0
#Convert a3m to mfasta
!/content/hh-suite/scripts/reformat.pl a3m fas _env/uniref_filtered.a3m _env/uniref_filtered.mfasta -v 0

UnP_ids = []
nb_fseq = 0
mfasta = "_env/uniref_filtered.mfasta"
with open(mfasta, "r") as f_in:
    for line in f_in:
        if line[0] == ">":
            nb_fseq += 1
            UnP_ids.append(line[1:].split()[0])

#Split UniProtKB and UniParc IDs
# An identifier is treated as UniParc when it is 13 characters long and starts
# with "UP"; "101" is ignored (presumably the header of the query sequence —
# TODO confirm); everything else is treated as a UniProtKB accession.
uniparc_ids = []
uniprot_ids = []
for unp in UnP_ids:
    if unp == "101":
        continue
    if len(unp) == 13 and unp[:2] == "UP":
        uniparc_ids.append(unp)
    else:
        uniprot_ids.append(unp)


def _fetch_id_mapping(from_db, ids):
    """Map `ids` from `from_db` to UniParc and return the result dictionary.

    Always returns a dictionary with a "results" list, so the code consuming
    the mapping never meets an undefined variable: an empty list of ids is not
    submitted at all, and a job that does not report ready results yields an
    empty result set instead of leaving the output unbound.
    """
    if not ids:
        return {"results": []}
    job_id = submit_id_mapping(from_db=from_db, to_db="UniParc", ids=ids)
    if not check_id_mapping_results_ready(job_id):
        print(f"No ID mapping results available for {from_db} (job {job_id}).")
        return {"results": []}
    link = get_id_mapping_results_link(job_id)
    return get_id_mapping_results_search(link)


#Fetch UniProtKB annotations
results = _fetch_id_mapping("UniProtKB_AC-ID", uniprot_ids)

#Fetch UniParc annotations
results2 = _fetch_id_mapping("UniParc", uniparc_ids)

#Create annotation dataframe
# The first identifier is left out here; a dedicated "query" row is inserted
# at the top of the table further down in this cell.
annotation_columns = [
    "organism", "proteinName", "taxonId",
    "species", "genus", "family", "order", "class", "phylum", "clade", "superkingdom",
]
df_annotation = pd.DataFrame(UnP_ids[1:], columns=["UnP_ID"])
for column in annotation_columns:
    # Empty string (not NaN) marks a missing annotation
    df_annotation[column] = ""

#Fill the annotation DataFrame
for dict_res in results["results"] + results2["results"]:
    # Only the first UniParc cross-reference of each mapped entry is used.
    # Entries without an identifier or without any cross-reference are skipped.
    try:
        unp = dict_res["from"]
        cross_ref = dict_res["to"]["uniParcCrossReferences"][0]
    except (KeyError, IndexError, TypeError):
        continue
    if not isinstance(cross_ref, dict):
        continue

    row_mask = df_annotation["UnP_ID"] == unp

    # Protein name and organism are filled independently: a missing protein
    # name must not prevent the organism/taxon annotation from being stored.
    if "proteinName" in cross_ref:
        df_annotation.loc[row_mask, "proteinName"] = cross_ref["proteinName"]

    organism = cross_ref.get("organism")
    # Organism name and taxon id are stored together or not at all
    if isinstance(organism, dict) and "scientificName" in organism and "taxonId" in organism:
        df_annotation.loc[row_mask, "organism"] = organism["scientificName"]
        df_annotation.loc[row_mask, "taxonId"] = organism["taxonId"]

#Add lineage from NCBI Taxonomist
lineage_ranks = ("species", "genus", "family", "order", "class", "phylum", "clade", "superkingdom")

# The taxonId column is initialised with empty strings, so a null check alone
# does not remove the sequences without taxon: empty values are dropped as well.
taxon_ids = sorted({
    str(taxid)
    for taxid in df_annotation["taxonId"]
    if pd.notna(taxid) and str(taxid).strip() != ""
})

list_taxon = []
if taxon_ids:
    # Argument list instead of a shell string: nothing coming from the remote
    # annotation is interpreted by a shell.
    completed = subprocess.run(
        ["ncbi-taxonomist", "resolve", "-t", ",".join(taxon_ids)],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print(f"ncbi-taxonomist exited with code {completed.returncode}: {completed.stderr.strip()}")
    list_taxon = completed.stdout.strip().split("\n")

# Each non-empty output line is parsed as one JSON document describing one taxon
for taxon in list_taxon:
    if not taxon.strip():
        continue
    try:
        taxon_dict = json.loads(taxon)
    except json.JSONDecodeError:
        print(f"Skipping unparsable ncbi-taxonomist output line: {taxon!r}")
        continue

    query = taxon_dict["query"]
    row_mask = df_annotation["taxonId"] == int(query)
    for lineage in taxon_dict["lineage"]:
        rank = lineage["rank"]
        # Only the ranks that have a column in the annotation table are kept
        if rank in lineage_ranks:
            df_annotation.loc[row_mask, rank] = lineage["name"]

#Add query line in the DataFrame
# The row is appended under index -1 with "query" in every column; shifting the
# index by one and sorting then puts it at the top of the table.
df_annotation.loc[-1] = ["query"] * len(df_annotation.columns)
df_annotation.index += 1
df_annotation = df_annotation.sort_index()

#Save annotation to csv
path_annotation = "auto_annot.csv"
df_annotation.to_csv(path_annotation, index=False)
# "proteins_id" is listed in addition to the columns of the annotation table
annotation_names = ["proteins_id", *df_annotation.columns]

print(
    f"\nNumber of sequences found: {nb_seq}.",
    f"Number of filtered sequences: {nb_fseq}.",
    sep="\n",
)

# Later cells work with the number of sequences remaining after filtering
nb_seq = nb_fseq

# Data preparation
Here we clean the input .mfasta alignment and translate each sequence to a vector ready for projection.

In [None]:
#@title ## Parameters for data preparation
#@markdown ### Job name
#@markdown Name for the output folder
out_name = "poincareMSA" #@param {type:"string"}

#@markdown ### Threshold for filtering gapped positions
#@markdown Positions with proportion of gaps above the given threshold are removed from the alignment. If your alignment is very gapped, you may want to increase this value.
gapth = 0.9 #@param {type:"number"}

#@markdown ## Run data preparation
#@markdown Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile.

print("1. Data preparation")
prep_parameters = "scripts/prepare_data" + " " + mfasta + " " + out_name + " " + out_name + " " + str(gapth)
bash_projection = "bash scripts/prepare_data/create_projection.sh " + prep_parameters
!{bash_projection}

# Projection

In [None]:
#@title ### Projection parameters
#@markdown Here you control different parameters of Poincaré maps. In our computational experiments the best results were achieved for the following values provided by default. The impact of different parameters is analyzed in the original paper [1].
knn = 5 #@param {type:"number"}
gamma = 2 #@param {type:"number"}
sigma = 1 #@param {type:"number"}
cospca = 0 #@param {type:"number"}
batchs = 4 #@param {type:"number"}
epochs = 1000 #@param {type:"number"}
seed = 0 #@param {type:"number"}


print("\n2. Data projection using Poincaré disk")
#@markdown ## Building projection and preparing data for visualization
#@markdown This step creates a projection of encoded sequences to a Poincaré disk.
bash_pm = "python3 "+ "scripts/build_poincare_map/main.py --input_path " + out_name + "/fasta" + str(gapth) + " --output_path " + out_name + "/projections/ --gamma "+ str(gamma) +" --pca "+ str(cospca) + " --epochs "+ str(epochs) +" --seed "+ str(seed) + " --knn " + str(knn)
!{bash_pm}

# Projection visualization

In [None]:
#@title ### Prepare data for visualization
#@markdown Check if the annotation file is provided and prepares a dataframe for visualization.

# Without an annotation file, fall back to a dummy table numbering the
# sequences from 1 to nb_seq (stored as floats) in a single "id" column.
if not path_annotation:
    path_annotation = "dummy_annotation.csv"
    annotation_names = ["id"]
    df_annotation = pd.DataFrame({"id": np.arange(1, nb_seq + 1, dtype="float")})
    df_annotation.to_csv(path_annotation, index=False)

#Create the DataFrame for visualization
# The embedding file name is rebuilt from the projection parameters
embedding_file = f"PM{knn:1.0f}sigma={sigma:2.2f}gamma={gamma:2.2f}cosinepca={cospca:1.0f}_seed{seed:1.0f}.csv"
path_embedding = f"{out_name}/projections/{embedding_file}"
df_embedding = read_embeddings(path_embedding, path_annotation, withroot=False)

#@markdown Here are different labels found in your annotation file (if one uploaded):
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

In [None]:
#@title ### Create interactive plot
#@markdown Here you can set different parameters to color & annotate the resulting projection:

title = "" #@param {type:"string"}

#Labels name
#@markdown ---
#@markdown #### Select the coloring from annotation .csv file:
labels_name = "" #@param {type:"string"}
# An empty field means "no coloring"; any other value must be a known annotation
if labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")

#Labels text
#@markdown #### Select classes to label among the "labels_name" or "second_labels_name" column (comma separated list):
second_labels_name = "" #@param {type:"string"}
if second_labels_name == "":
    second_labels_name = None
elif second_labels_name not in annotation_names:
    raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')


labels_text = "" #@param {type:"string"}
# Turn the comma separated field into a list of stripped labels;
# [""] stands for "no label requested".
if labels_text:
    try:
        labels_text = [s.strip() for s in labels_text.split(",")]
    except AttributeError:
        print('Error: "label_text" field is not a valid list.')
else:
    labels_text = [""]

#Convert labels_text to the dtype of the column it refers to
# "second_labels_name" takes precedence over "labels_name" when both are set.
dtype_column = second_labels_name or labels_name
if dtype_column and labels_text != [""]:
    # Some valid annotation names (e.g. "proteins_id") are not columns of
    # df_annotation; for those the labels are left as typed.
    if dtype_column in df_annotation.columns:
        labels_text_dtype = df_annotation[dtype_column].dtypes
        try:
            labels_text = list(np.array(labels_text).astype(labels_text_dtype))
        except Exception as err:
            raise TypeError(f'"labels_text" is not compatible with "{dtype_column}" data format ({labels_text_dtype}).') from err

show_text = True #@param {type:"boolean"}
#@markdown ---

#@markdown #### Use a custom color palette:
color_palette = None #@param {type:"raw"}
use_custom_palette = False #@param {type:"boolean"}

# The palette entered above is ignored unless the checkbox is ticked
if not use_custom_palette:
    color_palette = None

#Plot graph
fig = plot_embedding_interactive(
    df_embedding,
    labels_name=labels_name,
    second_labels_name=second_labels_name,
    show_text=show_text,
    labels_text=labels_text,
    color_palette=color_palette,
    title=title,
    fontsize=11,
)
fig.show()

In [None]:
#@title Save plot to file
output_name = "figure" #@param {type:"string"}
output_format = "html" #@param ["png", "html", "pdf", "svg"]

# HTML is written directly; the other formats are written as images using the kaleido engine
output_file = f"{output_name}.{output_format}"
if output_format == "html":
    fig.write_html(output_file)
else:
    fig.write_image(output_file, engine="kaleido")
files.download(output_file)

In [None]:
#@title Download intermediate data
bash_command = f"zip -r -q {out_name}.zip {out_name}"
!{bash_command}

files.download(f"{out_name}.zip")

# Help

### Enabling the GPU

To enable GPU in your notebook, select the following menu options −
```
Runtime / Change runtime type
```

<figure>
<center>
<img src="https://github.com/klanita/PoincareMSA/blob/master/.github/colab_gpu.png?raw=true" width=500>
</center>
</figure>

