[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yannvm/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

<img src="https://raw.githubusercontent.com/yannvm/PoincareMSA/master/.github/PoincareMSA_small_logo.png" alt="PoincareMSA logo" height="100" style="height:100px;margin-left: 0px;">

# Poincaré maps for visualization of large protein families

PoincareMSA builds a projection of a protein multiple sequence alignment (MSA) on a Poincaré disk. The proximity of the points to the disk center corresponds to their hierarchy and correlates with the proximity of the proteins to the root of the phylogenetic tree. Thus, the most central points often correspond to ancestral proteins, and the proteins located close to the border correspond to the leaves of the phylogenetic tree.

# Notebook initialization

In [None]:
#@title ### 1. Load PoincaréMSA Github repository
print("1. Load PoincaréMSA Github repository")
import os
if os.getcwd() == "/content":
    !git clone https://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@github.com/yannvm/PoincareMSA.git
    %cd PoincareMSA

# Check if the GPU is activated
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('\nUsing device:', device)


#@markdown ### 2. Install dependencies
#Install missing module
print("\n2. Install dependencies")
!pip install adjustText
!pip install -U kaleido

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#File import
from google.colab import files
import io

#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
%matplotlib inline

#Create optional variables
path_annotation = ""

# Data upload

In [None]:
#@title ### Upload MSA in mfasta format
# Open the Colab upload dialog; the first key of the returned mapping is used
# as the path of the alignment file.
uploaded = files.upload()
mfasta = next(iter(uploaded))

# Each sequence record starts with a ">" header line, so counting those lines
# gives the number of sequences in the alignment.
with open(mfasta, "r") as msa_handle:
    nb_seq = sum(1 for record_line in msa_handle if record_line.startswith(">"))

print(f"\nNumber of sequences found: {nb_seq}.")

In [None]:
#@title ### Upload annotation file (optional)
# Open the Colab upload dialog; the first key of the returned mapping is used
# as the path of the candidate annotation file.
uploaded = files.upload()
uploaded_annotation = next(iter(uploaded))

# Only *parsing* failures are reported as a format problem. The length check
# is done outside the try block so its own message is not masked.
try:
    df_uploaded = pd.read_csv(uploaded_annotation)
except Exception as err:
    raise ValueError("Annotation file is not in .csv format.") from err

# One annotation row is expected per sequence of the uploaded alignment.
if len(df_uploaded) != nb_seq:
    raise ValueError("Annotation file doesn't match the .mfasta file length.")

# Publish the annotation only once it has been validated, so a rejected file
# is never picked up by the following cells.
path_annotation = uploaded_annotation
df_annotation = df_uploaded

print("\nAnnotation file correctly loaded.")
annotation_names = list(df_annotation.columns)
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

# Settings

In [None]:
#@title ### Job name
# Run name: used as the output directory prefix for the prepared data and the
# projections, and as the base name of the downloadable .zip archive.
out_name = "poincareMSA" #@param {type:"string"}
#@markdown ---

#@markdown ### Threshold for filtering gapped positions
# Passed to scripts/prepare_data/create_projection.sh; also part of the
# prepared-data directory name (`<out_name>/fasta<gapth>`).
gapth = 0.9 #@param {type:"number"}
#@markdown ---

#@markdown ### Projection parameters
# knn, gamma, cospca, epochs and seed are forwarded to
# scripts/build_poincare_map/main.py as --knn, --gamma, --pca, --epochs and
# --seed respectively.
# NOTE(review): `sigma` is not forwarded to main.py; it is only used to build
# the expected projection file name. `batchs` is not referenced by any other
# cell visible in this notebook. Confirm whether both should be passed on.
knn = 5 #@param {type:"number"}
gamma = 2 #@param {type:"number"}
sigma = 1 #@param {type:"number"}
cospca = 0 #@param {type:"number"}
batchs = 4 #@param {type:"number"}
epochs = 1000 #@param {type:"number"}
seed = 0 #@param {type:"number"}

# Data preparation and data projection using Poincaré disk

In [None]:
#@title ## 1. Data preparation
#@markdown Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile.

print("1. Data preparation")
prep_parameters = "scripts/prepare_data" + " " + mfasta + " " + out_name + " " + out_name + " " + str(gapth)
bash_projection = "bash scripts/prepare_data/create_projection.sh " + prep_parameters
!{bash_projection}

print("\n2. Data projection using Poincaré disk")
#@markdown ## 2. Data projection using Poincaré disk
#@markdown This step creates a projection of encoded sequences to a Poincaré disk.
bash_pm = "python3 "+ "scripts/build_poincare_map/main.py --input_path " + out_name + "/fasta" + str(gapth) + " --output_path " + out_name + "/projections/ --gamma "+ str(gamma) +" --pca "+ str(cospca) + " --epochs "+ str(epochs) +" --seed "+ str(seed) + " --knn " + str(knn)
!{bash_pm}

print("\n3. Format data for visualization")
#@markdown ## 3. Format data for visualization
#Check that an annotation file was provided. Create a dummy one instead
if not path_annotation:
    df_annotation = pd.DataFrame(list(zip(list(range(1,nb_seq+1)), np.full(nb_seq, "-", dtype=object))), columns=["id", "default"])
    df_annotation.to_csv("dummy_annotation.csv", index=False)
    path_annotation = "dummy_annotation.csv"
    annotation_names = ["id", "default"]


path_embedding = f"{out_name}/projections/PM{knn:1.0f}sigma={sigma:2.2f}gamma={gamma:2.2f}cosinepca={cospca:1.0f}_seed{seed:1.0f}.csv"
df_embedding = read_embeddings(path_embedding, path_annotation, withroot=False)

# Projection visualization

In [None]:
#@title ### Create interactive plot

def _resolve_annotation_column(column, field_label, available, using_dummy):
    """Turn a form value into an annotation column name, or None.

    An empty value selects the "default" column when the auto-generated dummy
    annotation is in use, and no column (None) otherwise.

    Raises:
        NameError: if a non-empty value is not one of `available`.
    """
    if column == "":
        return "default" if using_dummy else None
    if column not in available:
        raise NameError(f"{field_label} {column} is not in the availables annotations.\nAvailables annotations: {available}")
    return column


def _cast_labels_to_column_dtype(labels, column, annotation):
    """Convert the label strings to the dtype of `annotation[column]`.

    Raises:
        TypeError: if the labels cannot be converted to the column's dtype.
    """
    # Looked up outside the try block so the error message can always show it.
    column_dtype = annotation[column].dtypes
    try:
        return list(np.array(labels).astype(column_dtype))
    except (ValueError, TypeError) as err:
        raise TypeError(f'"labels_text" is not compatible with "{column}" data format ({column_dtype}).') from err


title = "" #@param {type:"string"}

_using_dummy_annotation = path_annotation == "dummy_annotation.csv"

#Labels name
#@markdown ---
#@markdown #### Select the coloring from annotation .csv file:
labels_name = "" #@param {type:"string"}
labels_name = _resolve_annotation_column(
    labels_name, "labels_name", annotation_names, _using_dummy_annotation)

#Labels text
#@markdown #### Select classes to label among the "labels_name" or "second_labels_name" column (comma separated list):
second_labels_name = "" #@param {type:"string"}
second_labels_name = _resolve_annotation_column(
    second_labels_name, '"second_labels_name"', annotation_names, _using_dummy_annotation)


labels_text = "" #@param {type:"string"}
# Comma separated form value -> list of stripped labels; [""] means "no label".
labels_text = [s.strip() for s in labels_text.split(",")] if labels_text else [""]

# Convert labels_text to the dtype of the column it refers to: the coloring
# column when no second column is selected, the second column otherwise.
if labels_name and second_labels_name is None:
    _labels_column = labels_name
else:
    _labels_column = second_labels_name
if _labels_column and _labels_column != "default" and labels_text != [""]:
    labels_text = _cast_labels_to_column_dtype(labels_text, _labels_column, df_annotation)

show_text = True #@param {type:"boolean"}
#@markdown ---

#@markdown #### Use a custom color palette:
color_palette = None #@param {type:"raw"}
use_custom_palette = False #@param {type:"boolean"}

# The palette entered in the form is ignored unless explicitly enabled.
if not use_custom_palette:
    color_palette = None

#Plot graph
fig = plot_embedding_interactive(df_embedding, 
                                 labels_name = labels_name,
                                 second_labels_name = second_labels_name, 
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette, 
                                 title = title, 
                                 fontsize = 11)
fig.show()

In [None]:
#@title Save plot to file
output_name = "fig1" #@param {type:"string"}
output_format = "png" #@param ["png", "html", "pdf", "svg"]

# Build the target file name once and reuse it for writing and downloading.
figure_file = f"{output_name}.{output_format}"
if output_format == "html":
    # HTML export keeps the figure interactive.
    fig.write_html(figure_file)
else:
    # Static formats (png, pdf, svg) are rendered through kaleido.
    fig.write_image(figure_file, engine="kaleido")
files.download(figure_file)

In [None]:
#@title Download intermediate data
bash_command = f"zip -r -q {out_name}.zip {out_name}"
!{bash_command}

files.download(f"{out_name}.zip")

# Help

### Enabling the GPU

To enable the GPU in your notebook, select the following menu option:
```
Runtime / Change runtime type
```

<figure>
<center>
<img src="https://raw.githubusercontent.com/yannvm/PoincareMSA/master/.github/colab_gpu.png" alt="Colab runtime type dialog with GPU selected" width=500>
</center>
</figure>

