# ⌛️ FIERLENIUS 🦙

✍ Authors:

**ROUAUD Lucas**

Master 2 Bio-informatics at *Université de Paris*

[![Python 3.10.8](https://img.shields.io/badge/python-%E2%89%A5_3.10.8-blue.svg)](https://www.python.org/downloads/release/python-3108/)
[![Conda 22.11.1](https://img.shields.io/badge/miniconda-%E2%89%A5_22.11.1-green.svg)](https://docs.conda.io/en/latest/miniconda.html)
[![GitHub last commit](https://img.shields.io/github/last-commit/FilouPlains/FIERLENIUS.svg)](https://github.com/FilouPlains/FIERLENIUS)
![GitHub stars](https://img.shields.io/github/stars/FilouPlains/FIERLENIUS.svg?style=social)

In [2]:
# Import packages

# [M]
import matplotlib as mat
# [N]
import numpy as np
# [P]
import plotly.express as px
import plotly.graph_objects as go
# [R]
from random import seed

# [M]
from matplotlib import cm, colors
# [P]
from plotly import offline
# [S]
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE
# [U]
from umap import UMAP



In [3]:
# Seed every global random generator used in this notebook.
# `seed(1)` only covers Python's `random` module; scikit-learn estimators left
# with `random_state=None` draw from NumPy's global generator instead, so it is
# seeded as well to make a "Restart & Run All" repeatable.
seed(1)
np.random.seed(1)

# Load plotly.js in the notebook so that figures render inline.
offline.init_notebook_mode()

# Get data.
# NOTE(review): `allow_pickle=True` lets `np.load` unpickle arbitrary Python
# objects, which can execute code; only load files from a trusted source.
data: np.ndarray = np.load(
    "../../data/word_data_2023-01-31_14-14-13.npy",
    allow_pickle=True
)

# Embedding data projection

In this notebook, we look at different approaches to observe our embedding data after a 2D projection.

## Principal component analysis or PCA

In [4]:
# Fit a PCA restricted to its first three components on the embedding data.
pca = PCA(n_components=3)
pca.fit(data)

# Project the same data onto the fitted components. The result is transposed
# so that `transform_data_pca[k]` holds the coordinates along component `k`.
transform_data_pca: np.ndarray = pca.transform(data).T

# Share of the variance carried by each component, then the singular values.
print(pca.explained_variance_ratio_)
print(pca.singular_values_)


[0.84206049 0.09750159 0.02474504]
[10.1684014   3.46008603  1.74311139]


In [5]:
# Scatter-plot matrix of the three PCA components, one marker per data point.

# Get the number of data (points along the first component).
length: int = transform_data_pca[0].shape[0]

# To plot colors: one hexadecimal viridis colour per point, following the
# order of the points in the data.
# `mat.colormaps` is used instead of `cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. Sampling the colormap at `length` evenly
# spaced positions gives the same colours as a colormap resampled to `length`
# entries and indexed by integer.
viridis: object = mat.colormaps["viridis"]
color: "list[str]" = [
    colors.rgb2hex(rgba) for rgba in viridis(np.linspace(0, 1, length))
]

# Create a new figure.
plot_pca: object = go.Figure()

# Multiple scatter plot (every pair of components against each other).
plot_pca.add_trace(go.Splom(
    dimensions=[
        dict(label="<b>PCA 1</b>", values=transform_data_pca[0]),
        dict(label="<b>PCA 2</b>", values=transform_data_pca[1]),
        dict(label="<b>PCA 3</b>", values=transform_data_pca[2])
    ],
    marker=dict(color=color),
    name="PCA",
    # A single trace is not listed in the legend unless asked explicitly.
    showlegend=True
))

# Borders, in paper coordinates (0 to 1 across the plotting area): the thick
# outer rectangle first, then the middle column and the middle row, whose
# edges draw the inner lines of the 3 x 3 grid.
for x0, y0, x1, y1, width in (
    (0, 0, 1, 1, 2),
    (1/3, 0, 2/3, 1, 1),
    (0, 1/3, 1, 2/3, 1),
):
    plot_pca.add_shape(
        type="rect",
        xref="paper",
        yref="paper",
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1,
        line=dict(color="black", width=width)
    )

# Modify general plot properties.
plot_pca.update_layout(
    plot_bgcolor="white",
    legend_title="<b>Principal component<br />analysis</b>",
    margin=dict(l=20, r=20, t=25, b=20),
    height=800,
    font=dict(size=14, color="black")
)

# Modify axis properties.
plot_pca.update_xaxes(showline=True, linewidth=1, linecolor="black")
plot_pca.update_yaxes(showline=True, linewidth=1, linecolor="black")

# Show the plot.
plot_pca.show()

## Multidimensional scaling or MDS

In [6]:
# Performs a MDS.
# /!\ Take from 30 to 35 s!
# `random_state` is fixed because the `seed(1)` call of the setup cell only
# seeds Python's `random` module, which scikit-learn does not use: without it,
# the layout changes on every run.
embedding: object = MDS(n_components=2, random_state=1)

# Transposed so that `transform_data_mds[k]` holds the coordinates along
# dimension `k`.
transform_data_mds: np.ndarray = np.transpose(embedding.fit_transform(data))

print(transform_data_mds.shape)

(2, 1916)


In [7]:
# Scatter plot of the two MDS dimensions, one marker per data point.

# Get the number of data (points along the first dimension).
length: int = transform_data_mds[0].shape[0]

# To plot colors: one hexadecimal viridis colour per point, following the
# order of the points in the data.
# `mat.colormaps` is used instead of `cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. Sampling the colormap at `length` evenly
# spaced positions gives the same colours as a colormap resampled to `length`
# entries and indexed by integer.
viridis: object = mat.colormaps["viridis"]
color: "list[str]" = [
    colors.rgb2hex(rgba) for rgba in viridis(np.linspace(0, 1, length))
]

# Create a new figure.
plot_mds: object = go.Figure()

# Scatter plot of the projected points.
plot_mds.add_trace(go.Scatter(
    x=transform_data_mds[0],
    y=transform_data_mds[1],
    mode="markers",
    marker=dict(color=color),
    name="MDS",
    # A single trace is not listed in the legend unless asked explicitly.
    showlegend=True
))

# Add the rectangle border (paper coordinates span the whole plotting area).
plot_mds.add_shape(
    type="rect",
    xref="paper",
    yref="paper",
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line=dict(color="black", width=2)
)

# Modify general plot properties.
plot_mds.update_layout(
    plot_bgcolor="white",
    legend_title="<b>Multidimensional<br />scaling</b>",
    margin=dict(l=20, r=20, t=25, b=20),
    height=500,
    font=dict(size=14, color="black"),
    xaxis_title="<b>First dimension</b>",
    yaxis_title="<b>Second dimension</b>"
)

# Modify axis properties.
plot_mds.update_xaxes(showline=True, linewidth=1, linecolor="black")
plot_mds.update_yaxes(showline=True, linewidth=1, linecolor="black")

# Show the plot.
plot_mds.show()

## Uniform Manifold Approximation and Projection for Dimension Reduction or UMAP

In [8]:
# Performs a UMAP.
# /!\ Take around 5 s.
# `random_state` is fixed because the `seed(1)` call of the setup cell only
# seeds Python's `random` module, so the embedding would otherwise change on
# every run. `n_components=2` is the UMAP default, written out to match the
# other projections.
# NOTE(review): a fixed `random_state` presumably makes UMAP run
# single-threaded, so the timing above may increase - confirm.
reducer: object = UMAP(n_components=2, random_state=1)

# Transposed so that `transform_data_umap[k]` holds the coordinates along
# dimension `k`.
transform_data_umap: np.ndarray = np.transpose(reducer.fit_transform(data))

print(transform_data_umap.shape)

(2, 1916)


In [9]:
# Scatter plot of the two UMAP dimensions, one marker per data point.

# Get the number of data (points along the first dimension).
length: int = transform_data_umap[0].shape[0]

# To plot colors: one hexadecimal viridis colour per point, following the
# order of the points in the data.
# `mat.colormaps` is used instead of `cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. Sampling the colormap at `length` evenly
# spaced positions gives the same colours as a colormap resampled to `length`
# entries and indexed by integer.
viridis: object = mat.colormaps["viridis"]
color: "list[str]" = [
    colors.rgb2hex(rgba) for rgba in viridis(np.linspace(0, 1, length))
]

# Create a new figure.
plot_umap: object = go.Figure()

# Scatter plot of the projected points.
plot_umap.add_trace(go.Scatter(
    x=transform_data_umap[0],
    y=transform_data_umap[1],
    mode="markers",
    marker=dict(color=color),
    name="UMAP",
    # A single trace is not listed in the legend unless asked explicitly.
    showlegend=True
))

# Add the rectangle border (paper coordinates span the whole plotting area).
plot_umap.add_shape(
    type="rect",
    xref="paper",
    yref="paper",
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line=dict(color="black", width=2)
)

# Modify general plot properties.
plot_umap.update_layout(
    plot_bgcolor="white",
    legend_title="<b>Uniform manifold<br />approximation and projection<br />for dimension reduction</b>",
    margin=dict(l=20, r=20, t=25, b=20),
    height=500,
    font=dict(size=14, color="black"),
    xaxis_title="<b>First dimension</b>",
    yaxis_title="<b>Second dimension</b>"
)

# Modify axis properties.
plot_umap.update_xaxes(showline=True, linewidth=1, linecolor="black")
plot_umap.update_yaxes(showline=True, linewidth=1, linecolor="black")

# Show the plot.
plot_umap.show()


## T-distributed stochastic neighbor embedding or t-SNE

In [10]:
# Performs a t-SNE.
# /!\ Take around 10 s.
# `random_state` is fixed because the `seed(1)` call of the setup cell only
# seeds Python's `random` module, which scikit-learn does not use: without it,
# the embedding changes on every run.

# Transposed so that `transform_data_tsne[k]` holds the coordinates along
# dimension `k`.
transform_data_tsne: np.ndarray = np.transpose(TSNE(
    n_components=2,
    init="pca",
    learning_rate="auto",
    random_state=1).fit_transform(data))

print(transform_data_tsne.shape)



The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



(2, 1916)


In [11]:
# Scatter plot of the two t-SNE dimensions, one marker per data point.

# Get the number of data (points along the first dimension).
length: int = transform_data_tsne[0].shape[0]

# To plot colors: one hexadecimal viridis colour per point, following the
# order of the points in the data.
# `mat.colormaps` is used instead of `cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. Sampling the colormap at `length` evenly
# spaced positions gives the same colours as a colormap resampled to `length`
# entries and indexed by integer.
viridis: object = mat.colormaps["viridis"]
color: "list[str]" = [
    colors.rgb2hex(rgba) for rgba in viridis(np.linspace(0, 1, length))
]

# Create a new figure.
plot_tnse: object = go.Figure()

# Scatter plot of the projected points.
plot_tnse.add_trace(go.Scatter(
    x=transform_data_tsne[0],
    y=transform_data_tsne[1],
    mode="markers",
    marker=dict(color=color),
    name="t-SNE",
    # A single trace is not listed in the legend unless asked explicitly.
    showlegend=True
))

# Add the rectangle border (paper coordinates span the whole plotting area).
plot_tnse.add_shape(
    type="rect",
    xref="paper",
    yref="paper",
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line=dict(color="black", width=2)
)

# Modify general plot properties.
plot_tnse.update_layout(
    plot_bgcolor="white",
    legend_title="<b>T-distributed stochastic<br />neighbor embedding</b>",
    margin=dict(l=20, r=20, t=25, b=20),
    height=500,
    font=dict(size=14, color="black"),
    xaxis_title="<b>First dimension</b>",
    yaxis_title="<b>Second dimension</b>"
)

# Modify axis properties.
plot_tnse.update_xaxes(showline=True, linewidth=1, linecolor="black")
plot_tnse.update_yaxes(showline=True, linewidth=1, linecolor="black")

# Show the plot.
plot_tnse.show()
