In [28]:
import pandas as pd
import numpy as np
import umap.umap_ as umap
import prince
from sklearn.preprocessing import StandardScaler
import pacmap
from sklearn.manifold import SpectralEmbedding
from scipy.spatial.distance import cdist
import plotly.express as px

In [29]:
# Read the penguins table and keep only the rows with no missing value.
df = (
    pd.read_csv('penguins.csv')
    .dropna(axis=0, how='any')
)

# FAMD

In [30]:
# Factor Analysis of Mixed Data: project the mixed-type frame onto 2 components.
# random_state is pinned so a fresh "Restart & Run All" yields the same
# coordinates (NOTE(review): assumes the installed prince version uses a
# randomized SVD engine by default — harmless if it does not).
famd = prince.FAMD(n_components=2, random_state=42).fit(df)

# Row coordinates in the 2-D factor space; `reduced` is also the input of the
# PaCMAP cell further down, so the name and the X/Y column labels are kept.
reduced = famd.row_coordinates(df)
reduced.columns = ['X', 'Y']

# Inertia captured by both components together, as a percentage string.
tot_inertia = f"{round(100*famd.explained_inertia_.sum(),2)}"

# Axis captions carrying the per-component share of inertia.
labs = {
    "X": f"Component 0 - ({round(100*famd.explained_inertia_[0],2)}% inertia)",
    "Y": f"Component 1 - ({round(100*famd.explained_inertia_[1],2)}% inertia)",
}

fig = px.scatter(
    reduced,
    x='X', y='Y',
    labels=labs,
    title=f'FAMD ({tot_inertia}% inertia) :',
)
fig.show()

# Laplacian Eigenmaps

In [31]:
# Laplacian Eigenmaps on a mixed-type dissimilarity:
#   d(i, j) = squared Euclidean distance on standardized numeric columns
#             + gamma * Hamming distance on the categorical columns
# which is turned into an affinity with exp(-d) and fed to SpectralEmbedding.

# New frame without the optional 'cluster' label column (df itself is untouched).
df2 = df.drop(columns='cluster', errors='ignore')

# Split by dtype. Everything that is not numeric is treated as categorical so
# that category / bool / string columns are not silently left out.
numerical = df2.select_dtypes(include='number')
categorical = df2.select_dtypes(exclude='number')

scaler = StandardScaler()
numerical = scaler.fit_transform(numerical)

# Integer-encode every categorical column. Hamming distance only checks
# equality, so the actual code values are irrelevant.
categorical = categorical.apply(lambda col: pd.factorize(col)[0])

# Weight of the categorical term: half the mean per-column standard deviation
# of the (scaled) numeric block. axis=0 is required, otherwise np.std collapses
# the whole matrix to a single scalar and the mean is a no-op.
gamma = np.mean(np.std(numerical, axis=0)) / 2

# Pairwise dissimilarity matrix; `distances` is reused by the UMAP cell.
distances = cdist(numerical, numerical, 'sqeuclidean')
if categorical.shape[1] > 0:
    # Skip the Hamming term when there is no categorical column to compare.
    distances = distances + cdist(categorical, categorical, 'hamming') * gamma

# Affinity matrix: exp(-d) with unit bandwidth.
kernel = pd.DataFrame(np.exp(-distances))

# random_state pinned for reproducible embeddings across runs.
lap = SpectralEmbedding(
    n_components=2, affinity="precomputed", random_state=42
).fit_transform(kernel)
lap = pd.DataFrame(lap, columns=['X', 'Y'])

fig = px.scatter(
    lap,
    x='X', y='Y',
    title='Laplacian Eigenmaps',
)
fig.show()

# UMAP

In [32]:
# UMAP on the precomputed mixed-type dissimilarity matrix `distances`
# (built in the Laplacian cell above).
# UMAP is stochastic: random_state is pinned so the layout is reproducible
# on a fresh kernel.
um = umap.UMAP(
    n_components=2,
    metric='precomputed',
    random_state=42,
).fit_transform(distances)
um = pd.DataFrame(um, columns=['X', 'Y'])

fig = px.scatter(
    um,
    x='X', y='Y',
    title='UMAP',
)
fig.show()


using precomputed metric; inverse_transform will be unavailable



# PaCMAP

In [33]:
# PaCMAP applied to the FAMD row coordinates (`reduced`, from the FAMD cell).
# apply_pca=False: the input is fed to PaCMAP without its PCA preprocessing.
# PaCMAP is stochastic: random_state is pinned so the layout is reproducible
# on a fresh kernel.
pm = pacmap.PaCMAP(
    n_components=2,
    apply_pca=False,
    random_state=42,
).fit_transform(reduced)
pm = pd.DataFrame(pm, columns=['X', 'Y'])

fig = px.scatter(
    pm,
    x='X', y='Y',
    title='PaCMAP',
)
fig.show()


Running ANNOY Indexing on high-dimensional data. Nearest-neighbor search may be slow!

