In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# Read the descriptor table from CSV.
df = pd.read_csv('.../data_reduced_1.csv')

# "Code" is the label column; every remaining column is a candidate descriptor,
# of which only the numeric ones are kept.
codes = df.loc[:, "Code"]
descriptors = df.drop(columns="Code")
numeric_descriptors = descriptors.select_dtypes(include="number")

# Preprocessing steps: fill missing values with the constant 0, then standardize.
imputer = SimpleImputer(fill_value=0, strategy="constant")
scaler = StandardScaler()

descriptors_imputed = imputer.fit_transform(numeric_descriptors)
descriptors_scaled = scaler.fit_transform(descriptors_imputed)

In [None]:
# Note from the original author: this cell may emit warnings; they can be ignored.

# Ask for k + 1 neighbours because the first distance column is discarded below
# (presumably it is each point's zero distance to itself -- the query set is
# the same array the model was fitted on).
k = 10
nbrs = NearestNeighbors(n_neighbors=k + 1)
nbrs.fit(descriptors_scaled)
distances, _ = nbrs.kneighbors(descriptors_scaled)

# Density is the inverse of the mean distance over the remaining k columns;
# the 1e-5 term keeps the division finite when that mean is zero.
mean_neighbor_distance = distances[:, 1:].mean(axis=1)
density = 1 / (mean_neighbor_distance + 1e-5)

In [None]:
# Embed the scaled descriptors in two dimensions with t-SNE.
# random_state is pinned to a fixed integer seed.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
)
tsne_result = tsne.fit_transform(descriptors_scaled)

In [None]:
# One row per sample: label, the two t-SNE coordinates, and the density value.
vis_df = pd.DataFrame(
    {
        "Zeolite": codes.values,
        "TSNE-1": tsne_result[:, 0],
        "TSNE-2": tsne_result[:, 1],
        "Density": density,
    }
)

# Highest-density rows first, with a fresh 0..n-1 index.
vis_df_sorted = (
    vis_df
    .sort_values(by="Density", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Scatter plot of the t-SNE coordinates, coloured by the "Density" column.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(vis_df_sorted["TSNE-1"], vis_df_sorted["TSNE-2"],
                     c=vis_df_sorted["Density"], cmap="viridis", s=60)
fig.colorbar(scatter, ax=ax, label="Density")
ax.set_title("Zeolite Descriptor Visualization (t-SNE) Colored by Continuous Density")
ax.set_xlabel("TSNE-1")
ax.set_ylabel("TSNE-2")
fig.tight_layout()

# Two groups of codes to highlight, each with its own colour and label offset.
targets = ["AFI", "MFI", "BEA", "ERI", "AFS", "CHA", "SSF"]
targets2 = ["DOH", "THO", "SVR", 'SVY', "SOD"]

# (codes, ring/label colour, label x-offset, label y-offset)
highlight_groups = [
    (targets, "red", 1, -0.5),
    (targets2, "orange", -1, -1.5),
]

for group, color, dx, dy in highlight_groups:
    for code in group:
        subset = vis_df_sorted[vis_df_sorted["Zeolite"] == code]
        if subset.empty:
            # The code is not present in the data: skip it instead of failing
            # with an IndexError on `.values[0]`. Both groups are guarded the
            # same way.
            continue
        # If a code matches several rows, only the first match is marked.
        x, y = subset["TSNE-1"].values[0], subset["TSNE-2"].values[0]
        # Hollow ring around the point, plus an offset text label.
        ax.scatter(x, y, s=200, edgecolors=color, facecolors="none", linewidths=2)
        ax.text(x + dx, y + dy, code, color=color, fontsize=12, weight="bold")

plt.show()