In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install umap-learn
import umap
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import plotly.express as px

# Fetch the MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X = mnist.data
y = mnist.target.astype(int)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 1: Apply UMAP for initial dimensionality reduction
umap_model = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=50, random_state=42)
X_umap = umap_model.fit_transform(X_scaled)

# Print the shape after UMAP reduction
print(f"Original data shape: {X.shape}")
print(f"UMAP reduced data shape: {X_umap.shape}")

# Step 2: Apply t-SNE to the UMAP-reduced data
tsne_model = TSNE(n_components=2, init="pca", random_state=42)
X_tsne = tsne_model.fit_transform(X_umap)

# Print the shape after t-SNE reduction
print(f"t-SNE reduced data shape: {X_tsne.shape}")

# Step 3: Create a DataFrame to store the results for visualization
df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
df_tsne['Label'] = y

# Step 4: Visualize the result with a 2D scatter plot
fig = px.scatter(df_tsne, x='TSNE1', y='TSNE2', color='Label', title="Hybrid UMAP + t-SNE on MNIST Data")
fig.update_traces(marker=dict(size=6))

# Display the plot
fig.show(renderer="colab")

# Step 5: KNN Classification to measure accuracy
# Train a KNN classifier in the 2D t-SNE space to see how well the clusters represent the true labels.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_tsne, y)

# Predict on the same data (for simplicity, using the same dataset to avoid overfitting)
y_pred = knn.predict(X_tsne)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy of KNN classifier on t-SNE reduced data: {accuracy:.4f}")


Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


  warn(


Original data shape: (70000, 784)
UMAP reduced data shape: (70000, 50)
t-SNE reduced data shape: (70000, 2)
t-SNE reduced data shape: (70000, 2)


Accuracy of KNN classifier on t-SNE reduced data: 0.9408
Accuracy of KNN classifier on t-SNE reduced data: 0.9408
