### Step 1: Install Required Libraries

In [None]:
!pip install torch torchvision datasets transformers scikit-learn matplotlib

### Step 2: Import Necessary Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.manifold import TSNE
from scipy.stats import mode

### Step 3: Load Fashion MNIST Dataset from HuggingFace Manually

In [None]:
# Parquet shard for each split, relative to the dataset repository root.
splits = {
    'train': 'fashion_mnist/train-00000-of-00001.parquet',
    'test': 'fashion_mnist/test-00000-of-00001.parquet',
}

# Only the training split is loaded here, read through the hf:// URL.
train_parquet_url = "hf://datasets/zalando-datasets/fashion_mnist/" + splits["train"]
df = pd.read_parquet(train_parquet_url)
df.head()

### Step 4: Convert Image Bytes to NumPy Arrays

In [None]:
from PIL import Image
import io

def decode_image(example):
    """Decode one encoded image record into a NumPy array.

    Args:
        example: Mapping whose ``'bytes'`` entry holds the encoded image data.

    Returns:
        np.ndarray built from the image that PIL opens from those bytes.
    """
    encoded_buffer = io.BytesIO(example['bytes'])
    picture = Image.open(encoded_buffer)
    return np.array(picture)

# Decode every image record, then stack them into one array (first axis = sample).
decoded_images = [decode_image(record) for record in df['image']]
X = np.stack(decoded_images)

# Flatten each image to a vector and scale by 1/255.
X = X.reshape(len(X), -1).astype("float32") / 255.0  # Normalize to (0, 1)

# Labels are kept only for evaluating the clustering later on.
y = df['label'].to_numpy()

### Step 5: Define Autoencoder Model

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder.

    The encoder maps ``input_dim -> hidden_dims... -> latent_dim`` with ReLU
    between the linear layers (no activation on the latent code). The decoder
    mirrors those widths back to ``input_dim`` and ends with a Sigmoid, so
    reconstructions lie in [0, 1].

    With the default arguments the network is 784-128-64-32-64-128-784, and
    the ``encoder`` / ``decoder`` Sequential layouts (hence state-dict keys)
    are the same as a hand-written version of that architecture.

    Args:
        input_dim: Size of each flattened input vector.
        hidden_dims: Widths of the encoder's hidden layers, in order. The
            decoder uses them in reverse. May be empty.
        latent_dim: Size of the bottleneck representation.
    """

    def __init__(self, input_dim=784, hidden_dims=(128, 64), latent_dim=32):
        super().__init__()
        encoder_sizes = [input_dim, *hidden_dims, latent_dim]
        decoder_sizes = encoder_sizes[::-1]

        # Encoder is built before the decoder so parameter creation order
        # (and therefore seeded initialisation) is encoder-first.
        self.encoder = nn.Sequential(*self._mlp_layers(encoder_sizes))
        self.decoder = nn.Sequential(*self._mlp_layers(decoder_sizes), nn.Sigmoid())

    @staticmethod
    def _mlp_layers(sizes):
        """Return Linear layers for consecutive ``sizes`` with ReLU between them."""
        layers = []
        last_linear = len(sizes) - 2
        for idx, (fan_in, fan_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            layers.append(nn.Linear(fan_in, fan_out))
            if idx < last_linear:  # no activation after the final Linear
                layers.append(nn.ReLU())
        return layers

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        out = self.decoder(z)
        return out

### Step 6: Train Autoencoder

In [None]:
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# repeat across Restart & Run All (GPU kernels may still add small
# nondeterminism).
SEED = 42
NUM_EPOCHS = 20
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Autoencoder().to(device)

# The dataset holds only the inputs: the autoencoder's target is its own input.
train_loader = DataLoader(TensorDataset(torch.tensor(X)), batch_size=128, shuffle=True)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()  # explicit, so the mode does not depend on earlier cell runs
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    for (x_batch,) in train_loader:
        x_batch = x_batch.to(device).float()
        output = model(x_batch)
        loss = criterion(output, x_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch reconstruction loss for this epoch.
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss / len(train_loader):.4f}")

### Step 7: Encode Data for Clustering

In [None]:
# Switch to inference mode and run only the encoder half of the network;
# gradients are not needed for feature extraction.
model.eval()
with torch.no_grad():
    latent_codes = model.encoder(torch.tensor(X).to(device).float())
    # Bring the codes back to host memory as a NumPy array for scikit-learn.
    compressed = latent_codes.cpu().numpy()

### Step 8: KMeans Clustering + Silhouette Score

In [None]:
# Cluster the encoded vectors into 10 groups with a fixed random_state.
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(compressed)

# Silhouette is computed on the same encoded features used for clustering.
silhouette = silhouette_score(compressed, cluster_labels)
print("Silhouette Score:", silhouette)

### Step 9: Purity Score

In [None]:
def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Each cluster is credited with the count of its most frequent true label;
    purity is the sum of those counts divided by the number of samples. This
    equals the accuracy obtained by relabelling every cluster with its
    majority class.

    Cluster ids are taken from the values actually present in ``y_pred``, so
    any number of clusters and any id values are handled.

    Args:
        y_true: Array-like of ground-truth labels, one per sample.
        y_pred: Array-like of cluster assignments, same shape as ``y_true``.

    Returns:
        float in (0, 1]; 1.0 means every cluster contains a single class.

    Raises:
        ValueError: If the inputs differ in shape or contain no samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("purity_score requires at least one sample")

    majority_total = 0
    for cluster_id in np.unique(y_pred):
        # Count how often each true label occurs inside this cluster.
        _, label_counts = np.unique(y_true[y_pred == cluster_id], return_counts=True)
        majority_total += int(label_counts.max())
    return majority_total / y_true.size

# Compare the KMeans assignments against the ground-truth labels.
purity = purity_score(y_true=y, y_pred=cluster_labels)
print(f"Purity Score: {purity:.4f}")

### Step 10: Visualize Clusters using t-SNE

In [None]:
# Project the encoded vectors to 2-D for plotting.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(compressed)

# One point per sample, coloured by its KMeans cluster id.
emb_x, emb_y = tsne_result[:, 0], tsne_result[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(emb_x, emb_y, c=cluster_labels, cmap='tab10', s=10)
ax.set_title("t-SNE Clustering of Encoded Data")
fig.colorbar(points, ax=ax)
plt.show()

### Step 11: Adjusted Rand Index (ARI)

In [None]:
from sklearn.metrics import adjusted_rand_score
# Agreement between the true classes and the cluster assignments.
ari = adjusted_rand_score(labels_true=y, labels_pred=cluster_labels)
print(f"Adjusted Rand Index: {ari:.4f}")

### Step 12: Normalized Mutual Information (NMI)

In [None]:
from sklearn.metrics import normalized_mutual_info_score
# Mutual information between true classes and cluster assignments, normalized.
nmi = normalized_mutual_info_score(labels_true=y, labels_pred=cluster_labels)
print(f"Normalized Mutual Information: {nmi:.4f}")

### Step 13: Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Relabel every cluster with the most frequent true class among its members.
mapped_labels = np.zeros_like(cluster_labels)
for cluster_id in range(10):
    members = (cluster_labels == cluster_id)
    if not np.any(members):
        continue
    mapped_labels[members] = mode(y[members])[0]

# Rows are true classes, columns are the majority-class labels of the clusters.
conf_mat = confusion_matrix(y, mapped_labels)
class_ticks = range(10)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()