In [19]:
"""
Implementation for Improved Deep Embedded Clustering as described in paper:

        Xifeng Guo, Long Gao, Xinwang Liu, Jianping Yin. Improved Deep Embedded Clustering with Local Structure
        Preservation. IJCAI 2017.

Usage:
    Weights of Pretrained autoencoder for mnist are in './ae_weights/mnist_ae_weights.h5':
        python IDEC.py mnist --ae_weights ./ae_weights/mnist_ae_weights.h5
    for USPS and REUTERSIDF10K datasets
        python IDEC.py usps --update_interval 30 --ae_weights ./ae_weights/usps_ae_weights.h5
        python IDEC.py reutersidf10k --n_clusters 4 --update_interval 3 --ae_weights ./ae_weights/reutersidf10k_ae_weights.h5

Author:
    Xifeng Guo. 2017.4.30
"""

from time import time
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import csv
from keras.models import Model
from keras import callbacks
from keras.optimizers import SGD
from tensorflow.keras.utils import plot_model

from sklearn.cluster import KMeans
from sklearn import metrics

from dec_utils import cluster_acc, ClusteringLayer, autoencoder
print("🧩 Imported autoencoder from:", autoencoder.__code__.co_filename)

class IDEC(object):
    def __init__(self,
                 dims,
                 n_clusters=10,
                 alpha=1.0,
                 batch_size=256,
                 init='glorot_uniform'):

        super(IDEC, self).__init__()

        self.dims = dims
        self.input_dim = dims[0]
        self.n_stacks = len(self.dims) - 1

        self.n_clusters = n_clusters
        self.alpha = alpha
        self.batch_size = batch_size
        self.autoencoder, self.encoder = autoencoder(self.dims, init=init)

    def pretrain(self, x, y=None, optimizer='adam', epochs=200, batch_size=256, save_dir='results/temp'):
        print('...Pretraining...')
        self.autoencoder.compile(optimizer=optimizer, loss='mse')

        os.makedirs(save_dir, exist_ok=True)
        csv_logger = callbacks.CSVLogger(os.path.join(save_dir, 'pretrain_log.csv'))
        cb = [csv_logger]

        if y is not None:
            class PrintACC(callbacks.Callback):
                def __init__(self, x, y, encoder):
                    super(PrintACC, self).__init__()
                    self.x = x
                    self.y = y
                    self.encoder = encoder

                def on_epoch_end(self, epoch, logs=None):
                    if int(epochs / 10) != 0 and epoch % int(epochs / 10) != 0:
                        return
                    features = self.encoder.predict(self.x)
                    km = KMeans(n_clusters=len(np.unique(self.y)), n_init=20)
                    y_pred = km.fit_predict(features)
                    acc = cluster_acc(self.y, y_pred)
                    nmi = metrics.normalized_mutual_info_score(self.y, y_pred)
                    print(f'        |==>  acc: {acc:.4f},  nmi: {nmi:.4f}  <==|')

            cb.append(PrintACC(x, y, self.encoder))

        t0 = time()
        self.autoencoder.fit(x, x, batch_size=batch_size, epochs=epochs, callbacks=cb)
        print('Pretraining time: %ds' % round(time() - t0))

        save_path = os.path.join(save_dir, 'ae_weights.weights.h5')
        self.autoencoder.save_weights(save_path)
        print(f'✅ Pretrained AE weights saved to {save_path}')
    
    def initialize_model(self, ae_weights=None, gamma=0.1, optimizer='adam'):
        if ae_weights is not None:
            self.encoder.load_weights(ae_weights)
            print( 'Pretrained AE weights are loaded successfully.')
        else:
            print('ae_weights must be given. E.g.')
            print('python IDEC.py mnist --ae_weights weights.h5')
            exit()

        hidden = self.encoder.get_layer(name='encoder_%d' % (self.n_stacks - 1)).output
        clustering_layer = ClusteringLayer(self.n_clusters, name='clustering')(hidden)

        self.model = Model(inputs=self.autoencoder.input,
                   outputs=[clustering_layer, self.autoencoder.output])
        self.model.compile(loss={'clustering': 'kld', 'decoder_0': 'mse'},
                           loss_weights=[gamma, 1],
                           optimizer=optimizer)

    def load_weights(self, weights_path):  # load weights of IDEC model
        self.model.load_weights(weights_path)

    def extract_feature(self, x):  # extract features from before clustering layer
        encoder = Model(self.model.input, self.model.get_layer('encoder_%d' % (self.n_stacks - 1)).output)
        return encoder.predict(x)

    def predict_clusters(self, x):  # predict cluster labels using the output of clustering layer
        q, _ = self.model.predict(x, verbose=1)
        return q.argmax(1)

    @staticmethod
    def target_distribution(q):  # target distribution P which enhances the discrimination of soft label Q
        weight = q ** 2 / q.sum(0)
        return (weight.T / weight.sum(1)).T

    def clustering(self, x, y=None,
                   tol=1e-3,
                   update_interval=140,
                   maxiter=2e4,
                   save_dir='./results/idec'):

        print('Update interval', update_interval)
        save_interval = x.shape[0] / self.batch_size * 5  # 5 epochs
        print('Save interval', save_interval)

        # initialize cluster centers using k-means
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = y_pred
        self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])

        # logging file
        import csv, os
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(os.path.join(save_dir, 'idec_log.csv'), mode='w', newline='')
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L', 'Lc', 'Lr'])
        logwriter.writeheader()

        loss = [0, 0, 0]
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q, _ = self.model.predict(x, verbose=0)
                p = self.target_distribution(q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if y is not None:
                    acc = np.round(cluster_acc(y, y_pred), 5)
                    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
                    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, L=loss[0], Lc=loss[1], Lr=loss[2])
                    logwriter.writerow(logdict)
                    print('Iter', ite, ': Acc', acc, ', nmi', nmi, ', ari', ari, '; loss=', loss)

                # check stop criterion
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            if (index + 1) * self.batch_size > x.shape[0]:
                loss = self.model.train_on_batch(x=x[index * self.batch_size::],
                                                 y=[p[index * self.batch_size::], x[index * self.batch_size::]])
                index = 0
            else:
                loss = self.model.train_on_batch(x=x[index * self.batch_size:(index + 1) * self.batch_size],
                                                 y=[p[index * self.batch_size:(index + 1) * self.batch_size],
                                                    x[index * self.batch_size:(index + 1) * self.batch_size]])
                index += 1

            # save intermediate model
            if ite % save_interval == 0:
                # save IDEC model checkpoints
                self.model.save_weights(save_dir + '/IDEC_model_' + str(ite) + '.weights.h5')
                print('saving model to:', save_dir + '/IDEC_model_' + str(ite) + '.weights.h5')

            ite += 1

        # save the trained model
        logfile.close()
        self.model.save_weights(save_dir + '/IDEC_model_final.weights.h5')
        print('saving model to:', save_dir + '/IDEC_model_final.weights.h5')
        
        return y_pred


if __name__ == "__main__":
    # === Hiperparámetros ===
    class Args:
        dataset = 'mnist'
        n_clusters = 10
        batch_size = 256
        maxiter = int(2e4)
        gamma = 0.1
        update_interval = 140
        tol = 1e-4
        ae_weights = './ae_weights/ae_weights.weights.h5'
        save_dir = './results/idec'

    args = Args()
    print(args)

    # === Cargar dataset ===
    from datasets import load_mnist, load_usps, load_reuters
    if args.dataset == 'mnist':
        x, y = load_mnist()
    elif args.dataset == 'usps':
        x, y = load_usps('data/usps')
    elif args.dataset == 'reutersidf10k':
        x, y = load_reuters('data/reuters')

    # === Crear modelo IDEC ===
    idec = IDEC(dims=[x.shape[-1], 500, 500, 2000, 10], n_clusters=args.n_clusters)

    # === Pretraining (solo si no existen pesos) ===
    if args.ae_weights is None or not os.path.exists(args.ae_weights):
        print("⚙️  Pretraining autoencoder...")
        idec.pretrain(x=x, y=y, optimizer='adam', epochs=300, batch_size=args.batch_size ,save_dir=os.path.dirname(args.ae_weights))
        args.ae_weights = args.ae_weights  # ya se guarda en ese path
    else:
        print("📂 Loading pretrained weights...")
        idec.autoencoder.load_weights(args.ae_weights)

    # === Inicializar modelo clustering ===
    idec.initialize_model(ae_weights=args.ae_weights, gamma=args.gamma, optimizer='adam')

    # === Clustering ===
    idec.model.summary()
    print("🚀 Starting clustering training...")
    t0 = time()
    y_pred = idec.clustering(x, y=y, tol=args.tol, maxiter=args.maxiter,
                             update_interval=args.update_interval, save_dir=args.save_dir)
    print(f"✅ Clustering finished in {round(time() - t0)}s")
    print("📊 Final ACC:", cluster_acc(y, y_pred))
    print("📊 Final NMI:", metrics.normalized_mutual_info_score(y, y_pred))


🧩 Imported autoencoder from: /home/USERS/didac.reyes/TFG/Python IDEC/dec_utils.py
<__main__.Args object at 0x7fb01a7c15b0>
MNIST samples (70000, 784)
⚙️  Pretraining autoencoder...
...Pretraining...
Epoch 1/300
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 849us/steposs: 0.0532  
        |==>  acc: 0.5578,  nmi: 0.5021  <==|
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - loss: 0.0531
Epoch 2/300
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0224 
Epoch 3/300
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0192 
Epoch 4/300
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0175 
Epoch 5/300
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0164 
Epoch 6/300
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0157 
Epoch 7/300
[1m274/274[0m [32m━━━━━━━━━━━━

🚀 Starting clustering training...
Update interval 140
Save interval 1367.1875
Initializing cluster centers with k-means.
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 737us/step
Iter 0 : Acc 0.83296 , nmi 0.76768 , ari 0.72254 ; loss= [0 0 0]
saving model to: ./results/idec/IDEC_model_0.weights.h5
Iter 140 : Acc 0.84956 , nmi 0.79209 , ari 0.75507 ; loss= [0.01221 0.04499 0.00771]
Iter 280 : Acc 0.86243 , nmi 0.81412 , ari 0.78146 ; loss= [0.01513 0.06548 0.00858]
Iter 420 : Acc 0.87207 , nmi 0.82969 , ari 0.79912 ; loss= [0.01901 0.09341 0.00968]
Iter 560 : Acc 0.8772 , nmi 0.84085 , ari 0.81004 ; loss= [0.02325 0.12588 0.01066]
Iter 700 : Acc 0.87983 , nmi 0.84848 , ari 0.8178 ; loss= [0.02664 0.15145 0.0115 ]
Iter 840 : Acc 0.88046 , nmi 0.85263 , ari 0.82128 ; loss= [0.02894 0.16896 0.01204]
Iter 980 : Acc 0.8814 , nmi 0.8547 , ari 0.8231 ; loss= [0.03042 0.17997 0.01242]
Iter 1120 : Acc 0.88097 , nmi 0.85581 , ari 0.82362 ; loss= [0.03138 0.18671 0.01271]
Ite

In [17]:
print(metrics.normalized_mutual_info_score(y,y_pred))

0.8513706523041479
