In [None]:
# Mount Google Drive inside the Colab runtime; every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Folder on the mounted drive that holds the pre-computed embedding tables.
DATA_PATH = '/content/drive/MyDrive/Univerui/0 MAGISTRAS/tiriamasis/embeddings/'

# Read one CSV per embedding model, in the order bert -> labse -> w2v.
bert_data, labse_data, w2v_data = (
    pd.read_csv(f'{DATA_PATH}{stem}_embeddings.csv')
    for stem in ('bert', 'labse', 'w2v')
)

In [None]:
class ClusteringLayer(tf.keras.layers.Layer):
  """Soft cluster-assignment layer based on a Student's t kernel.

  For every input row the layer computes the squared Euclidean distance to
  each cluster centre and turns those distances into a distribution over
  clusters, so each output row sums to 1.

  Args:
    n_clusters: number of cluster centres held by the layer.
    encoding_dim: size of each input row and of each cluster centre.
    alpha: degrees of freedom of the Student's t kernel.
    **kwargs: forwarded to `tf.keras.layers.Layer` (e.g. `name`).
  """

  def __init__(self, n_clusters, encoding_dim, alpha, **kwargs):
    super().__init__(**kwargs)
    self.n_clusters = n_clusters
    self.encoding_dim = encoding_dim
    self.alpha = alpha
    # Create the centres with add_weight rather than a bare tf.Variable so
    # that Keras registers them as a trainable weight of this layer (they are
    # then optimised and included when the model's weights are saved).
    self.clusters = self.add_weight(
        name='clusters',
        shape=(n_clusters, encoding_dim),
        initializer='zeros',
        trainable=True)

  def call(self, inputs, **kwargs):
    """Return the (batch, n_clusters) matrix of soft assignments for `inputs`."""
    # (batch, 1, dim) - (n_clusters, dim) broadcasts to (batch, n_clusters, dim).
    diff = tf.expand_dims(inputs, axis=1) - self.clusters
    sq_dist = tf.reduce_sum(tf.square(diff), axis=2)
    q = 1. / (1. + sq_dist / self.alpha)
    q **= (self.alpha + 1.) / 2.
    # Normalise each row; keepdims avoids the transpose round-trip.
    return q / tf.reduce_sum(q, axis=1, keepdims=True)

  def get_config(self):
    """Return the constructor arguments so a saved model can rebuild the layer."""
    config = super().get_config()
    config.update({
        'n_clusters': self.n_clusters,
        'encoding_dim': self.encoding_dim,
        'alpha': self.alpha,
    })
    return config

In [None]:
def autoencoder_model(input_dim, latent_dim):
  """Build a symmetric dense autoencoder and the encoder that shares its layers.

  The encoder narrows the input through hidden layers of width input_dim/2,
  input_dim/4 and input_dim/8 down to `latent_dim`; the decoder mirrors those
  widths back up to `input_dim`. Every layer uses a relu activation.

  Args:
    input_dim: number of input features.
    latent_dim: width of the bottleneck layer.

  Returns:
    (autoencoder, encoder): two Keras models built on the same input layer.
  """
  # NOTE(review): the reconstruction layer is relu, so it cannot emit negative
  # values - worth confirming this is intended if the inputs are standardised.
  dense = tf.keras.layers.Dense
  hidden_widths = [int(input_dim / divisor) for divisor in (2., 4., 8.)]

  inputs = tf.keras.layers.Input(shape=(input_dim,))

  # Encoder stack.
  x = inputs
  for width in hidden_widths:
    x = dense(width, activation='relu')(x)
  latent = dense(latent_dim, activation='relu')(x)

  # Decoder stack: same widths, widening again.
  x = latent
  for width in reversed(hidden_widths):
    x = dense(width, activation='relu')(x)
  reconstruction = dense(input_dim, activation='relu')(x)

  autoencoder = tf.keras.models.Model(inputs=inputs, outputs=reconstruction)
  encoder = tf.keras.models.Model(inputs=inputs, outputs=latent)

  return autoencoder, encoder

def target_distribution(q):
  """Sharpen soft assignments `q` into the auxiliary target distribution.

  Each entry is squared and divided by its column total, then every row is
  rescaled so that it sums to 1.
  """
  column_total = tf.reduce_sum(q, axis=0)
  weight = q ** 2 / column_total
  row_total = tf.reduce_sum(weight, axis=1, keepdims=True)
  return weight / row_total

In [None]:
def DEC(data, num_clusters, name, encoding_dim=16, batch_size=256, maxiter=10000,
        update_interval=140, threshold=0.001, alpha=1.):
  """Cluster `data` with an autoencoder followed by a trainable clustering layer.

  Steps: standardise the data, pre-train an autoencoder, run KMeans on the
  encoded data to initialise the cluster centres, then fine-tune the encoder
  together with the clustering layer against a periodically refreshed target
  distribution using a KL-divergence loss. Both models are saved to Drive.

  Args:
    data: 2-D table of features (one row per sample).
    num_clusters: number of clusters.
    name: prefix used for the saved model files.
    encoding_dim: width of the autoencoder bottleneck.
    batch_size: mini-batch size for pre-training and fine-tuning.
    maxiter: maximum number of fine-tuning steps.
    update_interval: steps between refreshes of the target distribution.
    threshold: stop when the fraction of samples whose label changed between
      two refreshes falls below this value.
    alpha: degrees of freedom passed to the clustering layer.

  Returns:
    (y_pred, y_proba): hard labels and soft assignments from the most recent
    refresh of the target distribution.
  """
  scaler = StandardScaler()
  x_data = scaler.fit_transform(data)

  n_samples, input_dim = x_data.shape

  # AutoEncoder
  autoencoder, encoder = autoencoder_model(input_dim, encoding_dim)
  autoencoder.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
  autoencoder.fit(x_data, x_data, epochs=50, batch_size=batch_size, verbose=0)

  # Clustering centers
  cluster_model = KMeans(n_clusters=num_clusters, n_init='auto')
  y_pred = cluster_model.fit_predict(encoder.predict(x_data))
  cluster_centers = cluster_model.cluster_centers_

  # Clustering layers
  clustering_layer = ClusteringLayer(num_clusters, encoding_dim, alpha, name='clustering')(encoder.output)
  model = tf.keras.models.Model(inputs=encoder.input, outputs=clustering_layer)
  model.get_layer(name='clustering').clusters.assign(cluster_centers)

  model.compile(optimizer=tf.keras.optimizers.Adam(), loss='kld')

  index_array = np.arange(n_samples)
  # Start offset of the next mini-batch. It is tracked separately from the
  # iteration counter and wraps to 0 at the end of the data; slicing with
  # `i * batch_size` ran past the data after one pass, producing empty
  # batches and a NaN loss.
  batch_start = 0

  # Training
  for i in range(maxiter):
    if i % update_interval == 0:
      q = model.predict(x_data, verbose=0)
      p = target_distribution(q)
      y_pred = q.argmax(axis=1)
      y_proba = q

      if i > 0 and np.sum(y_pred != y_pred_last) / y_pred.shape[0] < threshold:
        break
      y_pred_last = y_pred

    batch_end = min(batch_start + batch_size, n_samples)
    idx = index_array[batch_start:batch_end]
    loss = model.train_on_batch(tf.gather(x_data, idx), tf.gather(p, idx))
    batch_start = batch_end if batch_end < n_samples else 0

    if i % update_interval == 0:
      print(f"Iteration: {i}: loss = {loss}")

  encoder.save(f'/content/drive/MyDrive/Univerui/0 MAGISTRAS/tiriamasis/clustering_outputs/DEC/{name}_encoder.h5')
  model.save(f'/content/drive/MyDrive/Univerui/0 MAGISTRAS/tiriamasis/clustering_outputs/DEC/{name}_clustering_model.h5')

  return y_pred, y_proba

In [None]:
# Cluster the BERT embeddings into 20 groups using mini-batches of 32.
bert_pred, bert_proba = DEC(data=bert_data, num_clusters=20, name='bert', batch_size=32)

[1m876/876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
Iteration: 0: loss = 0.1215776577591896
Iteration: 140: loss = 0.06486725062131882
Iteration: 280: loss = 0.1424783617258072
Iteration: 420: loss = 0.19619962573051453
Iteration: 560: loss = 0.2278680056333542
Iteration: 700: loss = 0.25229695439338684
Iteration: 840: loss = 0.2674711048603058
Iteration: 980: loss = nan




In [None]:
class ClusteringLayer(tf.keras.layers.Layer):
  """Soft cluster-assignment layer based on a Student's t kernel.

  For every input row the layer computes the squared Euclidean distance to
  each cluster centre and turns those distances into a distribution over
  clusters, so each output row sums to 1.

  Args:
    n_clusters: number of cluster centres held by the layer.
    encoding_dim: size of each input row and of each cluster centre.
    alpha: degrees of freedom of the Student's t kernel.
    **kwargs: forwarded to `tf.keras.layers.Layer` (e.g. `name`).
  """

  def __init__(self, n_clusters, encoding_dim, alpha, **kwargs):
    super().__init__(**kwargs)
    self.n_clusters = n_clusters
    self.encoding_dim = encoding_dim
    self.alpha = alpha
    # Create the centres with add_weight rather than a bare tf.Variable so
    # that Keras registers them as a trainable weight of this layer (they are
    # then optimised and included when the model's weights are saved).
    self.clusters = self.add_weight(
        name='clusters',
        shape=(n_clusters, encoding_dim),
        initializer='zeros',
        trainable=True)

  def call(self, inputs, **kwargs):
    """Return the (batch, n_clusters) matrix of soft assignments for `inputs`."""
    # (batch, 1, dim) - (n_clusters, dim) broadcasts to (batch, n_clusters, dim).
    diff = tf.expand_dims(inputs, axis=1) - self.clusters
    sq_dist = tf.reduce_sum(tf.square(diff), axis=2)
    q = 1. / (1. + sq_dist / self.alpha)
    q **= (self.alpha + 1.) / 2.
    # Normalise each row; keepdims avoids the transpose round-trip.
    return q / tf.reduce_sum(q, axis=1, keepdims=True)

  def get_config(self):
    """Return the constructor arguments so a saved model can rebuild the layer."""
    config = super().get_config()
    config.update({
        'n_clusters': self.n_clusters,
        'encoding_dim': self.encoding_dim,
        'alpha': self.alpha,
    })
    return config

def autoencoder_model(input_dim, latent_dim):
  """Build a symmetric dense autoencoder and the encoder that shares its layers.

  The encoder narrows the input through hidden layers of width input_dim/2,
  input_dim/4 and input_dim/8 down to `latent_dim`; the decoder mirrors those
  widths back up to `input_dim`. Every layer uses a relu activation.

  Args:
    input_dim: number of input features.
    latent_dim: width of the bottleneck layer.

  Returns:
    (autoencoder, encoder): two Keras models built on the same input layer.
  """
  # NOTE(review): the reconstruction layer is relu, so it cannot emit negative
  # values - worth confirming this is intended if the inputs are standardised.
  dense = tf.keras.layers.Dense
  hidden_widths = [int(input_dim / divisor) for divisor in (2., 4., 8.)]

  inputs = tf.keras.layers.Input(shape=(input_dim,))

  # Encoder stack.
  x = inputs
  for width in hidden_widths:
    x = dense(width, activation='relu')(x)
  latent = dense(latent_dim, activation='relu')(x)

  # Decoder stack: same widths, widening again.
  x = latent
  for width in reversed(hidden_widths):
    x = dense(width, activation='relu')(x)
  reconstruction = dense(input_dim, activation='relu')(x)

  autoencoder = tf.keras.models.Model(inputs=inputs, outputs=reconstruction)
  encoder = tf.keras.models.Model(inputs=inputs, outputs=latent)

  return autoencoder, encoder

def target_distribution(q):
  """Sharpen soft assignments `q` into the auxiliary target distribution.

  Each entry is squared and divided by its column total, then every row is
  rescaled so that it sums to 1. The Keras backend epsilon is added to both
  denominators to guard against division by zero.
  """
  eps = tf.keras.backend.epsilon()
  column_total = tf.reduce_sum(q, axis=0) + eps
  weight = q ** 2 / column_total
  row_total = tf.reduce_sum(weight, axis=1, keepdims=True) + eps
  return weight / row_total

def DEC(data, num_clusters, name, encoding_dim=16, learning_rate=0.001, batch_size=256, maxiter=10000,
        update_interval=140, threshold=0.001, alpha=1.):
  """Cluster `data` with an autoencoder followed by a trainable clustering layer.

  Steps: standardise the data, pre-train an autoencoder, run KMeans on the
  encoded data to initialise the cluster centres, then fine-tune the encoder
  together with the clustering layer against a periodically refreshed target
  distribution using a KL-divergence loss. Both models are saved to Drive.

  Args:
    data: 2-D table of features (one row per sample).
    num_clusters: number of clusters.
    name: prefix used for the saved model files.
    encoding_dim: width of the autoencoder bottleneck.
    learning_rate: Adam learning rate for the fine-tuning stage.
    batch_size: mini-batch size for pre-training and fine-tuning.
    maxiter: maximum number of fine-tuning steps.
    update_interval: steps between refreshes of the target distribution.
    threshold: stop when the fraction of samples whose label changed between
      two refreshes falls below this value.
    alpha: degrees of freedom passed to the clustering layer.

  Returns:
    (y_pred, y_proba): hard labels and soft assignments from the most recent
    refresh of the target distribution.
  """
  scaler = StandardScaler()
  x_data = scaler.fit_transform(data)

  n_samples, input_dim = x_data.shape

  # AutoEncoder
  autoencoder, encoder = autoencoder_model(input_dim, encoding_dim)
  autoencoder.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
  autoencoder.fit(x_data, x_data, epochs=50, batch_size=batch_size, verbose=0)

  # Clustering centers
  cluster_model = KMeans(n_clusters=num_clusters, n_init='auto')
  y_pred = cluster_model.fit_predict(encoder.predict(x_data, verbose=0))
  cluster_centers = cluster_model.cluster_centers_

  # Clustering layers
  clustering_layer = ClusteringLayer(num_clusters, encoding_dim, alpha, name='clustering')(encoder.output)
  model = tf.keras.models.Model(inputs=encoder.input, outputs=clustering_layer)
  model.get_layer(name='clustering').clusters.assign(cluster_centers)

  # Gradients are clipped element-wise to [-1, 1] during fine-tuning.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                   clipvalue=1.),
                loss='kld')

  index_array = np.arange(n_samples)
  # Start offset of the next mini-batch. It is tracked separately from the
  # iteration counter and wraps to 0 at the end of the data; slicing with
  # `i * batch_size` ran past the data after one pass, producing empty
  # batches and a NaN loss.
  batch_start = 0

  # Training
  for i in range(maxiter):
    if i % update_interval == 0:
      q = model.predict(x_data, verbose=0)
      p = target_distribution(q)
      y_pred = q.argmax(axis=1)
      y_proba = q

      if i > 0 and np.sum(y_pred != y_pred_last) / y_pred.shape[0] < threshold:
        break
      y_pred_last = y_pred

    batch_end = min(batch_start + batch_size, n_samples)
    idx = index_array[batch_start:batch_end]
    loss = model.train_on_batch(tf.gather(x_data, idx), tf.gather(p, idx))
    batch_start = batch_end if batch_end < n_samples else 0

    if i % update_interval == 0:
      print(f"Iteration: {i}: loss = {loss}")

  encoder.save(f'/content/drive/MyDrive/Univerui/0 MAGISTRAS/tiriamasis/clustering_outputs/DEC/{name}_encoder.h5')
  model.save(f'/content/drive/MyDrive/Univerui/0 MAGISTRAS/tiriamasis/clustering_outputs/DEC/{name}_clustering_model.h5')

  return y_pred, y_proba

In [None]:
# Re-run on the BERT embeddings with a wider bottleneck and a smaller fine-tuning learning rate.
bert_pred, bert_proba = DEC(
    data=bert_data,
    num_clusters=20,
    name='bert',
    encoding_dim=64,
    learning_rate=0.0001,
    batch_size=32,
)

Iteration: 0: loss = 0.057842668145895004
Iteration: 140: loss = 0.03253987058997154
Iteration: 280: loss = 0.04777824506163597
Iteration: 420: loss = 0.071541927754879
Iteration: 560: loss = 0.09459365159273148
Iteration: 700: loss = 0.11490293592214584
Iteration: 840: loss = 0.1309777945280075
Iteration: 980: loss = nan




In [None]:
# Show the hard cluster labels (argmax of the soft assignments) of the first three samples.
bert_pred[:3]

array([ 1, 11,  2])

In [None]:
# Show the soft assignment over the 20 clusters for the sample at index 2.
bert_proba[2]

array([0.03422335, 0.02310062, 0.27634946, 0.03301444, 0.03332696,
       0.03662706, 0.0267931 , 0.02875627, 0.02862794, 0.04006774,
       0.01736554, 0.09502963, 0.02680646, 0.06378695, 0.06004831,
       0.0657844 , 0.03439815, 0.03058293, 0.0234688 , 0.02184187],
      dtype=float32)