<a href="https://colab.research.google.com/github/FabrizioC95/SelfScheduled_Neural_Clustering/blob/main/SelfScheduled_Neural_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This file contains an example of running the self-scheduled network.

*   Libraries are installed
*   Dependencies are defined
*   Example of using the network
*   Code so you can run on your own data

# Install and run libraries

In [2]:
#----- Install the following libraries
!pip install torch --quiet
#!pip install sklearn --quiet

#-- General Libs
import pandas as pd
import numpy as np
import random

#-- Libraries for Neural Network
import torch
import torch.nn as nn
import torch.optim as optim

#-- Shallow Clustering Methods (for pre-training)
from sklearn.cluster import KMeans

#-- Datasets and Dataloder
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# ------------------------------------------------------------------------------
# Dependencies: **Simply run this code block (click the ▶ button next to 'cell hidden')**

1.   Neural Architecture
2.   Utils (Dataloader, Pytorch seed generator, Inference function)
3.   The various trainers that are needed
4.   Function orchestrating everything

# ---

In [24]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-----------------------------Architecture--------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

#-----------------------------
#----- Single Autoencoder
class AutoEncoder(nn.Module):
    def __init__(self, data_dim, hidden_dim, batch_normalize=False, dropout=[False, 0.2]):
        super(AutoEncoder, self).__init__()

        # Store parameters
        self.data_dim = data_dim
        self.hidden_dim = hidden_dim
        self.batch_normalize = batch_normalize
        self.dropout = dropout[0]  #True/False
        self.dropout_p = dropout[1]  #Dropout probability

        #-- Encoder Layers
        encoder_layers = []
        input_size = data_dim

        for idx, h_dim in enumerate(hidden_dim[:-1]):
            layer = nn.Linear(input_size, h_dim)
            nn.init.xavier_uniform_(layer.weight)
            encoder_layers.append(layer)

            if batch_normalize:
                encoder_layers.append(nn.BatchNorm1d(h_dim))

            encoder_layers.append(nn.ELU())

            #-- Dropout
            if self.dropout:
                encoder_layers.append(nn.Dropout(p=self.dropout_p))

            input_size = h_dim

        #--- Embedding Layer
        self.embedding_layer = nn.Linear(hidden_dim[-2], hidden_dim[-1])
        nn.init.xavier_uniform_(self.embedding_layer.weight)

        #--- Decoder Layers
        decoder_layers = []
        input_size = hidden_dim[-1]
        for idx, h_dim in enumerate(reversed(hidden_dim[:-1])):
            layer = nn.Linear(input_size, h_dim)
            nn.init.xavier_uniform_(layer.weight)
            decoder_layers.append(layer)

            if batch_normalize:
                decoder_layers.append(nn.BatchNorm1d(h_dim))

            decoder_layers.append(nn.ELU())

            #-- Dropout
            if self.dropout:
                decoder_layers.append(nn.Dropout(p=self.dropout_p))

            input_size = h_dim

        #--- Final output layer
        self.final_layer = nn.Sequential(
            nn.Linear(input_size, data_dim),
            nn.Sigmoid()
        )
        nn.init.xavier_uniform_(self.final_layer[0].weight)

        #-- Assign encoder and decoder as sequential modules
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        encoded = self.encoder(x)
        embedding = self.embedding_layer(encoded)
        decoded = self.decoder(embedding)
        output = self.final_layer(decoded)

        return embedding, output

#-----------------------------
#----- Stack of autoencoders
class KAutoEncoders(nn.Module):
    def __init__(self, k, data_dim, hidden_dim, batch_normalize=False, dropout=[False, 0.0]):
        super(KAutoEncoders, self).__init__()
        self.k = k
        self.data_dim = data_dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout

        # Initialize k autoencoders with feature selection
        self.autoencoders = nn.ModuleList(
            [AutoEncoder(data_dim,
                         hidden_dim,
                         batch_normalize,
                         dropout=self.dropout)
             for _ in range(k)])

    #--- Forward pass
    def forward(self, x):
        reconstructions = []
        embeddings = []

        for autoencoder in self.autoencoders:
            embedding, reconstruction = autoencoder(x)
            embeddings.append(embedding)
            reconstructions.append(reconstruction)

        reconstructions = torch.stack(reconstructions, dim=1)  # Shape: (batch_size, k, data_dim)
        embeddings = torch.stack(embeddings, dim=1)  # Shape: (batch_size, k, embedding_dim)

        return embeddings, reconstructions


#-----------------------------
#----- Clustering Network
class MixtureAssignmentNetwork(nn.Module):
    def __init__(self, k, data_dim, cluster_hidden_sizes, batch_normalize=False):
        super(MixtureAssignmentNetwork, self).__init__()

        #-- Define needed layers
        layers = []
        in_dim = data_dim

        #-- Hidden Layers
        for hidden_size in cluster_hidden_sizes:
            linear_layer = nn.Linear(in_dim, hidden_size)
            nn.init.xavier_uniform_(linear_layer.weight)
            layers.append(linear_layer)

            if batch_normalize:
                layers.append(nn.BatchNorm1d(hidden_size))

            layers.append(nn.ELU())
            in_dim = hidden_size

        #-- Output layer
        output_layer = nn.Linear(in_dim, k)
        nn.init.xavier_uniform_(output_layer.weight)
        layers.append(output_layer)
        layers.append(nn.Softmax(dim=1))

        #-- Make sequential
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

#-----------------------------
#----- Wrapper to call it all
class ClusteringAutoEncoder(nn.Module):
    def __init__(self, k, data_dim, hidden_dim, cluster_hidden_sizes, batch_normalize=False, cluster_batch_normalize=False, dropout=[False, 0.0]):
        super(ClusteringAutoEncoder, self).__init__()

        #--- K AutoEncoders
        self.k_autoencoders = KAutoEncoders(
            k=k,
            data_dim=data_dim,
            hidden_dim=hidden_dim,
            batch_normalize=batch_normalize,
            dropout=dropout  # Pass dropout as [bool, float]
        )

        #--- Classification Network
        self.cluster_net = MixtureAssignmentNetwork(
            k=k,
            data_dim=data_dim,
            cluster_hidden_sizes=cluster_hidden_sizes,
            batch_normalize=cluster_batch_normalize
        )

    def forward(self, x):
        #--- Pass through K autoencoders
        embeddings, reconstructions = self.k_autoencoders(x)

        #--- Forward pass through classification network
        cluster_probs = self.cluster_net(x)

        return embeddings, reconstructions, cluster_probs
        #Shape: (batch_size, k, embedding_dim)
        #Shape: (batch_size, k, data_dim)
        #Shape: (batch_size, num_clusters)



#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#---------------------------------Utils-----------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
def load_data(df, categorical_cols=None, numerical_cols=None, k=None, batch_size=None, generator=None):
  #-------------------------------#
  #------ Initialize encoders ---#
  #------------------------------#
  scaler = MinMaxScaler()
  categorical_cols = categorical_cols if categorical_cols is not None else []
  numerical_cols = numerical_cols if numerical_cols is not None else []

  #--------------------------
  #--- General daloader -----
  #--------------------------
  class NormalDataloader(Dataset):
    def __init__(self, dataframe):
      self.X = torch.tensor(dataframe.to_numpy(), dtype=torch.float32)

    def __len__(self):
      return len(self.X)

    def __getitem__(self, idx):
      return self.X[idx], idx
  #----------------------------
  #--------- Warning Messages
  #---------

  #- Check that the columns passed are list types
  if not isinstance(categorical_cols, list) or not isinstance(numerical_cols, list):
    raise ValueError("Both 'categorical_cols' and 'numerical_cols' must be a 'list' object type.")

  #- Check that at least one of the columns is provided
  if not categorical_cols and not numerical_cols:
    raise ValueError("You need to define 'categorical_cols' and/or 'numerical_cols' ")

  #- Check that the columns exist
  missing_cols = [col for col in (numerical_cols + categorical_cols) if col not in df.columns]
  if missing_cols:
    raise ValueError(f"The following columns not found in the dataframe: {missing_cols}")

  #- Check that "k" is provided
  if not isinstance(k, int) or k <= 0:
    raise ValueError("Number of clusters ('k = __') must be provided. E.g., how many should there be?")

  #- Check that there are no missing values
  if df.isnull().values.any():
    raise ValueError("Data contains missing values. Please handle NaNs before using this function")


  #----------------------------
  #--------- Encoding categorical variables
  #---------
  if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, dtype=float)

  #----------------------------
  #--------- Encoding numerical variables
  #---------
  if numerical_cols:
    df[numerical_cols] = df[numerical_cols].astype(float)
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


  #----------------------------
  #--------- Extras
  shape = df.shape[1]
  dataset = NormalDataloader(df)
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=generator)

  return dataset, dataloader, k, shape, df






#---------------------------------------------
#------------- Seed Generator ----------------
#--------------------------------------------
#-- We require a seed generator for numpy operations and pytorch operations
def reset_seed(seed):
  torch.manual_seed(seed) #- For Pytorch generator
  np.random.seed(seed) #- For custom operators, according to PyTorch docs
  random.seed(seed) #- For python Operations

  #--- CUDA Seed
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

  #--- Fixing dataloader shuffle
  generator = torch.Generator()
  generator.manual_seed(seed)

  return generator


#---------------------------------------------
#------------- Inference ----------------
#--------------------------------------------
def run_inference(model, dataloader, dataset, device, k):
  model.eval()
  predictions = []


 #-- Training
  with torch.no_grad():
    for batch, indices in dataloader:
      batch = batch.to(device)

      #- Forward pass
      _, _, probs = model(batch)

      #- Cluster assignment
      best_autoencoder = torch.argmax(probs, dim=1).cpu().numpy()
      indices = indices.cpu().numpy()

      #- Store results
      for idx, cluster in zip(indices, best_autoencoder):
        predictions.append((idx, cluster))

  #-- Save prediction as dataframe
  results = pd.DataFrame(predictions, columns=['Index', 'Cluster'])
  return results



#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------Trainers----------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-----------------------------------------
#----- Pre-training Classification Network
def shallow_pt_first(k, input_features, targets=None, model='kmeans', generator=None, random_seed=None):
    #-- Convert to NumPy
    if isinstance(input_features, pd.DataFrame):
        input_features = input_features.to_numpy()

    if isinstance(input_features, torch.Tensor):
        input_features = input_features.numpy()

    #--- Use pytorch seed generator if provided
    #--- If not this uses the default RNG generator

    #- Raise error if both arguments are passed
    if generator is not None and random_seed is not None:
      raise ValueError("Both 'generator' and 'random_seed' arguments cannot be provided at the same time")

    if generator is not None:
      random_state = generator.initial_seed()

    elif random_seed is not None:
      random_state = random_seed

    else:
      #- Default to regular, unfixed, RNG if neither arguments are provided
      random_state = random_seed

    #-- Conduct K-means clustering
    if model == 'kmeans':
        p_kmeans = KMeans(n_clusters=k, random_state=random_state)
        pseudo_labels = p_kmeans.fit_predict(input_features)

    #-- Create aligned dataset with pseudo-labels
    feature_columns = [f"feature_{i}" for i in range(input_features.shape[1])]
    aligned_df = pd.DataFrame(input_features, columns=feature_columns)
    aligned_df['pseudo_labels'] = pseudo_labels

    return aligned_df

#-----------------------------------------
#----- Pre-training Classification Network
#- Uses the pseudo-labels to train the clustering network through a classification task.
def pretrain_mixture_assignment_network(k, pseudo_data, data_dim, cluster_hidden_sizes,
                                        batch_normalize=True, pt_num_epochs=10,
                                        pt_batch_size=64, pre_lr=0.001, weight_decay=.001, generator=None, device=None):
  class MixtureDataLoader(Dataset):
    def __init__(self, dataframe):
        self.features = dataframe.drop(columns=['pseudo_labels']).values
        self.pseudo_labels = dataframe['pseudo_labels'].values

        self.X = torch.tensor(self.features, dtype=torch.float32)
        self.y = torch.tensor(self.pseudo_labels, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return the data point, index, and pseudo-label for the given index."""
        return self.X[idx], idx, self.y[idx]

    #--- Initialize classification network
  mixture_assignment_net = MixtureAssignmentNetwork(
      k=k,
      data_dim=data_dim,
      cluster_hidden_sizes=cluster_hidden_sizes,
      batch_normalize=batch_normalize
  ).to(device)

  #--- Initialize dataloader
  dataset = MixtureDataLoader(pseudo_data)
  dataloader = DataLoader(dataset, batch_size=pt_batch_size, shuffle=True, generator=generator)

  #--- Define loss
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = optim.Adam(mixture_assignment_net.parameters(), lr=pre_lr, weight_decay=weight_decay)

  #--- Training loop
  mixture_assignment_net.train()

  for epoch in range(pt_num_epochs):

    for batch_data, _, pseudo_labels in dataloader:
      optimizer.zero_grad()
      batch_data, pseudo_labels = batch_data.to(device), pseudo_labels.to(device)

      #--- Forward pass
      outputs = mixture_assignment_net(batch_data)

      #--- Loss
      loss = criterion(outputs, pseudo_labels)

      #--- Backprop
      loss.backward()
      optimizer.step()

  return mixture_assignment_net


#-----------------------------------------
#----- General Trainer
def samplewise_trainer(model, dataloader, dataset, optimizer, num_epochs, alpha, beta, k, device, schedule='batch'):
  model.train()

  for epoch in range(num_epochs):
    for batch, indices in dataloader:
      batch=batch.to(device)
      optimizer.zero_grad()

      #-- Forward pass
      embeddings, reconstructions, probs = model(batch)

      #-- Weighted Reconstruction Loss
      batch_expanded = batch.unsqueeze(1)
      diff = batch_expanded - reconstructions
      l2_norm = torch.sum(diff ** 2, dim=-1)
      weighted_errors = probs * l2_norm
      ae_losses = torch.sum(weighted_errors, dim=1)

      #-- Sample-Wise Entropy
      sample_entropy = -torch.sum(probs * torch.log(probs + 1e-8), dim=1)

      #-- Batch-Wise Entropy
      avg_probs = probs.mean(dim=0)
      batch_entropy = -torch.sum(avg_probs * torch.log(avg_probs + 1e-8))

      #-- Loss Function
      total_loss = (torch.sum(ae_losses + alpha * sample_entropy) / batch.size(0)) - beta * batch_entropy

      #-- Backprop
      total_loss.backward()
      optimizer.step()

      #-- Batch Scheduled Training
      if schedule == 'batch':
        with torch.no_grad():
          alpha = 1
          samplewise_term = (torch.sum(ae_losses + alpha * sample_entropy) / batch.size(0))
          batch_entr_magnitude = -torch.sum(avg_probs * torch.log(avg_probs + 1e-8))
          beta = samplewise_term / (batch_entr_magnitude + 1e-8)

    #-- Epoch Scheduled Training
    if schedule == 'epoch':
      with torch.no_grad():
        alpha = 1
        samplewise_term = (torch.sum(ae_losses + alpha * sample_entropy) / batch.size(0))
        batch_entr_magnitude = -torch.sum(avg_probs * torch.log(avg_probs + 1e-8))
        beta = samplewise_term / (batch_entr_magnitude + 1e-8)

  return model



#-----------------------------------------
#----- General Training Function
def train_model(data,
                k,
                categorical_cols,
                numerical_cols,
                batch_size,
                hidden_dim=[128,64,32],
                cluster_hidden_sizes=[64,32],
                num_epochs=100,
                pt_num_epochs=10,
                lr =.001,
                seed=10):

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: Using {device} for training")

  #--- Seed generator
  generator = reset_seed(seed)

  #--- Initialize dataloader
  print("Initializing dataloader..")
  dataset, dataloader, k, data_dim, df = load_data(
      df=data,
      categorical_cols=categorical_cols,
      numerical_cols=numerical_cols,
      k=k,
      batch_size=batch_size)

  #--- Pre-training
  pseudo_data = shallow_pt_first(k=k, input_features=df, model='kmeans', generator=generator)

  pretrained_man = pretrain_mixture_assignment_network(
      k=k,
      pseudo_data=pseudo_data,
      data_dim=data_dim,
      cluster_hidden_sizes=cluster_hidden_sizes,
      batch_normalize=True,
      pt_num_epochs=pt_num_epochs,
      pt_batch_size=batch_size,
      pre_lr=0.001,
      weight_decay=0.001,
      generator=generator,
      device=device)

  #--- Initialize Model
  model = ClusteringAutoEncoder(
      k=k,
      data_dim=data_dim,
      hidden_dim=hidden_dim,
      cluster_hidden_sizes=cluster_hidden_sizes,
      batch_normalize=True,
      cluster_batch_normalize=True,
      dropout=[False, 0.0]).to(device)


  #--- Load Pre-Trained Mixture Assignment Network Weights
  model.cluster_net.load_state_dict(pretrained_man.state_dict())

  #--- Model optimizer
  optimizer=torch.optim.Adam(model.parameters(), lr=lr)

  #--- Train model using appropriate training schedule
  print("Training Network..")
  trained_model = samplewise_trainer(
          model=model,
          dataloader=dataloader,
          dataset=dataset,
          optimizer=optimizer,
          num_epochs=num_epochs,
          alpha=5,
          beta=5,
          k=k,
          device=device,
          schedule="batch")

  #--- Run inference
  print("Running inference")
  inference_df = run_inference(
      model=trained_model,
      dataloader=dataloader,
      dataset=dataset,
      device=device,
      k=k)

  #--- Align predictions with original dataset
  results_df = df.copy()
  results_df['Cluster'] = results_df.index.map(inference_df.set_index('Index')['Cluster'])

  return results_df

#------------------------------------------------------------------------------
# **Example of using the network. This code does the following:**

### 1. Generate a dataset to play with  

### 2. List out columns by variable type (continuous, categorical)  
    - List containing the name of the columns with continuous values is defined
    - List containing the name of the columns with categorical values is defined  

### 3. Set settings for training:  
    - data = your cleaned dataset (cannot have missing values)  
    - k = number of clusters  
    - categorical_cols = The list with the names of the categorical columns  
    - numerical_cols = The list with the names of the continuous columns
    - batch_size = Defines how many data points are fed into the network per training step

### 4. The resulting dataset contains the **TRANSFORMED** variables, and a new column "Clusters" containing the cluster assignment for each person.


### **NOTE**: Other settings exist but these are the bare minimum ones  
# ---



In [25]:
#-----------------------------------------
#---------- 1.Generate a dataset ---------
#-----------------------------------------
n_clusters = 3
cluster_samples = 500 // n_clusters

df_store = []

for cluster_id in range(n_clusters):
    categorical_variable1 = np.random.choice(['A', 'B', 'C'], size=cluster_samples)
    categorical_variable2 = np.random.choice(['X', 'Y', 'Z'], size=cluster_samples)

    num1 = cluster_id * 50 + np.random.normal(scale=5, size=cluster_samples)
    num2 = cluster_id * 20 + np.random.normal(scale=3, size=cluster_samples)

    cluster_df = pd.DataFrame({'category1': categorical_variable1, 'category2': categorical_variable2, 'num1': num1, 'num2': num2})
    df_store.append(cluster_df)

df = pd.concat(df_store, ignore_index=True)

#-----------------------------------------
# 2. List out Columns by variable
#     (numerical, categorical)
#-----------------------------------------
numerical_columns = ['num1', 'num2']
categorical_columns = ['category1', 'category2']


#-----------------------------------------
#---------- 3. Train the Model -----------
#-----------------------------------------
results_df = train_model(
    data=df,
    k=3,
    categorical_cols=categorical_columns,
    numerical_cols=numerical_columns,
    batch_size=100)


#-----------------------------------------
#--------------- Results -----------------
#-----------------------------------------
#- The resulting dataframe contains a new column "Cluster" with the cluster assignment
results_df

Device: Using cpu for training
Initializing dataloader..
Training Network..
Running inference


Unnamed: 0,num1,num2,category1_A,category1_B,category1_C,category2_X,category2_Y,category2_Z,Cluster
0,0.139401,0.120370,0.0,1.0,0.0,0.0,1.0,0.0,1
1,0.092131,0.181113,0.0,1.0,0.0,0.0,0.0,1.0,1
2,0.103859,0.186419,1.0,0.0,0.0,1.0,0.0,0.0,2
3,0.100719,0.098470,1.0,0.0,0.0,0.0,1.0,0.0,2
4,0.104481,0.121441,0.0,1.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
493,0.878775,0.886209,1.0,0.0,0.0,0.0,0.0,1.0,2
494,0.892436,0.879508,1.0,0.0,0.0,0.0,0.0,1.0,2
495,0.926543,0.809942,0.0,0.0,1.0,0.0,0.0,1.0,0
496,0.973000,0.751529,0.0,0.0,1.0,1.0,0.0,0.0,0


#------------------------------------------------------------------------------
# **Use on your Dataset**

### 1. Pre-process (***No missing values*** and any other data cleaning)
    - Make sure you've handled MISSING VALUES
    - Encoding variables into 0 -> 1 and one-hot encoding are handled by the tarining function

### 2. List out columns by variable type (continuous, categorical)  
    - List containing the name of the columns with continuous values is defined
    - List containing the name of the columns with categorical values is defined  

### 3. Set settings for training:  
    - data = your cleaned dataset (cannot have missing values)  
    - k = number of clusters  
    - categorical_cols = The list with the names of the categorical columns  
    - numerical_cols = The list with the names of the continuous columns
    - batch_size = Defines how many data points are fed into the network per training step

### 4. The resulting dataset contains the **TRANSFORMED** variables, and a new column "Clusters" containing the cluster assignment for each person.

In [None]:
#-----------------------------------------
#------------ Clean your Data ------------
#-----------------------------------------
#-- Make sure to handle missing values

#-----------------------------------------
#  List out Columns by variable type
#     (numerical, categorical)
#-----------------------------------------

# numerical_columns = ['age', 'income']
# categorical_columns = ['gender', 'subscribed_or_not']


#-----------------------------------------
#------------ Train the Model ------------
#-----------------------------------------
results_df = train_model(
    #- Pass the name of your dataset
    data = dataset_name,
    #- Pass the number of clusters
    k = number_of_clusters,
    #- Pass the lists with the column names
    categorical_cols = categorical_columns,
    numerical_cols = numerical_columns,
    #- Set the batch_size
    batch_size = 100)



#-----------------------------------------
#------------ Resulting Dataframe --------
#-----------------------------------------
results_df