<h1>Install the necessary packages</h1>

In [None]:
# Use the %pip magic rather than !pip: %pip installs into the environment of
# the running kernel, whereas a shell `pip` may belong to another interpreter.
%pip install pandas
%pip install scikit-learn
%pip install numpy
%pip install munkres
%pip install scipy

<h1>Load the GRASP+PR algorithm</h1>

In [1]:
import numpy as np
from itertools import combinations
from sklearn.metrics.cluster import contingency_matrix
from munkres import Munkres
from scipy.spatial import distance


# This function returns the distance matrix for a dataset along with the sum of distances between every pair of points
def get_distances(data, ord=2):
  """Build the pairwise Minkowski distance matrix of `data`.

  data: 2-D array-like, one row per point.
  ord:  order p of the Minkowski metric (2 gives Euclidean distance).

  Returns a tuple (matrix, total) where `matrix[i, j]` is the distance between
  rows i and j and `total` is the sum over all unordered pairs.
  """
  pairwise = distance.cdist(data, data, metric='minkowski', p=ord)
  # The matrix holds every pair twice (i, j) and (j, i), hence the halving.
  total = pairwise.sum() / 2
  return pairwise, total


# This function calculates the k-partition weight of a given clustering: the sum of Euclidean distances between every pair of points that share a cluster
def kcut(data, classes, k):
    """Return the k-partition weight of a clustering.

    The weight is the sum of Euclidean distances over every unordered pair of
    points that carry the same cluster label. Only labels 0..k-1 are
    considered; points labelled outside that range contribute nothing.

    data:    array-like of points, one row per point.
    classes: array-like of integer cluster labels, aligned with `data`.
    k:       number of clusters.

    Returns 0 when no cluster contains at least two points, otherwise a float.
    """
    data = np.array(data)
    classes = np.array(classes)

    weight = 0
    for cluster in range(k):
        locs = np.where(classes == cluster)[0]

        # A cluster needs at least two members to contain a pair.
        if len(locs) < 2:
            continue

        # pdist yields each unordered pair exactly once, computed in compiled
        # code, instead of materialising every pair as Python objects first.
        weight += np.sum(distance.pdist(data[locs], metric='euclidean'))

    return weight
  

# Code taken and modified from
# https://stackoverflow.com/questions/55258457/find-mapping-that-translates-one-list-of-clusters-to-another-in-python
# This function maps the labels of one clustering onto the labels of another specified clustering so that the two agree on as many points as possible
def translate_labels(master_list, convert_list):
  """Map the labels of `convert_list` onto the labels of `master_list`.

  The contingency matrix of the two labelings is turned into a cost matrix and
  solved as an assignment problem, so that matched labels overlap on as many
  points as possible.

  master_list:  reference labels, one per point.
  convert_list: labels to translate, one per point (same length).

  Returns a dict with one entry for EVERY distinct label in `convert_list`.
  Labels left unmatched by the assignment (this happens when `convert_list`
  uses more distinct labels than `master_list`) keep their own name when it is
  still free and otherwise take a label no other entry maps to, so the result
  never merges two clusters and lookups never raise KeyError.
  """
  cont_mat = contingency_matrix(master_list, convert_list)
  # The solver minimises cost, so large overlaps must become small costs.
  # Plain nested lists are handed over because that is the input form the
  # munkres package documents; it pads non-square matrices itself.
  cost_matrix = (cont_mat.max() - cont_mat).tolist()
  assignment = Munkres().compute(cost_matrix)

  master_labels = np.unique(master_list)
  to_convert = np.unique(convert_list)

  label_map = {}
  for master_index, convert_index in assignment:
    label_map[to_convert[convert_index]] = master_labels[master_index]

  unmatched = [label for label in to_convert if label not in label_map]
  if unmatched:
    used_targets = set(label_map.values())

    # First pass: an unmatched label keeps its own name if nobody maps to it.
    pending = []
    for label in unmatched:
      if label in used_targets:
        pending.append(label)
      else:
        label_map[label] = label
        used_targets.add(label)

    # Second pass: remaining labels take names that are still unused.
    spare_targets = [label for label in to_convert if label not in used_targets]
    for label, target in zip(pending, spare_targets):
      label_map[label] = target

  return label_map


# This function determines how different two clusterings are
def set_difference(classes, guiding):
  """Return the fraction of points on which two clusterings disagree.

  `guiding` is first relabelled with translate_labels so that its label names
  line up with those of `classes`; the result is the share of positions whose
  labels still differ after that alignment.
  """
  relabel = translate_labels(classes, guiding)
  aligned = [relabel[label] for label in guiding]

  mismatches = sum(1 for own, other in zip(classes, aligned) if own != other)

  return mismatches / len(aligned)

In [2]:
import numpy as np
import random
import time


# Code for the greedy build phase of the algorithm
def greedy_build(data, k):
    """Greedy construction phase: place points one at a time.

    Points are taken in the order given. Each point is tentatively put in
    every cluster 0..k-1 and kept in the one that yields the smallest
    k-partition weight (kcut) of the points placed so far. Ties go to the
    lowest cluster index.

    data: iterable of points, one row per point.
    k:    number of clusters.

    Returns a numpy array of cluster labels aligned with `data`.
    """
    built_data = []
    assigned_clusters = []
    for point in data:
        built_data.append(point)

        # None marks "no candidate evaluated yet". A numeric sentinel such as
        # 0 cannot be used because 0 is a legitimate (and optimal) weight.
        best = None
        assigned = 0
        for cluster in range(k):
            weight = kcut(built_data, assigned_clusters + [cluster], k)
            if best is None or weight < best:
                best = weight
                assigned = cluster

        assigned_clusters.append(assigned)

    return np.array(assigned_clusters)


# Code for the local search phase of the algorithm
def local_search(data, classes, k):
    """Local search phase: move single points while the weight improves.

    Positions are scanned in order. For each one, every other cluster is tried
    and the move giving the lowest k-partition weight below the current weight
    is applied. After an applied move the scan restarts from the first
    position; the search ends when a full scan applies no move.

    `classes` is modified in place and also returned.
    """
    current_weight = kcut(data, classes, k)

    improved = True
    while improved:
        improved = False
        for position, original_label in enumerate(classes):
            best_label = original_label
            alternatives = (c for c in range(k) if c != original_label)
            for candidate in alternatives:
                # Tentatively relabel the point and score the result.
                classes[position] = candidate
                candidate_weight = kcut(data, classes, k)
                if candidate_weight < current_weight:
                    current_weight, best_label = candidate_weight, candidate

            # Either commits the best move or restores the original label.
            classes[position] = best_label
            if best_label != original_label:
                improved = True
                break

    return classes


# Code for the path relinking phase of the algorithm
def path_relinking(data, classes, k, elite_set):
  """Path relinking phase: walk from `classes` towards a random elite solution.

  A guiding solution is drawn at random from `elite_set` and relabelled with
  translate_labels so its label names line up with `classes`. While more than
  one position still differs, the differing position whose switch to the
  guiding label changes the k-partition weight the least is switched. The
  clustering with the lowest cumulative weight change seen along the walk is
  returned (the starting clustering if no step improved on it).

  data:      array of points, one row per point.
  classes:   starting cluster labels; the caller's object is not modified.
  k:         number of clusters.
  elite_set: non-empty sequence of label arrays to draw the guide from.

  Returns a numpy array of cluster labels.
  """
  guiding = random.choice(elite_set).copy()

  # Walk on a private copy so the caller's labels are left untouched.
  classes = np.array(classes)

  mapping = translate_labels(classes, guiding)
  guiding = [mapping[class_name] for class_name in guiding]

  diff = [pos for pos, (x, y) in enumerate(zip(classes, guiding)) if x != y]

  weight_delta = 0
  best_weight_delta = 0

  best_classes = classes.copy()
  while len(diff) > 1:
    # The weight of the current clustering is the same for every candidate
    # move of this step, so it is evaluated once rather than per candidate.
    current_weight = kcut(data, classes, k)

    movements = []
    for index in diff:
      relinked_classes = classes.copy()
      relinked_classes[index] = guiding[index]
      movements.append(kcut(data, relinked_classes, k) - current_weight)

    relink_index = np.argmin(movements)

    index = diff[relink_index]
    classes[index] = guiding[index]
    weight_delta += movements[relink_index]
    if weight_delta < best_weight_delta:
      best_weight_delta = weight_delta
      best_classes = classes.copy()

    del diff[relink_index]

  return best_classes

In [3]:
from sklearn.metrics import rand_score
from scipy.spatial import distance
import numpy as np
import time
from sklearn.metrics import adjusted_rand_score

# Debug switches. NOTE(review): neither name is read anywhere in the visible
# notebook cells — presumably leftovers from seeded debug runs; confirm before
# relying on them.
DEBUG = False
DEBUG_SEED = 0


def grasp_pr(data, iterations, k, max_elite):
  """Run GRASP with path relinking and return the best clustering found.

  Every iteration shuffles the points, builds a clustering greedily, improves
  it with local search and, from the second iteration on, relinks it towards a
  random member of the elite pool. The pool and its weights are always updated
  together, so index i of one list describes index i of the other.

  data:       numpy array of points, one row per point.
  iterations: number of GRASP iterations.
  k:          number of clusters.
  max_elite:  maximum size of the elite pool (the first solution is always
              admitted so that path relinking has a guide).

  Returns (best_solution, best_weight): the label array aligned with `data`
  and its k-partition weight. With iterations == 0 the initial placeholders
  (0, -1) are returned.

  Prints a textual progress bar to stdout.
  """
  # Minimum fraction of differing labels a mid-quality solution must have
  # against every elite member before it may replace the worst one.
  REPLACE_THRESHOLD = 0.01

  best_weight = -1
  best_solution = 0
  elite_set = []
  elite_set_weights = []

  print('|', end='')

  checkpoint = 1

  for i in range(iterations):
    shuffle_order = np.arange(data.shape[0])
    np.random.shuffle(shuffle_order)

    shuffled_data = data[shuffle_order]

    build = greedy_build(shuffled_data, k)
    build = local_search(shuffled_data, build, k)

    # Invert the permutation so labels line up with the original row order.
    unshuffle_order = np.zeros_like(shuffle_order)
    unshuffle_order[shuffle_order] = np.arange(data.shape[0])
    unshuffled_build = build[unshuffle_order]

    if not elite_set:
      # First solution: nothing to relink against yet, seed the pool with it.
      weight = kcut(data, unshuffled_build, k)
      elite_set.append(unshuffled_build)
      elite_set_weights.append(weight)
    else:
      unshuffled_build = path_relinking(data, unshuffled_build, k, elite_set)
      weight = kcut(data, unshuffled_build, k)

      if len(elite_set) < max_elite:
        elite_set.append(unshuffled_build)
        elite_set_weights.append(weight)

      elif weight < min(elite_set_weights):
        # Better than every elite member: always replaces the worst one.
        worst = np.argmax(elite_set_weights)
        elite_set[worst] = unshuffled_build
        elite_set_weights[worst] = weight

      elif weight < max(elite_set_weights):
        # Better than the worst member only: admitted if sufficiently distinct.
        difference = min([set_difference(unshuffled_build, elite_solution) for elite_solution in elite_set])

        if difference > REPLACE_THRESHOLD:
          worst = np.argmax(elite_set_weights)
          elite_set[worst] = unshuffled_build
          elite_set_weights[worst] = weight

    # A new overall best has already entered the pool through one of the
    # branches above, so only the incumbent needs updating here.
    if best_weight == -1 or weight < best_weight:
      best_weight = weight
      best_solution = unshuffled_build

    # Progress bar. With fewer than 100 iterations the step `iterations // 100`
    # is 0, so one digit is printed per iteration.
    if (i + 1) >= checkpoint * (iterations // 100):
      print(checkpoint%10, end='')
      checkpoint += 1

  print()
  return best_solution, best_weight

<h1>Select a dataset to load</h1>

<h3>Iris</h3>

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Path of the Iris CSV, relative to the notebook's working directory.
DATASET = 'datasets/iris.csv'

# index_col=False: do not use the first column as the row index.
df = pd.read_csv(DATASET, index_col=False)

# Columns 0-3 are used as features, column 4 as the reference labels.
data = df.iloc[:, :4].values
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
output = df.iloc[:, 4].values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>Palmer Penguins</h3>

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Path of the Palmer Penguins CSV, relative to the notebook's working directory.
DATASET = 'datasets/penguins.csv'

# The first CSV column is used as the row index.
df = pd.read_csv(DATASET, index_col=[0])
# Split the features by type so each group gets its own preprocessing.
numerical_data = df.get(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year'])
categorical_data = df.get(['island', 'sex'])

# Missing numeric values are replaced by the column mean.
num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_data = num_imputer.fit_transform(numerical_data)

# Missing categorical values are replaced by the most frequent category.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categorical_data = cat_imputer.fit_transform(categorical_data)

# One-hot encode the categorical columns and densify the sparse result.
encoder = OneHotEncoder()
categorical_data = encoder.fit_transform(categorical_data).toarray()

# Numeric columns first, one-hot columns after them.
concatenated_data = np.append(numerical_data, categorical_data, axis=1)

# Standardise every column (including the one-hot ones) before measuring
# distances.
scaler = StandardScaler()

data = scaler.fit_transform(concatenated_data)
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
# The species column serves as the reference labels.
output = df.get('species').values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>MNIST</h3>

In [None]:
from sklearn.metrics import adjusted_rand_score
from collections import Counter
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import pandas as pd
import math
import numpy as np

# Selects which pre-saved subset to load; it is interpolated into both file
# names below. Presumably the number of samples in the subset — confirm
# against how the files were produced.
size = 15000

# The subset of the MNIST dataset used was preprocessed and pickled for convenience
# due to its large size

# Feature array of the subset.
with open(f'datasets/pickles/mnist{size}_data.npy', 'rb') as data_pickle:
  data = np.load(data_pickle)

# Reference labels of the subset.
with open(f'datasets/pickles/mnist{size}_output.npy', 'rb') as output_pickle:
  output = np.load(output_pickle)

# get_distances builds the full pairwise matrix (rows x rows of `data`).
# NOTE(review): for a subset this large that matrix is presumably several
# gigabytes — confirm the memory budget before running.
distances, all_weights = get_distances(data)

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>Crop Recommendation</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Path of the Crop Recommendation CSV, relative to the notebook's working
# directory.
DATASET = 'datasets/Crop_recommendation.csv'

# index_col=False: do not use the first column as the row index.
df = pd.read_csv(DATASET, index_col=False)

# Columns 0-6 are used as features, column 7 as the reference labels.
data = df.iloc[:, :7].values
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
output = df.iloc[:, 7].values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>Seeds</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Path of the Seeds data file, relative to the notebook's working directory.
DATASET = 'datasets/seeds_dataset.csv'

# The file is read as tab-separated with no header row, so columns are
# numbered 0..n-1.
df = pd.read_csv(DATASET, index_col=None, header=None, sep='\t')

# Columns 0-6 are used as features, column 7 as the reference labels.
data = df.iloc[:, :7].values
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
output = df.iloc[:, 7].values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>Leaf</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Path of the Leaf CSV, relative to the notebook's working directory.
DATASET = 'datasets/leaf.csv'

# No header row. File column 1 becomes the row index, which removes it from
# the positional columns used below. Presumably it is a per-sample identifier
# rather than a feature — confirm against the dataset description.
df = pd.read_csv(DATASET, index_col=[1], header=None)

# After the index column is removed, position 0 holds the reference labels and
# the remaining positions are used as features.
data = df.iloc[:, 1:].values
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
output = df.iloc[:, 0].values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>Wine</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Path of the Wine CSV, relative to the notebook's working directory.
DATASET = 'datasets/wine.csv'

# No header row, so columns are numbered 0..n-1.
df = pd.read_csv(DATASET, index_col=None, header=None)

# Column 0 holds the reference labels; every other column is used as a feature.
data = df.iloc[:, 1:].values
# Pairwise distance matrix and the total weight over all pairs; the run cell
# uses `all_weights` to turn a partition weight into a cut weight.
distances, all_weights = get_distances(data)
output = df.iloc[:, 0].values

# Number of clusters = number of distinct reference labels.
K = len(np.unique(output))

<h3>G-set</h3>

In [None]:
# Only the G1, G2, G3, G14, G15, G16, G22, G23, G24, G35, G36, G37, G43, G44, G45, G48, G49, and G50 graphs are included

def load_G(num):
  """Load graph G<num> from datasets/Gset into the notebook globals.

  The first line of the file holds the vertex count and the edge count; each
  following line holds an edge as `a b weight` with 1-based vertex numbers.

  Sets the globals `distances` (symmetric int32 weight matrix) and
  `all_weights` (half the sum of that matrix). Returns nothing.
  """
  global distances, all_weights
  with open(f'datasets/Gset/G{num}', 'r') as graph_file:
    header = graph_file.readline().split()
    n, lines = map(int, header)

    distances = np.zeros((n, n), dtype=np.int32)
    for _ in range(lines):
      u, v, edge_weight = (int(token) for token in graph_file.readline().split())
      # Vertex numbers in the file start at 1; matrix indices start at 0.
      row, col = u - 1, v - 1
      distances[row, col] = edge_weight
      distances[col, row] = edge_weight

  # Each edge is stored twice in the symmetric matrix.
  all_weights = distances.sum() / 2

# Ask interactively which G-set graph to load (the number after the "G").
print('Load G_:')
g_val = int(input())
load_G(g_val)
# NOTE(review): G-set graphs come as edge weights only, so there are no point
# coordinates and `data` is cleared. The run cell passes `data` to grasp_pr,
# which reads data.shape, and scores against `output`, which this cell does
# not define — this path looks incomplete; confirm how it is meant to be run.
data = None
# Two clusters are requested for these graphs.
K = 2

<h1>Run the algorithm</h1>

In [5]:
from sklearn.metrics import adjusted_rand_score
from collections import Counter
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import pandas as pd
import math

# Maximum size of the elite pool and number of GRASP iterations.
MAX_ELITE = 2
iterations = 10

# Uses `data`, `K`, `all_weights` and `output` from whichever dataset cell was
# run last. No random seed is set in the visible cells, so results can differ
# between runs.
solution, min_k_partition_weight = grasp_pr(data, iterations, K, MAX_ELITE)
# Weight crossing between clusters = total pairwise weight minus the weight
# kept inside clusters.
max_k_cut_weight = all_weights - min_k_partition_weight

print(f'Solution has a max k-cut weight of: {max_k_cut_weight}')
# Agreement between the found clustering and the reference labels.
print(f'Solution obtains an adjusted Rand index of: {adjusted_rand_score(solution, output)}')

|1234567890
Solution has a max k-cut weight of: 25016.881715538613
Solution obtains an adjusted Rand index of: 0.7561944834034595
