<h1>Install the necessary packages</h1>

In [None]:
!pip install pandas
!pip install scikit-learn
!pip install numpy
!pip install scipy
!pip install cvxpy
!pip install mosek

<h1>Load the SDP + fixed-point iteration algorithm</h1>

In [1]:
import numpy as np
import cvxpy as cvx
from itertools import combinations
from scipy.spatial import distance


def get_distances(data, ord=2):
    """Return the pairwise Minkowski distance matrix of `data` and its total weight.

    Parameters
    ----------
    data : array-like accepted by `scipy.spatial.distance.cdist`
        One observation per row.
    ord : number, default 2
        Order `p` of the Minkowski metric (2 gives Euclidean distances).

    Returns
    -------
    (pairwise, total) : tuple
        `pairwise` is the full distance matrix between all rows of `data`;
        `total` is the sum of all its entries divided by two.
    """
    pairwise = distance.cdist(data, data, metric='minkowski', p=ord)
    # The matrix holds each unordered pair twice (i, j) and (j, i), hence the halving.
    return pairwise, pairwise.sum() / 2


def kcut(data, classes, k):
    """Return the total within-cluster Euclidean distance of a labelling.

    For every label in ``range(k)`` the pairwise Euclidean distances between
    the points carrying that label are summed; the per-cluster sums are then
    added together. Points whose label lies outside ``range(k)`` (for example
    -1) do not contribute.

    Parameters
    ----------
    data : array-like, one point per row (2-D)
    classes : array-like of int, one label per row of `data`
    k : int
        Number of labels to consider.

    Returns
    -------
    number
        The summed within-cluster distance; 0 when no cluster holds two or
        more points.
    """
    data = np.array(data)
    classes = np.array(classes)

    weight = 0
    for cluster in range(k):
        members = data[classes == cluster]

        # Fewer than two points means there is no pair to measure.
        if len(members) < 2:
            continue

        # pdist yields each unordered pair exactly once and avoids building
        # every pair as Python objects, which is quadratic in memory.
        weight += np.sum(distance.pdist(members, metric='euclidean'))

    return weight

In [2]:
def max_k_cut(distances, k):
    """Solve the SDP relaxation of max k-cut from Frieze and Jerrum's 1997 paper.

    Maximises ``sum(distances * (1 - Y))`` over symmetric positive
    semidefinite ``Y`` with unit diagonal and every entry at least
    ``-1 / (k - 1)``.

    Parameters
    ----------
    distances : (n, n) array-like
        Pairwise weights.
    k : int
        Number of parts.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        The optimal ``Y``. If its smallest eigenvalue is numerically negative,
        a small multiple of the identity is added so that the returned matrix
        is positive semidefinite.

    Raises
    ------
    RuntimeError
        If the solver finishes without producing a value for ``Y``.
    """
    n = len(distances)

    Y = cvx.Variable((n, n), PSD=True)
    constraints = [Y >= (-1 / (k - 1)), cvx.diag(Y) == 1]
    expr = cvx.sum(cvx.multiply(distances, np.ones((n, n)) - Y))
    problem = cvx.Problem(cvx.Maximize(expr), constraints)

    problem.solve(solver='MOSEK')

    y = Y.value
    if y is None:
        # Without this check the failure would only surface as an obscure
        # error inside the eigenvalue routine below.
        raise RuntimeError(
            f'max_k_cut: solver returned no solution (status: {problem.status})'
        )

    smallest = np.linalg.eigh(y)[0].min()
    if smallest < 0:
        # Shift by slightly more than |smallest| to fix floating-point imprecision.
        y = y + abs(smallest) * np.identity(n) * 1.00001

    return y


def fixed_point(x, k, n):
    """Perform the fixed-point iteration step described in Felzenszwalb et al.'s 2022 paper.

    Maximises ``sum((x + c) * Y)`` with ``c = (1 - k/2) / (k - 1)`` over
    symmetric positive semidefinite ``Y`` with unit diagonal and every entry
    at least ``-1 / (k - 1)``.

    Parameters
    ----------
    x : (n, n) array-like
        Current iterate.
    k : int
        Number of parts.
    n : int
        Side length of `x`.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        The optimal ``Y``. If its smallest eigenvalue is numerically negative,
        a small multiple of the identity is added so that the returned matrix
        is positive semidefinite.

    Raises
    ------
    RuntimeError
        If the solver finishes without producing a value for ``Y``.
    """
    # The same constant is added to every entry of x, so a scalar is enough
    # (a length-n vector of this constant only worked through broadcasting).
    offset = (1 - k/2) / (k-1)
    Y = cvx.Variable((n, n), PSD=True)

    constraints = [Y >= (-1 / (k - 1)), cvx.diag(Y) == 1]
    expr = cvx.sum(cvx.multiply(np.asarray(x) + offset, Y))
    problem = cvx.Problem(cvx.Maximize(expr), constraints)

    problem.solve(solver='MOSEK')

    y = Y.value
    if y is None:
        # Without this check the failure would only surface as an obscure
        # error inside the eigenvalue routine below.
        raise RuntimeError(
            f'fixed_point: solver returned no solution (status: {problem.status})'
        )

    smallest = np.linalg.eigh(y)[0].min()
    if smallest < 0:
        # Shift by slightly more than |smallest| to fix floating-point imprecision.
        y = y + abs(smallest) * np.identity(n) * 1.00001

    return y

In [3]:
def sdp_fp(data, fp_iterations, k, distances=None):
    """Cluster points with the max k-cut SDP, fixed-point refinement and greedy rounding.

    The SDP solution from `max_k_cut` is passed through `fixed_point`
    `fp_iterations` times. Clusters are then read off greedily: the first
    unassigned point becomes an anchor and every unassigned point whose entry
    with the anchor is positive joins the anchor's cluster.

    Parameters
    ----------
    data : array-like (one point per row) or None
        Feature vectors. May be None when only a weight matrix exists (for
        example a graph instance); `distances` is then required.
    fp_iterations : int
        Number of fixed-point iterations to apply.
    k : int
        Maximum number of clusters to extract.
    distances : (n, n) array-like, optional
        Pairwise weights. Computed from `data` as Euclidean distances when
        omitted.

    Returns
    -------
    classes : list of int
        Label per point in ``range(k)``; points still unassigned after `k`
        clusters have been extracted keep the label -1.
    weight : number
        Total within-cluster weight of the labelling. Computed by `kcut` from
        `data` when `data` is given, otherwise from `distances` (assumed
        symmetric with a zero diagonal).

    Raises
    ------
    ValueError
        If both `data` and `distances` are None.
    """
    if data is None and distances is None:
        raise ValueError('sdp_fp needs `data` or `distances`')

    if distances is None:
        distances = distance.cdist(data, data, metric='minkowski', p=2)

    n = len(distances) if data is None else len(data)

    x = max_k_cut(distances, k)
    for _ in range(fp_iterations):
        x = fixed_point(x, k, n)

    unused = list(range(n))
    classes = [-1] * n

    # Use the parameter k here, not the notebook-level K.
    for label in range(k):
        if not unused:
            # Every point is already assigned; fewer than k clusters were needed.
            break

        anchor = unused[0]
        remaining = []
        for idx in unused:
            if x[anchor, idx] > 0:
                classes[idx] = label
            else:
                remaining.append(idx)
        unused = remaining

    if data is not None:
        return classes, kcut(data, classes, k)

    # No feature vectors: score the partition directly on the weight matrix.
    weights = np.asarray(distances)
    labels = np.asarray(classes)
    weight = 0
    for label in range(k):
        members = np.where(labels == label)[0]
        # The symmetric sub-block counts each unordered pair twice.
        weight += weights[members][:, members].sum() / 2

    return classes, weight

<h1>Select a dataset to load</h1>

<h3>Iris</h3>

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Iris: the first four columns are used as features, the fifth as the reference label.
DATASET = 'datasets/iris.csv'
df = pd.read_csv(DATASET, index_col=False)

data = df.iloc[:, :4].to_numpy()
output = df.iloc[:, 4].values

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>Palmer Penguins</h3>

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Palmer Penguins: impute, one-hot encode and standardise before computing distances.
DATASET = 'datasets/penguins.csv'
df = pd.read_csv(DATASET, index_col=[0])

# Split the columns by type so each group gets its own imputation strategy.
numerical_data = df.get(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year'])
categorical_data = df.get(['island', 'sex'])

# Missing numeric values -> column mean.
num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_data = num_imputer.fit_transform(numerical_data)

# Missing categorical values -> most frequent category.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categorical_data = cat_imputer.fit_transform(categorical_data)

# One-hot encode the categorical columns into a dense array.
encoder = OneHotEncoder()
categorical_data = encoder.fit_transform(categorical_data).toarray()

# Numeric columns first, encoded categorical columns after them.
concatenated_data = np.concatenate((numerical_data, categorical_data), axis=1)

scaler = StandardScaler()
data = scaler.fit_transform(concatenated_data)

output = df.get('species').values

# One target cluster per distinct species.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>MNIST</h3>

In [None]:
from sklearn.metrics import adjusted_rand_score
from collections import Counter
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import pandas as pd
import math
import numpy as np

size = 15000

# The MNIST subset was preprocessed ahead of time and saved to disk for
# convenience because of its size; the files are read back with np.load.
with open(f'datasets/pickles/mnist{size}_data.npy', 'rb') as data_pickle:
    data = np.load(data_pickle)

with open(f'datasets/pickles/mnist{size}_output.npy', 'rb') as output_pickle:
    output = np.load(output_pickle)

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>Crop Recommendation</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Crop Recommendation: the first seven columns are used as features, the eighth as the reference label.
DATASET = 'datasets/Crop_recommendation.csv'
df = pd.read_csv(DATASET, index_col=False)

data = df.iloc[:, :7].to_numpy()
output = df.iloc[:, 7].values

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>Seeds</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Seeds: tab-separated file without a header row; the first seven columns are
# used as features, the eighth as the reference label.
DATASET = 'datasets/seeds_dataset.csv'
df = pd.read_csv(DATASET, index_col=None, header=None, sep='\t')

data = df.iloc[:, :7].to_numpy()
output = df.iloc[:, 7].values

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>Leaf</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Leaf: no header row; the file's second column becomes the index, so of the
# remaining columns the first is the reference label and the rest are features.
DATASET = 'datasets/leaf.csv'
df = pd.read_csv(DATASET, index_col=[1], header=None)

data = df.iloc[:, 1:].to_numpy()
output = df.iloc[:, 0].values

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>Wine</h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Wine: no header row; the first column is the reference label and the
# remaining columns are used as features.
DATASET = 'datasets/wine.csv'
df = pd.read_csv(DATASET, index_col=None, header=None)

data = df.iloc[:, 1:].to_numpy()
output = df.iloc[:, 0].values

# One target cluster per distinct reference label.
K = np.unique(output).size

distances, all_weights = get_distances(data)

<h3>G-set</h3>

In [None]:
# Only the G1, G2, G3, G14, G15, G16, G22, G23, G24, G35, G36, G37, G43, G44, G45, G48, G49, and G50 graphs are included

def load_G(num):
    """Load G-set graph `num` as a dense symmetric weight matrix.

    Reads ``datasets/Gset/G<num>``. The first line holds the vertex count and
    the number of edge lines; each following edge line holds two 1-based
    vertex ids and an integer weight.

    The module-level `distances` and `all_weights` are set as before, and the
    same two values are also returned so callers need not rely on globals.

    Parameters
    ----------
    num : int or str
        Suffix of the graph file name.

    Returns
    -------
    (distances, all_weights) : tuple
        `distances` is an (n, n) int32 array with each edge weight stored at
        both (a, b) and (b, a); `all_weights` is the sum of all its entries
        divided by two.
    """
    global distances, all_weights
    with open(f'datasets/Gset/G{num}', 'r') as f:
        n, lines = map(int, f.readline().split())

        distances = np.zeros((n, n), dtype=np.int32)
        for _ in range(lines):
            a, b, weight = map(int, f.readline().split())
            # File ids are 1-based; array indices are 0-based.
            a -= 1
            b -= 1
            distances[a, b] = weight
            distances[b, a] = weight

    # Each edge is stored twice in the symmetric matrix.
    all_weights = np.sum(distances) / 2
    return distances, all_weights

# Ask which G-set instance to load: the prompt is printed first, then the
# graph number is read from standard input.
print('Load G_:')
g_val = int(input())
# load_G fills the notebook-level `distances` and `all_weights`.
load_G(g_val)
# A graph instance only provides a weight matrix, so there is no feature matrix.
data = None
# Number of parts handed to the algorithm as `k` by the run cell.
K = 2
# NOTE(review): this cell does not define `output`, which the run cell passes
# to adjusted_rand_score — confirm what that score should be compared against.

<h1>Run the algorithm</h1>

In [5]:
from sklearn.metrics import adjusted_rand_score
from collections import Counter
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import pandas as pd
import math

# Number of fixed-point iterations sdp_fp applies after the initial SDP solve.
fp_iterations = 1

# `data`, `distances`, `K`, `all_weights` and `output` come from whichever
# dataset cell was executed before this one.
solution, min_k_partition_weight = sdp_fp(data, fp_iterations, K, distances=distances)
# Total pairwise weight minus the within-cluster weight leaves the weight of
# the pairs that the partition separates.
max_k_cut_weight = all_weights - min_k_partition_weight

print(f'Solution has a max k-cut weight of: {max_k_cut_weight}')
# Compares the computed labels with the reference labels in `output`.
print(f'Solution obtains an adjusted Rand index of: {adjusted_rand_score(solution, output)}')

Solution has a max k-cut weight of: 25016.881715538613
Solution obtains an adjusted Rand index of: 0.7561944834034595
