This colab contains code for creating a correlation clustering problem defined by two weight matrices, W_plus and W_minus. You should add code to implement an approximation algorithm using semidefinite programming and randomized rounding, as described in [Williamson and Shmoys](https://www.designofapproxalgs.com/book.pdf) section 6.4.

# Construct weight matrices

In [3]:
# Fetch and import libraries
#!pip install picos -q
import picos as pc
import cvxopt as cvx
import cvxopt.lapack
from scipy.linalg import cholesky
import numpy as np

In [4]:
# Fetch data
# !wget -q https://raw.githubusercontent.com/rasmus-pagh/apx/main/data/denmark-0.6.txt -O denmark-0.6.txt
# !wget -q https://raw.githubusercontent.com/rasmus-pagh/apx/main/data/learning-0.6.txt -O learning-0.6.txt
# !wget -q https://raw.githubusercontent.com/rasmus-pagh/apx/main/data/copenhagen-0.5.txt -O copenhagen-0.5.txt


There are three data files containing [GloVe](https://nlp.stanford.edu/projects/glove/) vectors, whose dot products measure the similarity between words. The matrix W_plus is defined as dot products of such vectors, while W_minus is constant, equaling the average in W_plus.

In [5]:
import numpy as np

# Place the file you want to work with last
filename = 'learning-0.6.txt'
filename = 'copenhagen-0.5.txt'
filename = 'denmark-0.6.txt'

# Read vectors and construct matrices
with open(filename, 'r') as f:
  feature_vectors = []
  words = []
  for line in f:
    word, vector = line.split(';')
    words.append(word)
    vector = [ float(x) for x in vector.split(',') ]
    feature_vectors.append(vector)
  n = len(words)
  feature_vectors = np.array(feature_vectors)
  W_plus = np.dot(feature_vectors, np.transpose(feature_vectors))
  W_minus = np.ones(shape=(n,n)) * np.average(W_plus)

# Correlation clustering

Here you can implement your approximation algorithm. It may be helpful to consult the [implementation](https://colab.research.google.com/drive/1Rhe0kra6mqt5VHc2uTlNzJ_JC6kpG8nA?usp=sharing) of an approximation algorithm for Max Cut. Your implementation must:
- Define and solve the semidefinite programming relaxation
- Output the upper bound on OPT given by the relaxation
- Output the expected value of the objective function with a random 4-clustering
- Output the value of the best 4-clustering found using randomized rounding (say, in 100 trials), and the words placed in each cluster.

If you experience problems with convergence (optimizer not terminating) don't worry about it.


In [6]:
# Define the SDP 
def correlation_clustering():
   filenames = ('learning-0.6.txt', 'copenhagen-0.5.txt', 'denmark-0.6.txt')
   for filename in filenames:
      with open(filename, 'r') as f:
        feature_vectors = []
        words = []
        for line in f:
          word, vector = line.split(';')
          words.append(word)
          vector = [ float(x) for x in vector.split(',') ]
          feature_vectors.append(vector)
        n = len(words)
        feature_vectors = np.array(feature_vectors)
        W_plus = np.dot(feature_vectors, np.transpose(feature_vectors))
        W_minus = np.ones(shape=(n,n)) * np.average(W_plus)
   
      problem = pc.Problem()
      X = pc.SymmetricVariable('X', (n,n))
      W_p = pc.Constant('W_p', W_plus)
      W_m = pc.Constant('W_m', W_minus)
      ones = pc.Constant('1', np.ones((n,n)))
      problem.add_constraint(pc.maindiag(X) == 1)
      problem.add_constraint(X >> 0)
      problem.add_constraint(X >= 0)
      problem.add_constraint(X <= 1)
      problem.set_objective('max', ((W_p | X) + (W_m | (ones - X)))/2)
      print(f"\nInspecting {filename}...\n")
      print(problem)
      problem.solve(solver='cvxopt')
      print(f"SDP upper bound on value:", problem.value)
      X = np.clip(X.value, -1, 1)
      EX = np.sum(
         np.multiply(W_plus, (1 - 1/np.pi*np.arccos(X))**2).flatten() 
         + np.multiply(W_minus, (1 - (1 - 1/np.pi * np.arccos(X))**2)).flatten()
         )/2
      print(f"Expected value on this graph is {EX}.")
      
      max_value = 0
      print(X)
      for _ in range(100):
         r1 = np.random.normal(size=n)
         r2 = np.random.normal(size=n)
         
         R1 = np.logical_and(r1@X >= 0, r2@X >= 0)
         R2 = np.logical_and(r1@X >= 0, r2@X  < 0)
         R3 = np.logical_and(r1@X  < 0, r2@X >= 0)
         R4 = np.logical_and(r1@X  < 0, r2@X  < 0)
         Rs = (R1,R2,R3,R4)
         
         similarity_reward = 1/2*(
            np.sum(W_plus[R1,:][:,R1]) 
            + np.sum(W_plus[R2,:][:,R2]) 
            + np.sum(W_plus[R3,:][:,R3]) 
            + np.sum(W_plus[R4,:][:,R4])
            )
         dissimilarity_reward = 1/2*(
            np.sum(W_minus[R1,:][:,~R1])
            + np.sum(W_minus[R2,:][:,~R2])
            + np.sum(W_minus[R3,:][:,~R3])
            + np.sum(W_minus[R4,:][:,~R4])
         )
         value = similarity_reward + dissimilarity_reward
         
         if value > max_value:
            max_clustering = Rs
            max_value = value
      R1,R2,R3,R4 = max_clustering
      word_array = np.array(words)
      print(f"Found best value {max_value}. The clusters are:\n{word_array[R1]}\n{word_array[R2]}\n{word_array[R3]}\n{word_array[R4]}")
   return max_clustering

correlation_clustering()


Inspecting denmark-0.6.txt...

Semidefinite Program
  maximize (⟨W_p, X⟩ + ⟨W_m, 1 - X⟩)/2
  over
    51×51 symmetric variable X
  subject to
    maindiag(X) = [1]
    X ≽ 0
    X ≥ 0
    X ≤ [1]
SDP upper bound on value: 814.9304216514133
Expected value on this graph is 794.1511336003958.
[[1.00000000e+00 9.74254263e-01 2.24702732e-01 ... 3.13039174e-02
  8.75275826e-01 3.93821792e-01]
 [9.74254263e-01 1.00000000e+00 4.25670938e-01 ... 2.26704584e-01
  7.95802699e-01 5.77135032e-01]
 [2.24702732e-01 4.25670938e-01 1.00000000e+00 ... 8.20888177e-01
  3.86377602e-09 8.69672227e-01]
 ...
 [3.13039174e-02 2.26704584e-01 8.20888177e-01 ... 1.00000000e+00
  6.33315528e-10 6.01085868e-01]
 [8.75275826e-01 7.95802699e-01 3.86377602e-09 ... 6.33315528e-10
  1.00000000e+00 5.52884036e-09]
 [3.93821792e-01 5.77135032e-01 8.69672227e-01 ... 6.01085868e-01
  5.52884036e-09 1.00000000e+00]]
Found best value 811.2108202828251. The clusters are:
['thailand' 'portugal' 'englan']
['switzerland' 'belgi