# SEMI-SUPERVISED LEARNING
## clustering
Good clustering involves
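* Scale Invariance: the clusters remain the same when all distances are scaled by the same factor (a zoom operation)
* Consistency: the clusters remain the same if the points inside individual clusters are compressed and the space in between clusters is stretched
* Richness: every possible grouping can be produced by some arrangement of the points - some arrangements cluster it all together, others cluster each point separately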

BUT - no clustering algorithm can provide all three of these properties at once (Kleinberg's impossibility theorem)

K-MEANS algorithm:
* is stochastic
* produces compact groups

K-Means alternates between two steps:
1. reassign each data point to its nearest center
2. recompute the center of each cluster as the mean of the points assigned to it

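The two K-Means steps can only be alternated a finite number of times before the loss stops improving; at that point the algorithm has converged (to a local optimum) and cannot optimize any more.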
* Active Learning Algorithms: can request their own labels

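K-Means by itself is unsupervised, but when it is combined with a small number of requested labels (for example one label per cluster, as done below) it becomes an "Active, Semi-Supervised Learning Algorithm"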

Multi-Task Learning: single input and multiple tasks for outputs.
* this works when all tasks are relevant to the same input, e.g. task 1 recognizes the landscape, task 2 looks for animals, and task 3 performs some other related task.



In [2]:
import numpy as np

def assign_data(data,centers):
    """Assign every data point to its closest center.

    Parameters
    ----------
    data : array-like of shape (n, d)
        The n data points, one d-dimensional point per row.
    centers : array-like of shape (k, d)
        The k cluster centers.

    Returns
    -------
    centerids : ndarray of shape (n,)
        For each data point, the index of the closest center
        (squared Euclidean distance; ties go to the lowest index).
    loss : float
        Sum over all points of the squared distance to the closest center.
    """
    # Work in floating point: with small integer dtypes (e.g. uint8 pixel
    # values) the squared differences overflow and silently wrap around.
    points = np.asarray(data, dtype=float)
    cents = np.asarray(centers, dtype=float)
    n, d = points.shape     # number of data points, dimensionality
    k = len(cents)          # number of clusters

    # Broadcast (1, n, d) against (k, 1, d) to get every point/center pair.
    diffs = points.reshape(1, n, d) - cents.reshape(k, 1, d)
    sq_dists = (diffs ** 2).sum(axis=2)     # shape (k, n)

    centerids = sq_dists.argmin(axis=0)     # closest center per point
    loss = sq_dists.min(axis=0).sum()       # total squared distance

    return centerids,loss

In [3]:
def compute_means(data, centerids, k):
  """Recompute the k cluster centers from the current assignment.

  Parameters
  ----------
  data : array-like of shape (n, d)
      The data points, one per row.
  centerids : array-like of shape (n,)
      Cluster index (0..k-1) assigned to each data point.
  k : int
      Number of clusters.

  Returns
  -------
  centers : ndarray of shape (k, d)
      Row i is the mean of the points assigned to cluster i.  A cluster
      with no points is re-seeded with a randomly chosen data point.
  """
  points = np.asarray(data)
  labels = np.asarray(centerids)
  n, d = points.shape   # number of data points, dimensionality

  centers = np.zeros(shape=(k,d))

  for i in range(k):
    # Gather the data points from cluster i
    members = points[labels == i]

    if len(members) == 0:
      # Empty cluster: re-seed it with a random data point.  randint's upper
      # bound is exclusive, so randint(n) covers every index 0..n-1.
      centers[i] = points[np.random.randint(n)]
    else:
      # cluster mean from averaging
      centers[i] = members.mean(axis=0)
  return centers

In [3]:
def kmeans(data, k):
  """Cluster `data` into `k` groups by alternating assignment and re-centering.

  Parameters
  ----------
  data : array-like of shape (n, d)
      The data points, one per row.
  k : int
      Number of clusters.

  Returns
  -------
  (centers, loss) : tuple
      `centers` is the (k, d) array returned by the last `compute_means`
      call; `loss` is the value returned by the last `assign_data` call.
  """
  # Positional indexing below needs an ndarray.
  data = np.asarray(data)
  n = len(data)

  # Seed the centers with k distinct data points.  Drawing with replacement
  # can pick the same point twice, which starts with duplicate centers; only
  # fall back to replacement when there are fewer points than clusters.
  # np.random is used so this cell only depends on its own numpy import.
  seeds = np.random.choice(n, size=k, replace=k > n)
  centers = data[seeds]

  # loop until the loss is identical on two consecutive passes
  prev_loss = None
  while True:
    centerids, loss = assign_data(data,centers)
    centers = compute_means(data, centerids, k)
    if prev_loss is not None and loss == prev_loss:
      break
    prev_loss = loss
  return(centers, loss)

We will download the MNIST dataset and split the data into training data, `X_train` and `y_train` and test data, `X_test` and `y_test`.

In [None]:
from sklearn.datasets import fetch_openml
# as_frame=False makes fetch_openml return plain NumPy arrays.  The rest of
# this notebook indexes rows positionally (data[0], data[[...]], y[labids]),
# which does not work on the pandas objects returned by default in recent
# scikit-learn versions.
data = fetch_openml(name='mnist_784', as_frame=False)

import numpy as np
from sklearn.model_selection import train_test_split
# First split: hold out 10% of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.1)
# Second split: only the 10% hold-out is split again (67% train / 33% test);
# the other 90% is discarded.
# NOTE(review): presumably a deliberate subsample to keep k-means fast - confirm.
X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.33)
len(X_train)

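Here, we run `kmeans` on our `X_train` data where `k=10`.  We run `kmeans` 10 times (one initial run plus 9 repetitions) and keep the `bestcenters`, i.e. the centers with the lowest loss (`bestloss`) among the recorded losses.  We then label each cluster using the first `nlabeled` training examples and find the accuracy of these labeled centers on the test set.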

In [None]:
from scipy import stats
import math
from functools import reduce
import random

# Semi-supervised evaluation: cluster X_train with k-means, name each cluster
# using only the first `nlabeled` labeled training examples, and compare the
# test accuracy against a nearest-labeled-neighbor baseline that uses the same
# labeled examples.
nlabeled = 20   # number of labeled training examples used
print(nlabeled)
ans = []
k = 10 # 2 # 5 # 20

# k-means is randomly initialised, so run it 10 times and keep the lowest loss.
bestcenters, bestloss = kmeans(X_train, k)
for rep in range(9):
  centers, loss = kmeans(X_train, k)
  if loss < bestloss: bestcenters, bestloss = centers, loss

# Assign testing points to clusters
test_centerids, loss = assign_data(X_test, bestcenters)

# Use the labeled examples to label the clusters
train_centerids, loss = assign_data(X_train[:nlabeled], bestcenters)
labs = np.asarray(y_train[:nlabeled])

# Every cluster starts with the first label as a default; clusters that
# received no labeled example keep it.
clust_labs = np.repeat(labs[0],k)
for i in range(k):
  members = labs[train_centerids == i]
  if len(members) > 0:
    # Most frequent label in the cluster.  np.unique returns the values
    # sorted, so ties resolve to the smallest label, and unlike
    # scipy.stats.mode it also accepts non-numeric labels.
    values, counts = np.unique(members, return_counts=True)
    clust_labs[i] = values[np.argmax(counts)]

# Accuracy of the cluster labels on the test set
ans = ans + [(k,sum(clust_labs[test_centerids] == y_test)/len(y_test))]

# Report the (k, accuracy) pair with the highest accuracy
print(reduce((lambda x, y: x if x[1] > y[1] else y), ans))

# Baseline: give each test point the label of its closest labeled example
labids, loss = assign_data(X_test, X_train[:nlabeled])
print(nlabeled, sum(y_train[labids] == y_test)/len(y_test))

We'll next print the images that best represent the centers of each of our clusters in K-means and the label for each of the clusters

We will also calculate the percent accuracy of the clusters

In [None]:
!pip install keras=='2.3.1'
from keras.preprocessing.image import array_to_img

# Map every training and test point to its closest k-means center.
train_centerids, loss = assign_data(X_train, bestcenters)
test_centerids, loss = assign_data(X_test, bestcenters)

# Show each center as a 28x28 image and print the label given to its cluster:
# the label of the first training point assigned to that cluster.
# (Taking the most frequent label instead was tried and left disabled.)
clust_labs = np.repeat(labs[0],k)
for i, center in enumerate(bestcenters):
  center_image = array_to_img(np.reshape(center,(28,28,1)), scale=False)
  display(center_image)
  member_labels = y_train[train_centerids == i]
  clust_labs[i] = member_labels[0]
  print(clust_labs[i])

# Fraction of test points whose cluster label matches the true label
n_correct = sum(clust_labs[test_centerids] == y_test)
n_correct/len(y_test)

Finally, we'll rewrite the K-means model as an active learning problem and perform semi-supervised clustering of the data

In [None]:
from scipy import stats
import math
from functools import reduce

# ACTIVE LEARNING VERSION
#
# Cluster X_train into k groups, take one label per cluster (the label of the
# first training point assigned to it), and compare test accuracy against a
# nearest-neighbor baseline that uses the first k labeled training points.

nlabeled = 10
ans = []
k = 50 # 10 # 2 # 5 # 20

# Best of 10 k-means runs (lowest loss wins)
bestcenters, bestloss = kmeans(X_train, k)
for rep in range(9):
  centers, loss = kmeans(X_train, k)
  if loss < bestloss:
    bestcenters, bestloss = centers, loss

# Assign testing points to clusters
test_centerids, loss = assign_data(X_test, bestcenters)

# Let's label one example in each category
train_centerids, loss = assign_data(X_train, bestcenters)

clust_labs = np.repeat(labs[0],k)
for i, _center in enumerate(bestcenters):
  first_member_label = y_train[train_centerids == i][0]
  clust_labs[i] = first_member_label

# semi-supervised clustering
cluster_hits = sum(clust_labs[test_centerids] == y_test)
print(k,cluster_hits/len(y_test))

# nearest neighbors
labids, loss = assign_data(X_test, X_train[:k])
neighbor_hits = sum(y_train[labids] == y_test)
print(k, neighbor_hits/len(y_test))