# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

In [None]:
%%classpath add mvn
org.clustering4ever clustering4ever_2.11 0.9.6

In [None]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.4.3

In [None]:
%%classpath add mvn
org.apache.sanselan sanselan 0.97-incubator

In [None]:
%%classpath add mvn
com.github.haifengl smile-scala_2.11 1.5.0

# Scala _K_-Modes

import scala.collection.mutable
import smile.plot._
import org.clustering4ever.clustering.kcenters.scala.KModes
import org.clustering4ever.math.distances.binary.Hamming
import org.clustering4ever.clusterizables.EasyClusterizable
import org.clustering4ever.vectors.BinaryVector
import scala.io.Source
import smile.feature.Standardizer
import smile.data.NumericAttribute
import smile.data.Attribute.Type._
import smile.plot._
import smile.projection._
import smile.manifold._
import java.awt.Color
import org.clustering4ever.vectorizables.Vectorizable
import org.clustering4ever.clusterizables.EasyClusterizable
import org.clustering4ever.vectors.ScalarVector

## Download the Digits dataset

In [None]:
%%bash
# Fetch the Digits feature file and its labels file into /tmp/, one wget call per URL.
for url in \
  http://www.clustering4ever.org/Datasets/Digits/digits.csv \
  http://www.clustering4ever.org/Datasets/Digits/labels
do
  wget -P /tmp/ "$url"
done

## Import and format data

In [None]:
// Neither value is read in this cell; they are kept because other cells may refer to them.
// NOTE(review): presumably the expected number of rows / features — confirm or remove.
val datasetSize = 50000
val dim = 10
val path = "/tmp/digits.csv"

// Parse every CSV line into an Array[Int]: K-Modes with the Hamming metric operates on
// binary (integer) vectors, not on doubles. Going through toDouble first accepts both the
// "1" and the "1.0" spelling of a value. The Source is closed once all lines have been
// materialised so the file handle does not leak.
val rawData = {
  val source = Source.fromFile(path)
  try source.getLines.map(_.split(",").map(_.toDouble.toInt)).toArray
  finally source.close()
}

// Wrap each row into a clusterizable whose ID is its row index, stored as a BinaryVector
// so that it matches the binary metric handed to KModes. The collection is made parallel.
val parData = rawData.zipWithIndex.par.map{ case (v, id) => EasyClusterizable(id.toLong, new BinaryVector(v)) }

// Path of the downloaded labels file; it is only declared here, not loaded.
val groundTrueLabels = "/tmp/labels"

## Parameters 

In [None]:
// First argument given to KModes — presumably the number of clusters (modes) to look for.
val k: Int = 10
// Iteration budget given to KModes.
val iterMax: Int = 100
// Threshold given to KModes — NOTE(review): presumably the minimal center shift used as
// stopping criterion; confirm against the KModes documentation.
val epsilon: Double = 0.1
// Hamming metric used to compare vectors.
val metric1 = new Hamming()

## Run the algorithm

In [None]:
// Time the training phase with the JVM nanosecond timer.
val t1 = System.nanoTime()

// Fit K-Modes on the parallel collection of clusterizables.
val model = new KModes(k, metric1, epsilon, iterMax).fit(parData)

val t2 = System.nanoTime()

// Elapsed training time in seconds; as the last expression of the cell it is displayed.
(t2 - t1).toDouble / 1e9

## Different ways to apply the model to datasets

In [None]:
// Raw vectors: predict one vector at a time and keep it next to its prediction,
// or hand the whole collection to the model in a single call.
val clusterized11 = for (vector <- rawData) yield (model.centerPredict(vector), vector)
val clusterized12 = model.centerPredict(rawData)

// Clusterizables (parallel collection): same two flavours, predicting from each
// element's vector `v` or passing the collection as a whole.
val clusterized21 = for (clusterizable <- parData) yield (model.centerPredict(clusterizable.v), clusterizable)
val clusterized22 = model.centerPredict(parData)

### Transform binary data into a grid to visualize it. Here a 15 x 16 grid for digits data

In [None]:
// Materialise the model's centers as an array of (cluster ID, center) pairs.
val centroids = model.centers.toArray
// Convert each center's components to Double and cut them into consecutive rows of
// 15 values, giving one 2-D grid per centroid.
val formatedCentroids = centroids.map { case (_, center) =>
  center.vector.map(_.toDouble).grouped(15).map(_.toArray).toArray
}

### Visualize centroids

In [None]:
// Draw one heat map per centroid grid with a two-colour palette.
for (grid <- formatedCentroids) hexmap(grid, Array(Palette.BLACK, Palette.LIGHT_GRAY))

## Standardize the data and apply PCA to it

In [None]:
// One numeric attribute per feature column, named after the column index.
val types = for( i <- (0 until parData.head.v.vector.size).toArray ) yield new NumericAttribute(i.toString)

val standardize = new Standardizer

// Feature vectors converted to Double, as required by the smile API.
val dataAsDouble = parData.map(_.v.vector.map(_.toDouble))

// Fit the standardizer on the whole dataset.
standardize.learn(types.toArray, dataAsDouble.map(_.toArray).toArray)

// Pair each point's last cluster ID with its *standardized* feature vector. The fitted
// standardizer must actually be applied here: the centroids are passed through
// `standardize.transform` before being projected, so leaving the data unscaled would put
// points and centroids in different spaces.
val standardizedData = clusterized22.map( cz => (cz.clusterIDs.last, standardize.transform(cz.v.vector.map(_.toDouble).toArray)) )

In [None]:
// Feature vectors only (cluster IDs dropped), gathered into a plain array for smile.
val readyToPca = standardizedData.map { case (_, features) => features }.toArray

// Fit PCA on those vectors.
val pcaModel = pca(readyToPca)

// Configure the projection with 3 — presumably the number of principal components kept.
pcaModel.setProjection(3)

// Project every vector while keeping its cluster ID alongside.
val pcaizedData = standardizedData.map { case (clusterID, features) =>
  (clusterID, pcaModel.project(features))
}

In [None]:
// Split the projected points into their coordinates and their cluster IDs (same order).
val pureData = pcaizedData.map { case (_, coordinates) => coordinates }.toArray
val labels = pcaizedData.map { case (clusterID, _) => clusterID }.toArray
// Centroids are converted to Double, passed through `standardize`, then projected with
// `pcaModel`; Int.MaxValue is used as a placeholder label for them.
val pcizedCentroids = centroids.map { case (_, centroid) =>
  val asDoubles = centroid.vector.map(_.toDouble).toArray
  (Int.MaxValue, pcaModel.project(standardize.transform(asDoubles)))
}

In [None]:
// Scatter plot of the PCA-projected points: `labels` holds each point's cluster ID,
// '.' is the marker and Palette.COLORS supplies the colours.
plot(pureData, labels, '.', Palette.COLORS)

In [None]:
// Same scatter plot, kept in `window` so that more points can be drawn on its canvas.
val window = plot(pureData, labels, '.', Palette.COLORS)
// Overlay the projected centroids (labels dropped) as black 'Q' markers, tagged "centroid".
window.canvas.points("centroid", pcizedCentroids.map { case (_, coordinates) => coordinates }, 'Q', Color.BLACK)

## Visualization with t-SNE

In [None]:
// Run t-SNE on the same feature vectors that were fed to PCA.
// NOTE(review): the second argument (3) is presumably the embedding dimension — confirm.
val sne = tsne(readyToPca, 3)

In [None]:
// Scatter plot of the t-SNE coordinates, using the cluster IDs in `labels`.
plot(sne.getCoordinates, labels, '.', Palette.COLORS)

## Include centroids 

In [None]:
// Convert every centroid to Double and pass it through `standardize`;
// Int.MaxValue is used as a placeholder label.
val readyTsneCentroids = centroids.map { case (_, centroid) =>
  val features = centroid.vector.map(_.toDouble).toArray
  (Int.MaxValue, standardize.transform(features))
}

// Data vectors first, centroid vectors appended at the end.
val readyToTsne = readyToPca ++ readyTsneCentroids.map { case (_, features) => features }

In [None]:
// Run t-SNE again, this time on the data vectors followed by the centroid vectors.
// NOTE(review): the second argument (3) is presumably the embedding dimension — confirm.
val sne2 = tsne(readyToTsne, 3)

In [None]:
// The embedding was computed on the data vectors followed by the centroid vectors, so
// split it at the number of data points: the first part holds the data, the remainder
// the centroids. This avoids assuming a fixed number (10) of centroids, which would
// silently pick the wrong rows whenever the number of centers differs.
val (dataTsne, dataCentroids) = sne2.getCoordinates.splitAt(parData.size)

// Plot the embedded data coloured by cluster ID, then overlay the embedded centroids
// as black 'Q' markers tagged "centroid".
val window = plot(dataTsne, labels, '.', Palette.COLORS)
window.canvas.points("centroid", dataCentroids, 'Q', Color.BLACK)