# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

In [None]:
%%classpath add mvn
org.clustering4ever clustering4ever_2.11 0.9.6

In [None]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.4.3

# Scala _K_-Means

In [None]:
import scala.io.Source
import scala.collection.{immutable, mutable, parallel}
import org.clustering4ever.clustering.kcenters.scala.KMeans
import org.clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import org.clustering4ever.clustering.indices.MultiExternalIndicesLocal
import org.clustering4ever.enums.NmiNormalizationNature._
import org.clustering4ever.clusterizables.EasyClusterizable
import org.clustering4ever.vectorizables.Vectorizable
import org.clustering4ever.vectors.ScalarVector

## Download dataset Aggregation

In [None]:
%%bash
# Download the Aggregation dataset and its labels into /tmp.
# -O pins the destination file name, so re-running this cell overwrites the previous
# download instead of saving a numbered copy (aggregation.csv.1, labels.1, ...) that the
# following cells would never read.
wget -O /tmp/aggregation.csv http://www.clustering4ever.org/Datasets/Aggregation/aggregation.csv
wget -O /tmp/labels http://www.clustering4ever.org/Datasets/Aggregation/labels

## Import data as a ParArray of EasyClusterizable (raw rows kept as Array[Array[Double]])

In [None]:
// NOTE(review): datasetSize and dim are not read by any of the visible cells; they are kept
// unchanged in case another cell relies on them.
val datasetSize = 100000
val dim = 10
val path = "/tmp/aggregation.csv"

// Parse every comma-separated line of the file into a row of doubles.
// The iterator is fully consumed by toArray before the file handle is released in `finally`.
val rawData = {
  val source = Source.fromFile(path)
  try source.getLines.map(_.split(",").map(_.toDouble)).toArray
  finally source.close()
}

// Wrap each row into an EasyClusterizable whose id is the row index, on a parallel collection.
val parData = rawData.zipWithIndex.par.map{ case (v, id) => EasyClusterizable(id.toLong, new ScalarVector(v)) }

val groundTruePath = "/tmp/labels"

In [None]:
val path = "/tmp/aggregation.csv"
// Read the CSV rows into a parallel collection of EasyClusterizable (id = row index).
// `.par` copies every line into the parallel collection, so the whole file has been read
// by the time `finally` closes the handle.
val data = {
  val source = scala.io.Source.fromFile(path)
  try {
    source.getLines.toSeq.par
      .map( x => x.split(",").map(_.toDouble)).zipWithIndex
      .map{ case (v, id) => EasyClusterizable(id.toLong, ScalarVector(v)) }
  } finally source.close()
}
val labelsPath = "/tmp/labels"
// One integer label per line. toVector forces the read before the file is closed
// (a lazily evaluated Seq would otherwise try to read from a closed source).
val labels: Seq[Int] = {
  val source = scala.io.Source.fromFile(labelsPath)
  try source.getLines.map(_.toInt).toVector
  finally source.close()
}

## Parameters 

In [None]:
// Numeric settings handed to KMeans in the cells below.
val k: Int = 7
val iterMax: Int = 50
val epsilon: Double = 0.001

// Euclidean's boolean flag: true keeps the square root (the true Euclidean distance),
// false skips the square root.
// val metric0 = new Euclidean[Array[Double]](true)
val metric1 = new Euclidean(false)

// Minkowski(p), with p the Minkowski parameter.
val metric2 = new Minkowski(4)

## Run and measure the algorithm time

In [None]:
// Fit K-Means on the parallel dataset, bracketing the call with nanosecond timestamps.
val t1 = System.nanoTime
val model = new KMeans(k, metric1, epsilon, iterMax).fit(parData)
val t2 = System.nanoTime

// Elapsed time, converted from nanoseconds to seconds.
(t2 - t1) / 1e9

In [None]:
// Fit a second model with the same settings, then apply it to the data it was fitted on.
val rawModel = new KMeans(k, metric1, epsilon, iterMax).fit(parData)

rawModel.centerPredict(parData)

# Compare the algorithm's run time on ParArray and Seq containers

In [None]:
// Time one fit on the parallel collection.
val t1 = System.nanoTime
new KMeans(k, metric1, epsilon, iterMax).fit(parData)
val t2 = System.nanoTime

// Sequential counterpart of the same clusterizables, built outside the timed sections.
val seqData = parData.seq

// Time one fit on the sequential collection. The fit runs on seqData so that both runs
// process the same elements and differ only in the container.
val t3 = System.nanoTime
new KMeans(k, metric1, epsilon, iterMax).fit(seqData)
val t4 = System.nanoTime

// Sequential time divided by parallel time: values above 1 mean the parallel run was faster.
(t4 - t3).toDouble / (t2 - t1)

## Different ways to apply model to datasets

In [None]:
// Alternative calls, left disabled:
//val clusterized1 = model.centerPredict(data.map(_.v))
//val clusterized2 = model.obtainClustering(data)
// Apply the fitted model to the clusterizable dataset; the result is read by the plotting
// cell through `cz.v` and `cz.clusterIDs`.
val clusterized3 = model.centerPredict(data)

## Plot clustering results

In [None]:
// Coordinates and first cluster id of every clusterized point, as plain arrays.
val rawd = clusterized3.map(_.v.vector.toArray).toArray
val labelsPred = clusterized3.map(_.clusterIDs(0)).toArray
val plot = new Plot()

// Add one Points series per predicted cluster.
rawd.zip(labelsPred).groupBy { case (_, clusterId) => clusterId }.values.foreach { members =>
    // Transposing turns the rows of a cluster into one array per coordinate;
    // the pattern expects exactly two coordinates.
    val Array(xx, yy) = members.map { case (coords, _) => coords }.transpose
    plot.add(new Points {
        x = xx
        y = yy
    })
}
plot

## Inspect performance metrics

In [None]:
// Ground-truth labels, one integer per line, loaded into a parallel array.
// toBuffer consumes the whole iterator, so the file can be closed safely in `finally`.
val labels = {
  val labelSource = Source.fromFile(labelsPath)
  try labelSource.getLines.map(_.toInt).toBuffer.toParArray
  finally labelSource.close()
}

In [None]:
// Pair every predicted cluster id with the label at the same position and build the
// external indices from those pairs.
val indices = MultiExternalIndicesLocal(labelsPred.zip(labels))
// NOTE(review): presumably the sqrt-normalised NMI, going by the accessor name; confirm.
val nmi = indices.nmiSQRT

In [None]:
// NOTE(review): presumably the adjusted Rand index of the predicted/true label pairs
// held by `indices`; confirm against the library.
val arand = indices.arand