# Initialisation
Load Spark and the jar containing the ExtendedBlockMatrix, then start the Spark session.

In [None]:
import $ivy.`org.apache.spark::spark-sql:3.0.1`
import $ivy.`org.apache.spark::spark-mllib:3.0.1`

In [None]:
// Absolute, canonical path of the directory the kernel is running in.
val currentDirectory = new java.io.File(".").getCanonicalPath
// Location of the jar containing the ExtendedBlockMatrix, under ./lib.
val path = java.nio.file.Paths.get(currentDirectory, "lib", "sparktensordecomposition_2.12-0.1.jar")
// Wrap it as an Ammonite path and add the jar to the interpreter classpath.
val x = ammonite.ops.Path(path)
interp.load.cp(x)

In [None]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.mllib.linalg.distributed.ExtendedBlockMatrix
import org.apache.spark.mllib.linalg.distributed.ExtendedBlockMatrix._
import tensordecomposition._
import tensordecomposition.CPALS._

In [None]:
// Spark session used by the whole notebook. It is declared implicit so that it is
// picked up by functions taking an `(implicit spark: SparkSession)` parameter list.
// The type is annotated explicitly: implicit definitions should not rely on inference.
//
// NOTE(review): the session runs with master "local[*]", i.e. inside the kernel's JVM.
// According to the Spark configuration documentation, spark.driver.memory cannot be
// changed from application code once the driver JVM has started, so these memory
// settings may have no effect here -- confirm how the kernel JVM heap is sized.
implicit val spark: SparkSession = {
    val maxMemory = "126g"
    SparkSession.builder()
        .config("spark.executor.memory", maxMemory)
        .config("spark.driver.memory", maxMemory)
        .appName("BenchmarkSparkCPALS")
        .master("local[*]")
        .getOrCreate()
}

// Only log errors, and store RDD checkpoints in the local "Checkpoint" directory.
spark.sparkContext.setLogLevel("ERROR")
spark.sparkContext.setCheckpointDir("Checkpoint")
import spark.implicits._

// Display the effective configuration as the cell output.
spark.sparkContext.getConf.getAll

# CP-ALS
Create the function used to perform the CP decomposition.

In [None]:
/**
 * Performs a CP decomposition of `tensor` with an alternating least squares scheme.
 *
 * For each dimension in turn, the factor matrix is recomputed as
 * `MTTKRP * pinverse(V)`, where `V` is the Hadamard product of the Gram matrices
 * (`A^T * A`) of the factor matrices of all the other dimensions. Each column `k`
 * of the new factor matrix is then divided by `lambda(k)`, the value returned by
 * `norm()` for that matrix.
 *
 * @param tensor        the tensor to decompose; it must have at least 2 dimensions
 * @param rank          the number of columns of each factor matrix
 * @param maxIterations the number of passes over all the dimensions; at least one
 *                      pass is always executed
 * @param spark         the Spark session (not referenced directly in this body)
 * @return the factor matrices (one per dimension) and the `lambda` computed for the
 *         last updated dimension, wrapped in a [[Kruskal]]
 * @throws IllegalArgumentException if the tensor has fewer than 2 dimensions
 */
def computeCPALS(tensor: Tensor, rank: Int, maxIterations: Int = 5)
        (implicit spark: SparkSession): Kruskal = {
    require(tensor.order >= 2, s"The tensor must have at least 2 dimensions, found ${tensor.order}")

    val tensorMatricized = tensor.matricization()
    // cache() and checkpoint() are only called for their side effects, so use foreach
    // rather than building (and discarding) a mapped collection.
    tensorMatricized.foreach(m => {
        val mc = m.cache()
        mc.checkpoint()
    })
    val result = new Array[ExtendedBlockMatrix](tensor.order)
    var lambda = new Array[Double](tensor.order)
    // Randomized initialization. The factor of dimension 0 is deliberately left
    // uninitialized (null): it is the first one computed by the loop below.
    for (i <- 1 until tensor.order) {
        result(i) = ExtendedBlockMatrix.gaussian(tensor.dimensionsSize(i), rank)
    }
    // V is updated for each dimension rather than recalculated.
    // It starts as the Hadamard product of the Gram matrices of every initialized factor.
    // The Gram matrix is computed exactly once per factor: reducing the factors directly
    // with a "Gram of both operands" function is only correct for two factors, because
    // the accumulated value is itself already a product of Gram matrices.
    val firstGram: ExtendedBlockMatrix = result(1).transpose.multiply(result(1))
    var v = (2 until result.size).foldLeft(firstGram)((acc, k) =>
        acc.hadamard(result(k).transpose.multiply(result(k))))
    var termination = false
    var nbIterations = 1
    while (!termination) {
        println("iteration " + nbIterations)
        for (i <- 0 until tensor.order) {
            // Remove current dimension from V (element-wise division by its Gram matrix).
            // A null factor has never been included in V, so there is nothing to remove.
            if (result(i) != null) {
                v = v.hadamard(result(i).transpose.multiply(result(i)), (m1, m2) => m1 /:/ m2)
            }
            // Compute MTTKRP from the factors and sizes of all the other dimensions
            val mttkrp = ExtendedBlockMatrix.mttkrp(tensorMatricized(i),
                    (for (k <- 0 until result.size if i != k) yield result(k)).toArray,
                    (for (k <- 0 until tensor.dimensionsSize.size if i != k) yield tensor.dimensionsSize(k)).toArray,
                    tensor.dimensionsSize(i),
                    rank
                )
            result(i) = mttkrp.multiply(v.pinverse())

            // Compute lambda and normalize each column of the factor with it
            lambda = result(i).norm()
            result(i) = result(i).applyOperation(m => {
                for (k <- 0 until rank) {
                    m(::,k) := m(::,k) / lambda(k)
                }
                m
            })

            // Update of V: put the refreshed dimension back
            v = v.hadamard(result(i).transpose.multiply(result(i)))
        }

        if (nbIterations >= maxIterations) {
            termination = true
        } else {
            nbIterations += 1
        }
    }
    Kruskal(result, lambda)
}

# Data loading
Load the CSV files previously created.

In [None]:
import java.io.File

// Names of the tensor files found in the "sample_tensors" directory
// (files whose name contains "clusters" are ignored).
// File.listFiles returns null when the directory does not exist or cannot be read;
// wrapping it in Option yields an empty list instead of a NullPointerException.
val tensorsFiles = Option(new File("sample_tensors").listFiles)
    .getOrElse(Array.empty[File])
    .map(_.getName)
    .filter(f => f.startsWith("tensor") && !f.contains("clusters")).toList

In [None]:
// Maps each tensor file name to the tensor loaded from that file.
// After removing ".csv" and "tensor_", the name is split on "_": the first part is
// read as the number of dimensions and the second part as the size of each dimension.
val tensors = tensorsFiles.map { tensorFile =>
    val name = tensorFile.replace(".csv", "").replace("tensor_", "").split("_")
    val nbDimensions = name(0).toInt
    val size = name(1).toLong
    val data = spark.read.option("header", true).csv(s"sample_tensors/$tensorFile")
    // Every dimension is given the same size.
    tensorFile -> Tensor.fromIndexedDataFrame(data, List.fill(nbDimensions)(size))
}.toMap

# Run CP
Run the CP decomposition on each loaded tensor, and measure the average execution time.

In [None]:
import scala.collection.mutable.{Map => MMap}

In [None]:
// Average execution time of the CP decomposition in milliseconds, indexed by
// number of dimensions, then size of the dimensions, then sparsity.
var timeCPALSCoordinateMatrix = MMap[Int, MMap[Int, MMap[Double, Int]]]()

for (dimension <- 3 to 5; 
     size <- List(100, 1000, 10000, 100000);
     sparsity <- List(1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10)) {
    val fileName = s"tensor_${dimension}_${size}_${sparsity}.csv"
    // Only the combinations for which a tensor has been loaded are measured.
    tensors.get(fileName).foreach { tensor =>
        println(fileName)
        val nbIterations = 5
        var endTime = 0
        for (_ <- 0 until nbIterations) {
            val startTime = System.currentTimeMillis()

            // NOTE(review): this previously called computeCPALSWithCoordinateMatrix, which is
            // not defined in this notebook; computeCPALS is the function defined above.
            // Confirm that it is the implementation meant to be benchmarked.
            computeCPALS(tensor, 3)

            endTime += (System.currentTimeMillis() - startTime).toInt

            // Cumulative time of the runs executed so far for this tensor.
            println("Execution time: " + (endTime / 1000) + "s")
        }
        val finalTime = endTime / nbIterations
        // Create the nested maps on first use, then record the average time in place.
        timeCPALSCoordinateMatrix
            .getOrElseUpdate(dimension, MMap[Int, MMap[Double, Int]]())
            .getOrElseUpdate(size, MMap[Double, Int]())
            .update(sparsity, finalTime)
    }
}

# Results

## Plots

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.7.2`

import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

In [None]:
// One figure per number of dimensions, with one trace per dimension size.
// x: sparsity * size^dimension (labelled "nnz"), y: measured time; both axes are logarithmic.
timeCPALSCoordinateMatrix.foreach { case (dimension, timesBySize) =>
    val traces: Seq[Scatter] = timesBySize.keys.toSeq.sorted.map { size =>
        val timesBySparsity = timesBySize(size)
        // Sparsities in decreasing order.
        val sparsities = timesBySparsity.keys.toSeq.sorted.toList.reverse
        val nnz = sparsities.map(_ * math.pow(size, dimension))
        val times = sparsities.map(s => timesBySparsity(s))
        Scatter(nnz, times, name = s"Size $size")
    }
    traces.plot(title = s"Number of dimensions: $dimension", 
          xaxis = Axis(title = "nnz", `type` = AxisType.Log),
          yaxis = Axis(title = "Time", `type` = AxisType.Log))
}