# Initialisation

In [None]:
import $ivy.`org.apache.spark::spark-sql:3.0.1`
import $ivy.`org.apache.spark::spark-mllib:3.0.1`

In [None]:
// Resolve the project jar shipped under ./lib (relative to the directory the
// kernel was started from) and load it onto the notebook session's classpath.
val currentDirectory = new java.io.File(".").getCanonicalPath
val path = java.nio.file.Paths.get(
    s"$currentDirectory/lib/sparktensordecomposition_2.12-0.1.jar"
)
// Ammonite's loader expects its own Path type, so wrap the java.nio path first.
val x = ammonite.ops.Path(path)
interp.load.cp(x)

In [None]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.mllib.linalg.distributed.{ExtendedBlockMatrix, IndexedRowMatrix}
import org.apache.spark.mllib.linalg.distributed.ExtendedBlockMatrix._
import tensordecomposition._
import tensordecomposition.CPALS._

In [None]:
// Spark session shared by every cell of this notebook, running in local mode
// on all available cores.
// It is declared implicit (presumably so library calls taking an implicit
// SparkSession can pick it up -- confirm against the tensordecomposition API).
// The explicit type annotation keeps implicit resolution predictable.
implicit val spark: SparkSession = {
    val maxMemory = "126g"
    // NOTE(review): per the Spark configuration docs, `spark.driver.memory` set
    // through the builder has no effect in client/local mode because the driver
    // JVM is already running at this point. To really get this heap size, pass
    // it to the JVM that hosts the kernel instead.
    SparkSession.builder()
        .config("spark.executor.memory", maxMemory)
        .config("spark.driver.memory", maxMemory)
        .appName("BenchmarkSparkCPALS")
        .master("local[*]")
        .getOrCreate()
}

// Only show errors in the cell output.
spark.sparkContext.setLogLevel("ERROR")
// Directory (relative to the working directory) used for checkpoints.
spark.sparkContext.setCheckpointDir("Checkpoint")
import spark.implicits._

// Last expression of the cell: display the effective configuration.
spark.sparkContext.getConf.getAll

# Data loading

In [None]:
import java.io.File

// Names (not full paths) of the tensor files found in ./sample_tensors:
// entries whose name starts with "tensor" and does not contain "clusters".
// `File.listFiles` returns null when the directory is missing or unreadable,
// so it is wrapped in Option instead of failing with a NullPointerException;
// in that case a warning is printed and the list is empty.
val tensorsFiles = Option(new File("sample_tensors").listFiles)
    .getOrElse {
        Console.err.println(
            "Warning: directory 'sample_tensors' is missing or unreadable; no tensor file listed")
        Array.empty[File]
    }
    .map(_.getName)
    .filter(f => f.startsWith("tensor") && !f.contains("clusters"))
    .toList

In [None]:
// Load every listed CSV file into a Tensor, keyed by its file name.
// The file name is parsed as `tensor_<nbDimensions>_<size>_...csv`: the first
// two underscore-separated fields (after removing "tensor_" and ".csv") give
// the number of dimensions and the size used for every dimension.
// A name that does not follow this pattern makes `toInt`/`toLong` throw.
val tensors = tensorsFiles.map { tensorFile =>
    val nameParts = tensorFile.replace(".csv", "").replace("tensor_", "").split("_")
    val nbDimensions = nameParts(0).toInt
    val size = nameParts(1).toLong
    // Index columns are named d0 .. d<nbDimensions - 1>.
    val indexColumns = (0 until nbDimensions).map(i => s"d$i")
    // Keep a single row per combination of index columns.
    val data = spark.read
        .option("header", true)
        .csv(s"sample_tensors/$tensorFile")
        .dropDuplicates(indexColumns)
    tensorFile -> Tensor.fromIndexedDataFrame(data, List.fill(nbDimensions)(size))
}.toMap

# Run CP

In [None]:
import scala.collection.mutable.{Map => MMap}

In [None]:
// Average CP-ALS run time in milliseconds, indexed as
// dimension -> size -> sparsity. Read by the CSV export cell.
var timeCPALSCoordinateMatrix = MMap[Int, MMap[Int, MMap[Double, Int]]]()

for (dimension <- 3 to 3;
     size <- List(100, 1000, 10000, 100000);
     sparsity <- List(1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10).reverse) {
    // NOTE(review): the sparsity is rendered with Double.toString, so values
    // below 1e-3 appear as e.g. "1.0E-4"; file names must use that format.
    val fileName = s"tensor_${dimension}_${size}_${sparsity}.csv"
    // Only benchmark the combinations for which a tensor file was loaded.
    tensors.get(fileName).foreach { loaded =>
        println(fileName)
        val nbIterations = 5
        var totalMillis = 0
        for (_ <- 0 until nbIterations) {
            // Start every run from an empty cache.
            spark.catalog.clearCache()
            val tensor = new Tensor(loaded.data.cache(),
                loaded.order,
                loaded.dimensionsSize,
                loaded.dimensionsName,
                loaded.dimensionsIndex)

            // Run an action on the cached data before starting the clock so
            // that this work is not part of the measured time.
            tensor.data.count()
            // nanoTime is monotonic, unlike the wall clock.
            val startTime = System.nanoTime()

            tensor.runCPALS(3, 5, 0.9999999999999, true)

            val runMillis = ((System.nanoTime() - startTime) / 1000000L).toInt
            totalMillis += runMillis

            // Report the duration of this run only (not the running total).
            println("Execution time: " + (runMillis / 1000) + "s")
        }
        val finalTime = totalMillis / nbIterations
        // Create the nested maps on first use and record the average in place.
        timeCPALSCoordinateMatrix
            .getOrElseUpdate(dimension, MMap[Int, MMap[Double, Int]]())
            .getOrElseUpdate(size, MMap[Double, Int]())
            .update(sparsity, finalTime)
    }
}

# Results

## Export results to CSV

In [None]:
import $ivy.`com.github.tototoshi::scala-csv:1.3.6`
import com.github.tototoshi.csv._

In [None]:
// Append the benchmark results to results/timeCPALSMuLOT.csv, one row per
// (dimension, size, sparsity) entry. The header row is written only when the
// file did not exist before this cell ran.
val f = new java.io.File("results/timeCPALSMuLOT.csv")
// Make sure the output directory exists, otherwise opening the writer fails.
Option(f.getParentFile).foreach(_.mkdirs())
// Must be evaluated before the writer is opened, since opening creates the file.
val fileExists = f.exists()
val writer = CSVWriter.open(f, append = true)
try {
    if (!fileExists) {
        writer.writeRow(List[String]("dimension", "size", "sparsity", "time"))
    }
    for ((dimension, r1) <- timeCPALSCoordinateMatrix; (size, r2) <- r1; (sparsity, time) <- r2) {
        val row = List[Any](dimension, size, sparsity, time)
        // Echo each row in the cell output as well.
        println(row)
        writer.writeRow(row)
    }
} finally {
    // Always release the file handle, even if writing a row fails.
    writer.close()
}