# Initialisation

In [None]:
import $ivy.`org.apache.spark::spark-core:1.5.2`
import $ivy.`org.apache.spark::spark-mllib:1.5.2`

In [None]:
// Resolve lib/cstf_2.11-0.1.jar relative to the kernel's working directory and
// add it to the interpreter classpath.
val currentDirectory = new java.io.File(".").getCanonicalPath
val path = java.nio.file.FileSystems.getDefault()
    .getPath(currentDirectory, "lib", "cstf_2.11-0.1.jar")
val x = ammonite.ops.Path(path)
interp.load.cp(x)

In [None]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.paramath.CSTF._
import org.paramath.CSTF.utils.CSTFUtils

In [None]:
// Spark context used by the whole notebook: local mode on all available cores.
val MAX_MEMORY = "126g"
// NOTE(review): per the Spark configuration docs, spark.driver.memory has no effect
// when set through SparkConf once the driver JVM is already running (the case here);
// the heap size presumably has to be configured when the kernel JVM is launched.
// NOTE(review): the very large heartbeat interval presumably prevents long-running
// tasks from being reported as lost -- confirm it is still required.
val sparkConf: SparkConf = new SparkConf()
        .setMaster("local[*]")
        .setAppName("BenchmarkCSTF")
        .set("spark.executor.memory", MAX_MEMORY)
        .set("spark.driver.memory", MAX_MEMORY)
        .set("spark.executor.heartbeatInterval", "10000000000")
// Explicit type on the implicit so that implicit resolution does not depend on
// type inference of the right-hand side.
implicit val sc: SparkContext = new SparkContext(sparkConf)
sc.setLogLevel("ERROR")

# Functions to run CP-ALS with CSTF

In [None]:
/**
 * Parses an RDD of text lines into an RDD of dense MLlib vectors.
 *
 * Each line is split on tab characters and every field is converted to a Double.
 *
 * @param rdd lines of tab-separated numeric values
 * @param sc  implicit Spark context; not used by the body, kept for the existing signature
 * @return one dense vector per input line
 */
def RDDStringToRDDVector(rdd: RDD[String])(implicit sc: SparkContext): RDD[Vector] =
    rdd.map { line =>
        val values = line.split("\t").map(_.toDouble)
        Vectors.dense(values)
    }

# Data loading

In [None]:
/**
 * A tensor kept as raw text lines together with metadata about it.
 *
 * @param rdd            the tensor's text lines, one String per line
 * @param order          number of dimensions of the tensor
 * @param dimensionsSize size associated with the tensor (presumably the length of
 *                       each dimension -- confirm against the data generator)
 */
case class TensorRDD(
    rdd: RDD[String],
    order: Int,
    dimensionsSize: Int
)

In [None]:
import java.io.File

// Names of the files in sample_tensors_HaTen that start with "tensor" and do not
// contain "clusters".
// File.listFiles returns null (rather than throwing) when the path is not a directory
// or cannot be read, so fail here with an explicit message instead of a
// NullPointerException further down the chain.
val tensorsFiles = Option(new File("sample_tensors_HaTen").listFiles)
    .getOrElse(throw new java.io.FileNotFoundException(
        "'sample_tensors_HaTen' is not a directory or could not be read"))
    .map(_.getName)
    .filter(f => f.startsWith("tensor") && !f.contains("clusters")).toList

In [None]:
// Map from file name to its TensorRDD. The number of dimensions and the size are
// taken from the first two "_"-separated fields of the file name once the ".csv"
// suffix and the "tensor_" prefix have been removed.
val tensors = tensorsFiles.map { tensorFile =>
    val fields = tensorFile.replace(".csv", "").replace("tensor_", "").split("_")
    val nbDimensions = fields(0).toInt
    val size = fields(1).toLong
    val lines = sc.textFile(s"sample_tensors_HaTen/$tensorFile")
    tensorFile -> TensorRDD(lines, nbDimensions, size.toInt)
}.toMap

# Run CP

In [None]:
import scala.collection.mutable.{Map => MMap}

In [None]:
// Benchmark results: dimension -> size -> sparsity -> mean CP_ALS time in milliseconds.
var timeCPALSCSTF = MMap[Int, MMap[Int, MMap[Double, Int]]]()

for (dimension <- 3 to 3;
     size <- List(100, 1000, 10000, 100000);
     sparsity <- List(1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10).reverse) {
    val fileName = s"tensor_${dimension}_${size}_${sparsity}.csv"
    // Only combinations for which a tensor file was actually loaded are benchmarked.
    tensors.get(fileName).foreach { loaded =>
        println(fileName)
        val nbIterations = 1
        // Cumulative elapsed time in milliseconds, kept as a Long so that long runs
        // are not truncated while accumulating.
        var totalMs = 0L
        for (_ <- 0 until nbIterations) {
            val tensor = RDDStringToRDDVector(loaded.rdd)
            // NOTE(review): the RDD is not persisted, so this count() does not keep the
            // parsed data around; the timed call below presumably re-reads and re-parses
            // the input. Left unchanged so new timings stay comparable with rows already
            // appended to the results file.
            tensor.count()

            // nanoTime is monotonic, unlike currentTimeMillis, which follows the wall clock.
            val startTime = System.nanoTime()

            // NOTE(review): the arguments 5, 3, 1e-100 are presumably rank, maximum number
            // of iterations and tolerance -- confirm against the CSTF API.
            COOGeneralizedSingleVec.CP_ALS(tensor, 5, 3, 1e-100, sc)
            //COOGeneralizedRowMatrix.CP_ALS(tensor, 5, 3, 1e-100, sc)
            //COOGeneralized.CP_ALS(tensor, 5, 3, 1e-100, sc)

            totalMs += (System.nanoTime() - startTime) / 1000000L

            println("Execution time: " + (totalMs / 1000) + "s")
        }
        val finalTime = (totalMs / nbIterations).toInt
        // Update the nested mutable maps in place, creating missing levels on demand.
        timeCPALSCSTF
            .getOrElseUpdate(dimension, MMap[Int, MMap[Double, Int]]())
            .getOrElseUpdate(size, MMap[Double, Int]())
            .update(sparsity, finalTime)
    }
}

# Results

## Export to CSV

In [None]:
import $ivy.`com.github.tototoshi::scala-csv:1.3.6`
import com.github.tototoshi.csv._

In [None]:
// Append the benchmark results to results/benchmarkCSTF.csv, writing the header row
// only when the file did not exist before this cell ran.
val f = new java.io.File("results/benchmarkCSTF.csv")
// Create the output directory if needed so that results are not lost when opening fails.
Option(f.getParentFile).foreach(_.mkdirs())
val fileExists = f.exists()
val writer = CSVWriter.open(f, append = true)
try {
    if (!fileExists) {
        writer.writeRow(List[String]("dimension", "size", "sparsity", "time"))
    }
    for ((dimension, r1) <- timeCPALSCSTF; (size, r2) <- r1; (sparsity, time) <- r2) {
        val row = List[Any](dimension, size, sparsity, time)
        println(row)
        writer.writeRow(row)
    }
} finally {
    // Always release the file handle, even if writing a row fails.
    writer.close()
}