# Initialisation

In [None]:
import $ivy.`org.apache.spark::spark-sql:2.2.0`
import $ivy.`org.apache.spark::spark-mllib:2.2.0`

In [None]:
// Locate the SamBaTen jar under ./lib (relative to the notebook's working directory)
// and hand it to the Ammonite interpreter so that `edu.ucr.sambaten` can be imported.
val currentDirectory = new java.io.File(".").getCanonicalPath
val path = java.nio.file.Paths.get(s"$currentDirectory/lib/sambaten_2.11-0.1.jar")
val x = ammonite.ops.Path(path)
interp.load.cp(x)

In [None]:
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}
import edu.ucr.sambaten._

In [None]:
// Spark session used by every following cell; it runs in local mode on all available cores.
implicit val spark = {
    // The same memory setting is applied to the executor and the driver.
    val memoryLimit = "126g"
    val namedLocalBuilder = SparkSession.builder()
        .master("local[*]")
        .appName("BenchmarkSamBaTen")
    namedLocalBuilder
        .config("spark.executor.memory", memoryLimit)
        .config("spark.driver.memory", memoryLimit)
        .getOrCreate()
}

// Only report errors in the Spark logs.
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._

// Last expression of the cell: the effective Spark configuration, shown as the cell output.
spark.sparkContext.getConf.getAll

# Functions to run CP-ALS with SamBaTen

In [None]:
implicit val sc: SparkContext = spark.sparkContext

/**
 * Builds a SamBaTen `CoordinateTensor` from a DataFrame whose columns are all strings.
 *
 * Every row is read as `order` integer coordinates, taken from the columns `d0` .. `d{order-1}`,
 * plus one double value taken from the column `val`; all of them are parsed from strings.
 *
 * @param df             rows to convert; must contain the string columns `d0` .. `d{order-1}` and `val`
 * @param order          number of coordinate columns read from each row
 * @param dimensionsSize size used for each of the `order` entries of the tensor shape
 * @param sc             implicit Spark context (not referenced directly in the body)
 * @return the tensor built from the parsed entries, the shape and the number of rows of `df`
 * @throws IllegalArgumentException if one of the expected columns is missing from `df`
 * @throws ArithmeticException      if the number of rows does not fit in an `Int`
 */
def dataFrameToCoordinateTensor(df: DataFrame, order: Int, dimensionsSize: Int)(implicit sc: SparkContext): CoordinateTensor = {
    // Column positions do not change from row to row: resolve them once on the schema instead of
    // rebuilding the column names and looking them up for every field of every row.
    // A missing column is also reported here, before any Spark job is started.
    val coordinateIndices = (0 until order).map(i => df.schema.fieldIndex(s"d$i"))
    val valueIndex = df.schema.fieldIndex("val")

    val entries = df.map(e => {
        val seq = coordinateIndices.map(index => e.getString(index).toInt)
        TEntry(new Coordinate(seq), e.getString(valueIndex).toDouble)
    }).rdd
    // Same size along every dimension.
    val shape = new Coordinate(IndexedSeq.fill(order)(dimensionsSize))
    val nnz = df.count()
    // Checked conversion: fail loudly instead of silently wrapping around for huge row counts.
    new CoordinateTensor(entries, shape, java.lang.Math.toIntExact(nnz))
}

# Data loading

In [None]:
/**
 * A tensor stored as a DataFrame, together with the metadata needed to convert it.
 *
 * @param df             the tensor entries
 * @param order          number of dimensions of the tensor
 * @param dimensionsSize size of the tensor along its dimensions (a single value for all of them)
 */
case class TensorDf(df: DataFrame, order: Int, dimensionsSize: Int)

In [None]:
import java.io.File

// Names of the sample tensor files: the entries of `sample_tensors` whose name starts with
// "tensor" and does not contain "clusters".
val tensorsFiles = {
    val sampleDirectory = new File("sample_tensors")
    // File.listFiles returns null (not an empty array) when the path is not a readable directory;
    // report that explicitly instead of failing with a NullPointerException further down.
    val entries = Option(sampleDirectory.listFiles).getOrElse(
        throw new java.io.FileNotFoundException(
            s"Cannot list the sample tensors directory: ${sampleDirectory.getAbsolutePath}"))
    entries
        .map(_.getName)
        .filter(f => f.startsWith("tensor") && !f.contains("clusters"))
        .toList
}

In [None]:
// Maps each sample file name to its DataFrame and to the metadata encoded in the name.
// Once ".csv" and "tensor_" are removed, the name is split on "_": the first part is read as
// the number of dimensions and the second part as the size.
val tensors = tensorsFiles.map { tensorFile =>
    val parts = tensorFile.replace(".csv", "").replace("tensor_", "").split("_")
    val order = parts(0).toInt
    val dimensionSize = parts(1).toLong
    // The CSV files are read with their first line as header.
    val content = spark.read.option("header", true).csv(s"sample_tensors/$tensorFile")
    tensorFile -> TensorDf(content, order, dimensionSize.toInt)
}.toMap

# Run the CP-ALS benchmark

In [None]:
import scala.collection.mutable.{Map => MMap}

In [None]:
// Measured execution times in milliseconds, indexed as dimension -> size -> sparsity.
var timeCPALSSamBaTen = MMap[Int, MMap[Int, MMap[Double, Int]]]()

// Runs CP-ALS on every (dimension, size, sparsity) combination for which a sample tensor was
// loaded, and records the mean execution time over `nbIterations` runs.
for (dimension <- 3 to 3; 
     size <- List(100, 1000, 10000, 100000);
     sparsity <- List(1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10).reverse) {
    // The sparsity is rendered with Double.toString and has to match the file name exactly.
    val fileName = s"tensor_${dimension}_${size}_${sparsity}.csv"
    // Combinations without a loaded sample file are skipped.
    tensors.get(fileName).foreach { tensorDf =>
        println(fileName)
        val nbIterations = 1//5
        val runTimesMillis = for (_ <- 0 until nbIterations) yield {
            // Start every run without data cached by the previous one.
            spark.catalog.clearCache()
            val tensor = dataFrameToCoordinateTensor(tensorDf.df, tensorDf.order, tensorDf.dimensionsSize)

            // nanoTime is monotonic, unlike currentTimeMillis which follows the wall clock
            // and can jump when the system time is adjusted.
            val startNanos = System.nanoTime()

            // NOTE(review): the meaning of the positional setAttr arguments is defined by
            // SamBaTen's CPALS and is not visible in this notebook.
            val cpAls = new CPALS().setAttr(3, 5, 1e-100)
            cpAls.run(tensor)

            val elapsedMillis = (System.nanoTime() - startNanos) / 1000000L

            // Time of this run only (whole seconds), not the running total.
            println("Execution time: " + (elapsedMillis / 1000) + "s")
            elapsedMillis
        }
        val finalTime = (runTimesMillis.sum / nbIterations).toInt
        // Update the nested mutable maps in place, creating the intermediate levels on demand.
        timeCPALSSamBaTen
            .getOrElseUpdate(dimension, MMap[Int, MMap[Double, Int]]())
            .getOrElseUpdate(size, MMap[Double, Int]())
            .update(sparsity, finalTime)
    }
}

# Results

## Export to CSV

In [None]:
import $ivy.`com.github.tototoshi::scala-csv:1.3.6`
import com.github.tototoshi.csv._

In [None]:
// Appends the measured times to results/benchmarkSamBaTen.csv, one row per
// (dimension, size, sparsity) combination.
val f = new java.io.File("results/benchmarkSamBaTen.csv")
// Create the output directory when it is missing; opening the writer would fail otherwise
// and the benchmark results would be lost.
Option(f.getParentFile).foreach(_.mkdirs())
// Checked before opening the writer, which creates the file.
val fileExists = f.exists()
val writer = CSVWriter.open(f, append = true)
try {
    // The header is only written when the file is created by this run.
    if (!fileExists) {
        writer.writeRow(List[String]("dimension", "size", "sparsity", "time"))
    }
    for ((dimension, r1) <- timeCPALSSamBaTen; (size, r2) <- r1; (sparsity, time) <- r2) {
        val row = List[Any](dimension, size, sparsity, time)
        println(row)
        writer.writeRow(row)
    }
} finally {
    // Release the file handle even when writing a row fails.
    writer.close()
}