Skip to content
Permalink
Browse files

fix: update lightgbm to 2.2.400, fix probabilities and some Windows errors

  • Loading branch information...
imatiach-msft authored and mhamilton723 committed Aug 30, 2019
1 parent 16ea090 commit 05a2bef54fa88a2293020215cf4cae34f2d212c5
@@ -24,7 +24,7 @@ libraryDependencies ++= Seq(
"com.jcraft" % "jsch" % "0.1.54",
"com.jcraft" % "jsch" % "0.1.54",
"org.apache.httpcomponents" % "httpclient" % "4.5.6",
"com.microsoft.ml.lightgbm" % "lightgbmlib" % "2.2.350",
"com.microsoft.ml.lightgbm" % "lightgbmlib" % "2.2.400",
"com.github.vowpalwabbit" % "vw-jni" % "8.7.0.2"
)

@@ -4,7 +4,7 @@
package com.microsoft.ml.spark.lightgbm

import com.microsoft.ml.lightgbm._
import com.microsoft.ml.spark.lightgbm.LightGBMUtils.{getBoosterPtrFromModelString, intToPtr}
import com.microsoft.ml.spark.lightgbm.LightGBMUtils.getBoosterPtrFromModelString
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.{SaveMode, SparkSession}

@@ -39,24 +39,20 @@ class LightGBMBooster(val model: String) extends Serializable {
var scoredDataOutPtr: SWIGTYPE_p_double = _

@transient
var scoredDataLengthLongPtr: SWIGTYPE_p_long = _

@transient
var scoredDataLength_int64_tPtr: SWIGTYPE_p_int64_t = _ //scalastyle:ignore field.name
var scoredDataLengthLongPtr: SWIGTYPE_p_long_long = _

// Lazily allocates the native SWIG buffers used to receive single-row
// prediction output; idempotent (no-op once the length pointer is set).
// NOTE(review): this span is diff-paste residue — it contains BOTH the
// pre-change allocation (new_longp/longp_assign/long_to_int64_t_ptr) and the
// post-change allocation (new_int64_tp/int64_tp_assign) of the same length
// pointer. Applied as-is, the first allocation leaks when the second
// reassigns scoredDataLengthLongPtr. Confirm against the actual commit which
// lines survive before relying on this block.
def ensureScoredDataCreated(): Unit = {
// Already initialized on a previous call — nothing to do.
if (scoredDataLengthLongPtr != null)
return

// One double per class to hold the prediction scores for a single row.
scoredDataOutPtr = lightgbmlib.new_doubleArray(numClasses)
scoredDataLengthLongPtr = lightgbmlib.new_longp()
lightgbmlib.longp_assign(scoredDataLengthLongPtr, 1 /* numRows */)
scoredDataLength_int64_tPtr = lightgbmlib.long_to_int64_t_ptr(scoredDataLengthLongPtr)
scoredDataLengthLongPtr = lightgbmlib.new_int64_tp()
lightgbmlib.int64_tp_assign(scoredDataLengthLongPtr, 1)
}

// Releases the native SWIG buffers allocated by ensureScoredDataCreated.
// NOTE(review): diff-paste residue — both the old delete (delete_longp) and
// the new delete (delete_int64_tp) of scoredDataLengthLongPtr appear here;
// without braces the `if` guards only the first call, so the second would run
// unconditionally. Confirm which line survives in the real commit.
override protected def finalize(): Unit = {
if (scoredDataLengthLongPtr != null)
lightgbmlib.delete_longp(scoredDataLengthLongPtr)
lightgbmlib.delete_int64_tp(scoredDataLengthLongPtr)
// NOTE(review): inverted null check — this deletes scoredDataOutPtr only
// when it IS null (a no-op), leaking the array when it is non-null.
// Almost certainly should read `!= null`; verify and fix upstream.
if (scoredDataOutPtr == null)
lightgbmlib.delete_doubleArray(scoredDataOutPtr)
}
@@ -74,9 +70,9 @@ class LightGBMBooster(val model: String) extends Serializable {
lightgbmlib.LGBM_BoosterPredictForCSRSingle(
sparseVector.indices, sparseVector.values,
sparseVector.numNonzeros,
boosterPtr, dataInt32bitType, data64bitType, intToPtr(1 + 1), intToPtr(numCols),
kind, -1, datasetParams,
scoredDataLength_int64_tPtr, scoredDataOutPtr), "Booster Predict")
boosterPtr, dataInt32bitType, data64bitType, 2, numCols,
kind, -1, datasetParams,
scoredDataLengthLongPtr, scoredDataOutPtr), "Booster Predict")

predToArray(classification, scoredDataOutPtr, kind)
}
@@ -96,7 +92,7 @@ class LightGBMBooster(val model: String) extends Serializable {
row, boosterPtr, data64bitType,
numCols,
isRowMajor, kind,
-1, datasetParams, scoredDataLength_int64_tPtr, scoredDataOutPtr),
-1, datasetParams, scoredDataLengthLongPtr, scoredDataOutPtr),
"Booster Predict")
predToArray(classification, scoredDataOutPtr, kind)
}
@@ -183,12 +183,6 @@ object LightGBMUtils {
idAsInt
}

/** Boxes an `Int` into a native `int64_t*` for the SWIG-wrapped LightGBM C API.
  *
  * @param value the integer to place behind the native pointer
  * @return a `SWIGTYPE_p_int64_t` viewing the stored value
  */
def intToPtr(value: Int): SWIGTYPE_p_int64_t = {
  // Allocate a native long, write the value into it, then reinterpret
  // that storage as an int64_t pointer for the C API call sites.
  val nativeLong = lightgbmlib.new_longp()
  lightgbmlib.longp_assign(nativeLong, value)
  lightgbmlib.long_to_int64_t_ptr(nativeLong)
}

def generateData(numRows: Int, rowsAsDoubleArray: Array[Array[Double]]):
(SWIGTYPE_p_void, SWIGTYPE_p_double) = {
val numCols = rowsAsDoubleArray.head.length
@@ -202,14 +196,8 @@ object LightGBMUtils {
def generateDenseDataset(numRows: Int, rowsAsDoubleArray: Array[Array[Double]],
referenceDataset: Option[LightGBMDataset],
featureNamesOpt: Option[Array[String]]): LightGBMDataset = {
val numRowsIntPtr = lightgbmlib.new_intp()
lightgbmlib.intp_assign(numRowsIntPtr, numRows)
val numRows_int32_tPtr = lightgbmlib.int_to_int32_t_ptr(numRowsIntPtr) //scalastyle:ignore field.name
val numCols = rowsAsDoubleArray.head.length
val isRowMajor = 1
val numColsIntPtr = lightgbmlib.new_intp()
lightgbmlib.intp_assign(numColsIntPtr, numCols)
val numCols_int32_tPtr = lightgbmlib.int_to_int32_t_ptr(numColsIntPtr) //scalastyle:ignore field.name
val datasetOutPtr = lightgbmlib.voidpp_handle()
val datasetParams = "max_bin=255 is_pre_partition=True"
val data64bitType = lightgbmlibConstants.C_API_DTYPE_FLOAT64
@@ -219,7 +207,7 @@ object LightGBMUtils {
// Generate the dataset for features
LightGBMUtils.validate(lightgbmlib.LGBM_DatasetCreateFromMat(
data.get._1, data64bitType,
numRows_int32_tPtr, numCols_int32_tPtr,
numRows, numCols,
isRowMajor, datasetParams, referenceDataset.map(_.dataset).orNull, datasetOutPtr),
"Dataset create")
} finally {
@@ -247,7 +235,7 @@ object LightGBMUtils {
LightGBMUtils.validate(lightgbmlib.LGBM_DatasetCreateFromCSRSpark(
sparseRows.asInstanceOf[Array[Object]],
sparseRows.length,
intToPtr(numCols), datasetParams, referenceDataset.map(_.dataset).orNull,
numCols, datasetParams, referenceDataset.map(_.dataset).orNull,
datasetOutPtr),
"Dataset create")
val dataset = new LightGBMDataset(lightgbmlib.voidpp_value(datasetOutPtr))
@@ -149,11 +149,8 @@ private object TrainUtils extends Serializable {

// Serializes the native booster handle to its model-string representation.
// NOTE(review): diff-paste residue — both the old call form (taking the
// long->int64_t pointer built from bufferLength) and the new call form
// (passing bufferLength directly) appear below; only one exists in the real
// commit. Presumably LGBM_BoosterSaveModelToStringSWIG returns the model
// String (this method's declared result) — TODO confirm against the SWIG
// wrapper signature.
def saveBoosterToString(boosterPtr: Option[SWIGTYPE_p_void], log: Logger): String = {
val bufferLength = LightGBMConstants.DefaultBufferLength
val bufferLengthPtr = lightgbmlib.new_longp()
lightgbmlib.longp_assign(bufferLengthPtr, bufferLength)
val bufferLengthPtrInt64 = lightgbmlib.long_to_int64_t_ptr(bufferLengthPtr)
// Receives the actual serialized length written by the native call.
val bufferOutLengthPtr = lightgbmlib.new_int64_tp()
lightgbmlib.LGBM_BoosterSaveModelToStringSWIG(boosterPtr.get, 0, -1, bufferLengthPtrInt64, bufferOutLengthPtr)
lightgbmlib.LGBM_BoosterSaveModelToStringSWIG(boosterPtr.get, 0, -1, bufferLength, bufferOutLengthPtr)
}

def getEvalNames(boosterPtr: Option[SWIGTYPE_p_void]): Array[String] = {
@@ -75,6 +75,16 @@ trait LightGBMTestUtils extends TestBase {
indexedDF
}

/** Asserts every scored row carries a valid probability distribution:
  * all entries within [0, 1] and summing to 1 within a 1e-3 tolerance.
  *
  * @param tdf   transformed DataFrame holding rawPrediction and probability columns
  * @param model the classifier whose output column names are used for selection
  */
def assertProbabilities(tdf: DataFrame, model: LightGBMClassifier): Unit = {
  val scoredRows = tdf
    .select(model.getRawPredictionCol, model.getProbabilityCol)
    .collect()
  scoredRows.foreach { row =>
    // Column 1 is the probability vector (column 0 is rawPrediction).
    val probs = row.getAs[DenseVector](1).values
    assert(math.abs(probs.sum - 1.0) < 0.001)
    assert(probs.forall(p => p >= 0 && p <= 1))
  }
}

/** Fits the estimator and scores the same data, asserting that training
  * completes and the transform produces at least one output row.
  *
  * @param model the estimator under test
  * @param df    the dataset used both to fit and to transform
  */
def assertFitWithoutErrors(model: Estimator[_ <: Model[_]], df: DataFrame): Unit = {
  val scored = model.fit(df).transform(df)
  assert(scored.collect().nonEmpty)
}
@@ -391,9 +401,7 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
val fitModel = model.fit(df)
val tdf = fitModel.transform(df)

tdf.select(model.getProbabilityCol)
.collect()
.foreach(row => assert(row.getAs[DenseVector](0).values.sum === 1.0))
assertProbabilities(tdf, model)

assertImportanceLengths(fitModel, df)
addBenchmark(s"LightGBMClassifier_${fileName}_$boostingType",
@@ -429,9 +437,8 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
val fitModel = model.fit(df)
val tdf = fitModel.transform(df)

tdf.select(model.getProbabilityCol)
.collect()
.foreach(row => assert(row.getAs[DenseVector](0).values.sum === 1.0))
assertProbabilities(tdf, model)

assertImportanceLengths(fitModel, df)
addBenchmark(s"LightGBMClassifier_${fileName}_$boostingType",
multiclassEvaluator.evaluate(tdf), precision)

0 comments on commit 05a2bef

Please sign in to comment.
You can’t perform that action at this time.