-
Notifications
You must be signed in to change notification settings - Fork 0
/
TransformTest.scala
82 lines (65 loc) · 3.23 KB
/
TransformTest.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package common.transfomration
import junit.framework.Assert
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}
/**
 * Checks the pipeline stages produced by `Transform` when it is mixed with the
 * tokenizing, hashing and IDF traits.
 *
 * Every test fits the produced pipeline on the same four-row fixture and checks
 * that identical input text yields equal features while different text yields
 * different features.
 */
class TransformTest extends FunSuite with BeforeAndAfter {

  private val master = "local[1]"
  private val appName = "test-transform"

  // Column names shared by the fixture, the pipelines and the assertions.
  private val inputCol = "action"
  private val featuresCol = "action_features"

  private var sqlContext: SQLContext = _
  private var sc: SparkContext = _
  private var data: DataFrame = _

  // A fresh local SparkContext and fixture DataFrame are created for every test.
  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
    sqlContext = new SQLContext(sc)
    // Rows "0" and "1" carry identical text; row "3" differs from them only in
    // its last token ("Desktops" instead of "Desktop").
    data = sqlContext.createDataFrame(Seq(
      ("0", "lookup Windows Desktop"),
      ("1", "lookup Windows Desktop"),
      ("2", "search_results click view_search_results Windows Desktop"),
      ("3", "lookup Windows Desktops")
    )).toDF("id", inputCol)
  }

  // Stop the context so the next test can create its own.
  after {
    if (sc != null) {
      sc.stop()
    }
  }

  /**
   * Returns the value of the features column for the row with the given id.
   *
   * The value is read as `AnyRef` so that it can be compared with `equals`
   * without forcing a cast to a specific type.
   * NOTE(review): the tokenizer-only pipeline presumably emits a token sequence
   * rather than an mllib `Vector` -- confirm against `Transform`.
   */
  private def featureValue(features: DataFrame, id: String): AnyRef =
    features.filter(features("id") === id).head().getAs[AnyRef](featuresCol)

  /**
   * Fits `pipeline` on the fixture, transforms the fixture with the fitted model
   * and asserts that
   *  - all four rows are kept,
   *  - rows "0" and "1" (same text) have equal features,
   *  - rows "0" and "3" (different text) have different features.
   */
  private def assertFeatures(pipeline: Pipeline): Unit = {
    val features = pipeline.fit(data).transform(data)
    Assert.assertEquals(4, features.count())

    val reference = featureValue(features, "0")
    val duplicate = featureValue(features, "1")
    val different = featureValue(features, "3")

    Assert.assertEquals(reference, duplicate)
    // Compare by value: assertNotSame only checks reference identity, which is
    // always satisfied for two separately collected rows and so verifies nothing.
    // NOTE(review): with only 50 hash buckets a collision between "desktop" and
    // "desktops" would make these equal -- confirm this holds for the hashing tests.
    Assert.assertFalse(
      s"features of rows 0 and 3 should differ but both were $reference",
      reference == different)
  }

  test("testing tokenizer and hashing transformation") {
    val transform = new Transform with TTokenize with THashing
    assertFeatures(new Pipeline().setStages(transform.apply(Array(), inputCol, featuresCol, 50)._1))
  }

  test("testing tokenizer transformation") {
    val transform = new Transform with TTokenize
    assertFeatures(new Pipeline().setStages(transform.apply(Array(), inputCol, featuresCol, 50)._1))
  }

  test("testing tokenizer, hashing and idf transformation") {
    val transform = new Transform with TTokenize with THashing with TIDF
    assertFeatures(new Pipeline().setStages(transform.apply(Array(), inputCol, featuresCol, 50)._1))
  }
}