In [1]:
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner

Intitializing Scala interpreter ...

Spark Web UI available at http://10.26.63.208:4041
SparkContext available as 'sc' (version = 2.3.3, master = local[*], app id = local-1569957382357)
SparkSession available as 'spark'


import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner


## Chapter 3 Programming with RDDs

In [2]:
val lines = sc.parallelize(List("pandas", "i like pandas"))
val inputRDD = sc.textFile("./log.txt")
val errorsRDD = inputRDD.filter(line => line.contains("error"))
val warningsRDD = inputRDD.filter(line => line.contains("warning"))
val badLineRDD = errorsRDD.union(warningsRDD)

lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:27
inputRDD: org.apache.spark.rdd.RDD[String] = ./log.txt MapPartitionsRDD[2] at textFile at <console>:28
errorsRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at filter at <console>:29
badLineRDD: org.apache.spark.rdd.RDD[String] = UnionRDD[5] at union at <console>:31


In [3]:
println("Input had " + badLineRDD.count() + " concerning lines")
println("Here are examples:")
badLineRDD.take(5).foreach(println)

Input had 2 concerning lines
Here are examples:
The application closes the API session, resulting in an error being reported because the state of the flow is SESSIONED. This error can be ignored.


In [4]:
class SearchFunctions(val query: String) {
    import org.apache.spark.rdd.RDD
    def isMatch(s: String): Boolean = {
        s.contains(query)
    }
    
    def getMatchesNoReference(rdd: RDD[String]): RDD[String] = { 
        // Safe: extract just the field we need into a local variable 
        val query_ = this.query 
        rdd.filter(x => x.contains(query_)) 
    }
}

defined class SearchFunctions


In [5]:
val input = sc.parallelize(List(1,2,3,4))
val result = input.map(x => x*x)
println(result.collect().mkString(","))

1,4,9,16


input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[6] at parallelize at <console>:27
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[7] at map at <console>:28


In [6]:
val lines = sc.parallelize(List("hello world", "hi"))
val words = lines.flatMap(line => line.split(" "))
words.first()

lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[8] at parallelize at <console>:29
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at flatMap at <console>:30
res2: String = hello


In [7]:
val nums2 = sc.parallelize(List(1,2,3,4,5,6,7,8))
val sum_ = nums2.reduce((x,y) => x + y)

nums2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[10] at parallelize at <console>:27
sum_: Int = 36


In [8]:
val result = input.aggregate((0,0))(
                              (acc,value)=>(acc._1 + value, acc._2 + 1),
                              (acc1, acc2)=>(acc1._1 + acc2._1, acc1._2 + acc2._2))
val avg = result._1 / result._2.toDouble

result: (Int, Int) = (10,4)
avg: Double = 2.5


In [9]:
val result = input.map(x => x*x)
println(result.count())
println(result.collect().mkString(","))

4
1,4,9,16


result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[11] at map at <console>:31


## Chapter 4

In [10]:
val pairs = lines.map(x => (x.split(" ")(0), x))

pairs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[12] at map at <console>:29


In [11]:
pairs.filter{case (key,value) => value.length < 20}

res4: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[13] at filter at <console>:33


In [12]:
val input = sc.textFile("./log.txt")
val words = input.flatMap(x => x.split(" "))
val result = words.map(x => (x,1)).reduceByKey((x,y) => x+y)
result.take(3)

input: org.apache.spark.rdd.RDD[String] = ./log.txt MapPartitionsRDD[15] at textFile at <console>:34
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[16] at flatMap at <console>:35
result: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[18] at reduceByKey at <console>:36
res5: Array[(String, Int)] = Array((index,7), (call,2), (dst-9.67.116.99:1047,3))


In [13]:
input.flatMap(x => x.split(" ")).countByValue() // much faster

res6: scala.collection.Map[String,Long] = Map(:.......rsvp_action_nHop: -> 6, destination. -> 1, used -> 2, *************** -> 2, :......rpapi_getSpecData: -> 3, file: -> 1, down -> 1, enable -> 1, 08:53:52 -> 9, "CLCat2" -> 1, application -> 5, "" -> 808, reserved -> 1, PATH -> 10, for -> 60, papiUserValue -> 1, rapi_release() -> 1, :..........rpapi_Reg_UnregFlow: -> 12, 12 -> 2, =0 -> 1, TERM -> 1, 08 -> 2, PROTERR:.......rsvp_flow_stateMachine: -> 1, any -> 1, refresh -> 1, module -> 1, registered -> 1, name -> 4, this -> 2, gateway: -> 7, in -> 4, RESVED, -> 10, :........traffic_action_oif: -> 2, rsvp-udp -> 7, based -> 1, flow. -> 2, path. -> 1, reported -> 1, DELRESP -> 1, are -> 5, is -> 23, available. -> 4, source -> 7, [CL -> 1, informational -> 2, reported. -> 4, 19 -> 2, acce...

In [14]:
val result2 = result.combineByKey(
                                (v) => (v,1),
                                (acc:(Int,Int), v) => (acc._1 + v, acc._2 + 1),
                                (acc1:(Int,Int), acc2:(Int,Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)).map{case (key,value) => (key, value._1 / value._2.toFloat)}
result2.collectAsMap().map(println(_))

(:........rsvp_action_nHop:,2.0)
(Tspec.,1.0)
(APIInitialize:,3.0)
(hop=9.67.116.99,,4.0)
(down.,2.0)
(flow[sess=9.67.116.99:1047:6,source=9.67.116.98:8000],1.0)
(:..........qosmgr_response:,4.0)
(src-9.67.116.98:8000,3.0)
(:........flow_timer_start:,6.0)
(changes,1.0)
(rsvp-api,1.0)
(RSVPPutActionName:,3.0)
(call,,2.0)
(TR1,1.0)
([CL,1.0)
(service=0,1.0)
(TCP/IP,4.0)
(08:54:35,23.0)
(Associate,1.0)
(action,2.0)
(determines,1.0)
(source=9.67.116.98:8000],3.0)
(flow,7.0)
(configured,1.0)
(ReadBuffer:,12.0)
(traffic,1.0)
(SENDER,1.0)
(expired,6.0)
(terminate,2.0)
(:........mailslot_send:,3.0)
(interfaces,3.0)
(:.......rsvp_action_nHop:,6.0)
(STYLE,4.0)
(is,23.0)
(level,1.0)
(30,1.0)
(papi_debug:,3.0)
(sessioned,1.0)
(filter,2.0)
(:......rpapi_getPolicyData:,11.0)
(UNIX,1.0)
(stopped,2.0)
(rc=0,1.0)
(querying,1.0)
(if=9.67.116.98,1.0)
(M,1.0)
(Home,1.0)
(does,2.0)
(parameters,1.0)
(#2,,1.0)
(associates,1.0)
(2,1.0)
(interface.,1.0)
(EDC8112I,1.0)
(Such,1.0)
(node,,1.0)
(based,1.0)
(throug

(unicasting.,1.0)
(specified,2.0)
(torn,1.0)
(via,7.0)
(13,2.0)
(started,9.0)
(SESSIONED,2.0)
(:.......rsvp_parse_objects:,8.0)
(enabled,1.0)
(first,1.0)
(#5,,1.0)
(reported,1.0)
(query,6.0)
(08:54:53,14.0)
(08:51:01,5.0)
(expect,1.0)
(:......router_forward_getOI:,30.0)
(obj,4.0)
(Exiting,17.0)
(entering,3.0)
(failed,4.0)
(07,2.0)
(08:53:22,21.0)
(system,2.0)
((broken),1.0)
(passing,1.0)
(destination,1.0)
(constructed,1.0)
(,808.0)


result2: org.apache.spark.rdd.RDD[(String, Float)] = MapPartitionsRDD[24] at map at <console>:34
res7: Iterable[Unit] = ArrayBuffer((), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (...

In [15]:
val data = Seq(("a",3), ("b",4), ("a",1))
val par_default = sc.parallelize(data).reduceByKey((x, y) => x + y) // Default parallelism 
val par_customize = sc.parallelize(data).reduceByKey((x, y) => x + y, 10) // Custom parallelism


data: Seq[(String, Int)] = List((a,3), (b,4), (a,1))
par_default: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[26] at reduceByKey at <console>:28
par_customize: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[28] at reduceByKey at <console>:29


In [16]:
par_default.getNumPartitions

res8: Int = 12


In [17]:
par_customize.getNumPartitions

res9: Int = 10


In [18]:
par_customize.partitions.size

res10: Int = 10


In [19]:
val pairs = sc.parallelize(List((1,1),(2,2),(3,3)))

pairs: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[29] at parallelize at <console>:27


In [20]:
pairs.partitioner

res11: Option[org.apache.spark.Partitioner] = None


In [21]:
val partitioned = pairs.partitionBy(new HashPartitioner(2))

partitioned: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[30] at partitionBy at <console>:29


In [22]:
partitioned.partitioner

res12: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.HashPartitioner@2)


In [29]:
// PageRank
val links = sc.objectFile[(String,Seq[String])]("links.txt")
              .partitionBy(new HashPartitioner(100))
              .persist()

var ranks = links.mapValues(v => 1.0)
for (i <- 0 until 10) {
    var contributions = links.join(ranks).flatMap {
        case (pageId, (links, rank)) =>
          links.map(dest => (dest, rank/links.size))
    }
    ranks = contributions.reduceByKey((x,y) => x+y).mapValues(v => 0.15 + 0.85*v)
}

ranks.saveAsTextFile("ranks")

org.apache.spark.SparkException:  Job aborted.

## Chapter 5