In [None]:
// Sample data used by the examples.
// Bili is one record with an id, a mark and an age.
case class Bili(id: String, mark: Integer, age: Integer)

// The records are written as plain (id, mark, age) triples and converted to Bili
// before being distributed as an RDD.
val info = sc.parallelize(
  Seq(
    ("a", 80, 25),
    ("a", 90, 26),
    ("a", 90, 27),
    ("b", 98, 27),
    ("b", 92, 10),
    ("b", 82, 23),
    ("b", 82, 23),
    ("b", 80, 25),
    ("b", 84, 25),
    ("c", 84, 21),
    ("c", 84, 21),
    ("c", 80, 25)
  ).map { case (id, mark, age) => Bili(id, mark, age) }
)

// Sample sentence.
val text = "Apache Spark is a unified analytics engine for big data processing, with built-in modules for streaming, SQL, machine learning and graph processing"

In [None]:
// Word count, most frequent words first.
val lines = sc.textFile("test.txt")
val words = lines.flatMap(line => line.split(" "))
val pairs = words.map(word => (word, 1))
// reduceByKey is not a method of RDD itself: an RDD of Tuple2 is implicitly converted to
// PairRDDFunctions, which provides it.
val count = pairs.reduceByKey((left, right) => left + right)
val sorted = count.sortBy({ case (_, occurrences) => occurrences }, ascending = false)

In [None]:
// Sort the whole RDD: highest mark first, ties broken by ascending age.
val result = info.sortBy(record => (-record.mark, record.age))

// sort within each group when the RDD has no partitioner yet
import org.apache.spark.HashPartitioner

/** Record whose natural ordering is `id` ascending and, for equal ids, `mark` descending. */
case class Bili(id: Int, mark: Int) extends Ordered[Bili] {

  /** Compares this record with `that`.
    *
    * Integer.compare is used instead of subtraction because a difference such as
    * `Int.MinValue - 1` overflows and flips the sign of the result.
    *
    * @param that the record to compare against
    * @return a negative value when this record sorts before `that`, a positive value when it
    *         sorts after, and zero when both id and mark are equal
    */
  override def compare(that: Bili): Int =
    if (this.id != that.id) {
      Integer.compare(this.id, that.id) // smaller id sorts first
    } else {
      Integer.compare(that.mark, this.mark) // larger mark sorts first
    }
}

// repartitionAndSortWithinPartitions is only defined for key-value RDDs whose key type has an
// Ordering, so each record is used as its own key (paired with a dummy Unit value) and the keys
// are extracted again after the shuffle.
// NOTE(review): HashPartitioner hashes the whole key, so records sharing an id may be placed in
// different partitions; use a Partitioner that looks only at `id` if each group must stay together.
val result = info.
                map(record => (record, ())).
                repartitionAndSortWithinPartitions(new HashPartitioner(n)).
                keys

// Sort inside each partition when the RDD already has a partitioner.
// mapPartitions hands the function an Iterator, which has no sortBy, so the partition is
// materialized into a Vector, sorted (highest mark first, then ascending age), and exposed
// as an Iterator again.
val result = info.mapPartitions { records =>
  records.toVector.sortBy { case Bili(_, mark, age) => (-mark, age) }.iterator
}

In [None]:
//top n in RDD with agg
import scala.math.Ordering
import scala.collection.mutable.PriorityQueue

// Ordering used to rank records inside each group.
// NOTE(review): records are grouped by id below, so within a group every record compares equal
// under this ordering; presumably another field such as `mark` was intended - confirm.
implicit val ord: Ordering[Bili] = Ordering.by[Bili, String](_.id)

// Keep the n greatest records per id.
// PriorityQueue.take(n) is not suitable here: it takes the first n elements in iteration order,
// which is unspecified for a heap. Each queue is therefore built with the reversed ordering, so
// dequeue() evicts the smallest record whenever more than n are held.
val result = info.keyBy(_.id).
                aggregateByKey(new scala.collection.mutable.PriorityQueue[Bili]()(ord.reverse))(
                    (acc, v) => {
                        acc.enqueue(v)
                        if (acc.size > n) acc.dequeue()
                        acc
                    },
                    (acc1, acc2) => {
                        acc1 ++= acc2
                        while (acc1.size > n) acc1.dequeue()
                        acc1
                    })

// Top n of an RDD without a per-key aggregation: the n greatest records when ordered by id.
rdd.top(n)(Ordering.by((record: Bili) => record.id))

// Top n rows per category in a DataFrame, ranked by sale in descending order.
val windowSpec = Window.partitionBy("category").
                        orderBy(desc("sale"))
// Give the rank column an explicit name so the filter can reference it.
windf.select(expr("*"), rank().over(windowSpec).alias("rank")).
      filter(col("rank") <= n).
      show()
// Top 1 per group: for each Hour keep the record with the greatest TotalValue.
// Source: stackoverflow.com/questions/33878370/how-to-select-the-first-row-of-each-group/33878701#33878701
case class Record(Hour: Integer, Category: String, TotalValue: Double)
df.as[Record]
  .groupByKey(record => record.Hour)
  .reduceGroups { (best, candidate) =>
    if (best.TotalValue > candidate.TotalValue) best else candidate
  }

In [None]:
//find the top n Searched Keyword with filter on daily basis
//prepare DataFrame
// Explicit six-column schema: Date, Name and Product are nullable, the remaining columns are not.
val myManualSchema = StructType(Seq(
    StructField("Date", DateType, nullable = true),
    StructField("Name", StringType, nullable = true),
    StructField("Product", StringType, nullable = true),
    StructField("City", StringType, nullable = false),
    StructField("Platform", StringType, nullable = false),
    StructField("Times", DoubleType, nullable = false)
))
// Sample rows written as (date, name, product, city, platform, times) tuples; each tuple becomes
// a Row whose first field is produced by Date.valueOf from the date string.
val myRows = Seq(
    ("2015-10-01", "jack", "toy", "beijing", "android", 1.5),
    ("2015-10-01", "tom", "sea", "beijing", "android", 1.0),
    ("2015-10-01", "leo", "apple", "guangzhou", "android", 1.5),
    ("2015-10-01", "white", "toy", "beijing", "iphone", 1.1),
    ("2015-10-01", "jack", "toy", "beijing", "android", 1.0),
    ("2015-10-01", "leo", "sea", "beijing", "android", 1.0),
    ("2015-10-01", "tom", "toy", "beijing", "iphone", 1.0),
    ("2015-10-01", "may", "sea", "beijing", "android", 1.5),
    ("2015-10-01", "jack", "toy", "beijing", "android", 1.0),
    ("2015-10-01", "tom", "bar", "beijing", "android", 1.5),
    ("2015-10-02", "jack", "bar", "guangzhou", "android", 1.1),
    ("2015-10-02", "jack", "toy", "beijing", "android", 1.0),
    ("2015-10-02", "leo", "sea", "beijing", "iphone", 1.5),
    ("2015-10-02", "jack", "toy", "beijing", "android", 1.0),
    ("2015-10-02", "tom", "toy", "beijing", "android", 2.0),
    ("2015-10-02", "jack", "kk", "beijing", "android", 1.0),
    ("2015-10-02", "may", "water", "beijing", "android", 1.0),
    ("2015-10-02", "jack", "toy", "beijing", "android", 1.0),
    ("2015-10-02", "leo", "toy", "beijing", "android", 2.0),
    ("2015-10-02", "tom", "water", "beijing", "android", 1.0),
    ("2015-10-02", "may", "apple", "beijing", "android", 2.0)
).map { case (day, name, product, city, platform, times) =>
    Row(Date.valueOf(day), name, product, city, platform, times)
}
// Distribute the rows and attach the explicit schema to obtain the DataFrame.
val myRDD = spark.sparkContext.parallelize(myRows)
val df = spark.createDataFrame(myRDD, myManualSchema)

// Keep only rows whose City is "beijing", whose Platform is "android" and whose Times is one of
// 1.0, 1.5 or 2.0.
val filtered = {
    val inBeijing = $"City" === "beijing"
    val onAndroid = $"Platform" === "android"
    val allowedTimes = $"Times".isin(1.0, 1.5, 2.0)
    df.filter(inBeijing && onAndroid && allowedTimes)
}

// Register a UDF named "count_uv" that counts user visits as the number of entries in a
// sequence of names.
// Actually, the built-in "size" SQL function could be used instead.
spark.udf.register("count_uv", (names: Seq[String]) => names.length)

// Group by "Date" and "Product", collect the distinct names of each group into "unique", and
// store how many there are in "UV".
val agged = filtered.
                groupBy("Date", "Product").
                agg(collect_set($"Name").as("unique")).
                withColumn("UV", expr("count_uv(unique)"))
                    
// Rank the rows of each Date by UV, highest first, and keep those whose rank is at most n.
val windowSpec = Window.partitionBy("Date").orderBy($"UV".desc)
val result = agged.
                withColumn("rank", rank().over(windowSpec)).
                filter($"rank" <= n)