In [32]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

# Obtain the Spark session for this notebook: reuse an existing one if
# present, otherwise create one registered under the app name "friendsByAge".
spark = (
    SparkSession.builder
    .appName("friendsByAge")
    .getOrCreate()
)

In [33]:
# Load the friends CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
peoples = spark.read.csv(
    "Data/fakefriends-header.csv",
    header=True,
    inferSchema=True,
)

In [34]:
# Print the column names and the types Spark inferred for the loaded data.
peoples.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [35]:
# Preview the first 5 rows to sanity-check the load.
peoples.show(5)

+------+--------+---+-------+
|userID|    name|age|friends|
+------+--------+---+-------+
|     0|    Will| 33|    385|
|     1|Jean-Luc| 26|      2|
|     2|    Hugh| 55|    221|
|     3|  Deanna| 40|    465|
|     4|   Quark| 68|     21|
+------+--------+---+-------+
only showing top 5 rows



In [36]:
# Solution 1: Spark SQL query

In [37]:
# Register the DataFrame as the temporary view "people" so it can be
# referenced by name from spark.sql(); an existing view of that name is replaced.
peoples.createOrReplaceTempView("people")

In [40]:
# Average number of friends per age (rounded to 2 decimals), oldest first.
# The adjacent string literals are concatenated into a single SQL statement.
friendsByAge = spark.sql(
    "SELECT age, round(avg(friends), 2) as Friends "
    "FROM people "
    "GROUP BY age "
    "ORDER BY age DESC"
)

In [41]:
# Display the SQL result; with no argument, show() prints only the top 20 rows.
friendsByAge.show()

+---+-------+
|age|Friends|
+---+-------+
| 69|  235.2|
| 68|  269.6|
| 67| 214.63|
| 66| 276.44|
| 65|  298.2|
| 64| 281.33|
| 63|  384.0|
| 62| 220.77|
| 61| 256.22|
| 60| 202.71|
| 59|  220.0|
| 58| 116.55|
| 57| 258.83|
| 56| 306.67|
| 55| 295.54|
| 54| 278.08|
| 53| 222.86|
| 52| 340.64|
| 51| 302.14|
| 50|  254.6|
+---+-------+
only showing top 20 rows



In [42]:
# Solution 2: DataFrame API

In [43]:
# Keep only the two columns the aggregation needs.
# NOTE: despite its name, `average` holds the un-aggregated age/friends rows;
# no averaging happens in this cell.
average = peoples.select("age", "friends")

In [44]:
# Same result as the SQL solution, built with the DataFrame API:
# group by age, average the friend counts (rounded to 2 decimals),
# and list the ages in descending order.
friendAge = (
    average
    .groupBy("age")
    .agg(func.round(func.avg("friends"), 2).alias("Friends"))
    .orderBy("age", ascending=False)
)
friendAge.show()

+---+-------+
|age|Friends|
+---+-------+
| 69|  235.2|
| 68|  269.6|
| 67| 214.63|
| 66| 276.44|
| 65|  298.2|
| 64| 281.33|
| 63|  384.0|
| 62| 220.77|
| 61| 256.22|
| 60| 202.71|
| 59|  220.0|
| 58| 116.55|
| 57| 258.83|
| 56| 306.67|
| 55| 295.54|
| 54| 278.08|
| 53| 222.86|
| 52| 340.64|
| 51| 302.14|
| 50|  254.6|
+---+-------+
only showing top 20 rows



In [45]:
# Stop the Spark session now that both solutions have been displayed.
spark.stop()