In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build the SparkSession (or reuse one that already exists in this kernel),
# pointing it at the master URL and application name given below.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("1 billion row challenge")
    .getOrCreate()
)

In [3]:
# Display the underlying SparkContext as the cell output (sanity check that
# the session created above is available).
spark.sparkContext

In [4]:
# Explicit schema for the header-less, two-column input file.
# MEASUREMENT is declared as a double (not a string) so that downstream
# min()/max() aggregations compare values numerically; on a string column
# they would compare lexicographically and report the wrong extremes.
# Both columns are nullable.
schema = StructType([
    StructField("CITY", StringType(), True),
    StructField("MEASUREMENT", DoubleType(), True),
])

In [5]:
# Lazily read the semicolon-delimited, header-less measurements file using
# the explicit `schema` defined earlier in the notebook.
#
# - Uses the built-in "csv" source; "com.databricks.spark.csv" is the name of
#   the pre-Spark-2.0 external package and is only kept as a legacy alias.
# - `inferSchema` is intentionally not set: it has no effect when an explicit
#   schema is supplied, and leaving it on only obscures which one is used.
data = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(delimiter=";", header=False)
    .load("/data/measurements.csv")
)

In [6]:
# Preview the first 10 rows to confirm the delimiter and column mapping.
data.show(10)

+--------------------+-----------+
|                CITY|MEASUREMENT|
+--------------------+-----------+
|               Sorso|      -42.0|
|           Miacatlán|      -91.8|
|             Milagro|       21.2|
|             Denizli|        7.7|
|          Boa Viagem|       46.3|
|             Draveil|      -56.7|
|Blunsdon Saint An...|       69.7|
|                Burg|       12.5|
|            Reyhanlı|       91.9|
|              Pudtol|      -86.8|
+--------------------+-----------+
only showing top 10 rows



In [8]:
# Compute min / mean / max measurement per city and write the result, sorted
# by city, as a single CSV part file under /data/result.csv.
#
# NOTE: `min`, `avg` and `max` here are the pyspark.sql.functions versions
# brought in by the star import, not the Python builtins.

# Cast explicitly so min()/max() always compare numerically, whatever type
# the column was loaded with (on a string column they compare text and
# return lexicographic, i.e. wrong, extremes).
measurement_value = col("MEASUREMENT").cast("double")

(
    data
    # No explicit repartition before groupBy: the aggregation already
    # shuffles by key, so an extra round-robin shuffle of every raw row
    # only adds cost.
    .groupBy("CITY")
    .agg(
        min(measurement_value).alias("min"),
        avg(measurement_value).alias("mean"),
        max(measurement_value).alias("max"),
    )
    .orderBy("CITY")
    # Collapse to one partition so a single part file is written.
    .coalesce(1)
    .write
    .format("csv")
    # Replace any previous result so the notebook can be re-run end to end;
    # without a mode the write fails if the path already exists.
    .mode("overwrite")
    .save("/data/result.csv")
)