### データセットのダウンロード
https://github.com/databricks/LearningSparkV2/blob/master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv   
を取得して、このノートブックと同じディレクトリに格納する。

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

In [None]:
# Build the SparkSession used by the cells below (getOrCreate reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("SparkSQLExampleApp")
    .getOrCreate()
)

In [None]:
# Relative path of the flight-delay CSV that is loaded below.
csv_file = "departuredelays.csv"

In [None]:
# Without an explicit schema, the zero padding of single-digit months in
# `date` is lost, so `date` is declared as STRING here.
# One column definition per line; adjacent literals join into one DDL string.
schema = (
    '`date` STRING, '
    '`delay` INT, '
    '`distance` INT, '
    '`origin` STRING, '
    '`destination` STRING'
)

In [None]:
# Configure a CSV reader that treats the first row as a header and applies
# the explicit schema, then load the file.
csv_reader = spark.read.format('csv').option('header', 'true').schema(schema)
df = csv_reader.load(csv_file)

# Register the DataFrame as a temp view so the SQL cells can query it by name.
df.createOrReplaceTempView('us_delay_flights_tbl')

In [None]:
# Flights whose distance exceeds 1000, largest distance first; print 10 rows.
long_distance_query = """SELECT distance, origin, destination
FROM us_delay_flights_tbl WHERE distance > 1000
ORDER BY distance DESC"""
spark.sql(long_distance_query).show(10)

In [None]:
# SFO -> ORD flights with a delay above 120, largest delay first; print 10 rows.
sfo_ord_delay_query = """SELECT date, delay, origin, destination
FROM us_delay_flights_tbl
WHERE delay > 120 AND origin = 'SFO' AND destination = 'ORD'
ORDER BY delay DESC"""
spark.sql(sfo_ord_delay_query).show(10)

In [None]:
# Same SFO -> ORD filter as a plain delay listing, but the `date` string is
# parsed with the pattern 'MMddHHmm' and re-rendered as 'MM-dd hh:mm a'.
formatted_date_query = """SELECT
date_format(to_timestamp(date, 'MMddHHmm'), 'MM-dd hh:mm a') as formatted_date, 
delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND origin = 'SFO' AND destination = 'ORD'
ORDER BY delay DESC"""
spark.sql(formatted_date_query).show(10)

In [None]:
# Attach a human-readable label to every flight based on its delay.
# CASE branches are tried top to bottom, so a delay of exactly 60 is labelled
# 'Short Delays' (the 'Tolerable Delays' branch is never reached for it).
delay_label_query = """SELECT delay, origin, destination,
    CASE
        WHEN delay > 360 THEN 'Very Long Delays'
        WHEN delay > 120 AND delay <= 360 THEN 'Long Delays'
        WHEN delay >= 60 AND delay <= 120 THEN 'Short Delays'
        WHEN delay > 0 and delay <= 60 THEN 'Tolerable Delays'
        WHEN delay = 0 THEN 'No Delays'
        ELSE 'Early'
    END AS Flight_Delays
FROM us_delay_flights_tbl
ORDER BY origin, delay DESC"""
spark.sql(delay_label_query).show(10)


In [None]:
# DataFrame API: flights whose distance exceeds 1000, largest distance first.
(
    df.select('distance', 'origin', 'destination')
    .where('distance > 1000')
    .orderBy('distance', ascending=False)
    .show(10)
)

In [None]:
# DataFrame API: SFO -> ORD flights with a delay above 120.
sfo_ord_delays = (
    df.select('date', 'delay', 'origin', 'destination')
    .where('delay > 120')
    .where('origin == "SFO"')
    .where('destination == "ORD"')
)
# Largest delay first; print the top 10 rows.
sfo_ord_delays.orderBy('delay', ascending=False).show(10)

In [None]:
# SQL expression that parses the `date` string with the pattern "MMddHHmm"
# and re-renders it as "MM-dd hh:mm a".
formatted_date_expr = 'date_format(to_timestamp(date, "MMddHHmm"), "MM-dd hh:mm a") as formatted_date'

# DataFrame API: SFO -> ORD flights with a delay above 120, largest delay
# first, showing the formatted date instead of the raw string.
(
    df.selectExpr(formatted_date_expr, 'delay', 'origin', 'destination')
    .where('delay > 120')
    .where('origin == "SFO"')
    .where('destination == "ORD"')
    .orderBy('delay', ascending=False)
    .show(10)
)

In [None]:
# Label column built with when/otherwise. Conditions are tried in order, so a
# delay of exactly 60 is caught by the 'Short Delays' branch before the
# 'Tolerable Delays' branch is considered.
delay_col = df['delay']
flight_delays = (
    when(delay_col > 360, 'Very Long Delays')
    .when((delay_col > 120) & (delay_col <= 360), 'Long Delays')
    .when((delay_col >= 60) & (delay_col <= 120), 'Short Delays')
    .when((delay_col > 0) & (delay_col < 60), 'Tolerable Delays')
    .when(delay_col == 0, 'No Delays')
    .otherwise('Early')
    .alias('Flight_Delays')
)

# Show each flight with its label, sorted by origin (ascending) and then by
# delay (descending).
(
    df.select('delay', 'origin', 'destination', flight_delays)
    .orderBy(col('origin').asc(), col('delay').desc())
    .show(10)
)