In [None]:
# Spark Config
# All settings are applied in list order through a single setAll call:
# YARN as master, event logging enabled, the event-log and history-server
# directories pointing at the same HDFS path, and 1g memory / 1 core sizing.
conf = SparkConf().setAll([
    ("spark.master", "yarn"),
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "hdfs://okeanos-master:54310/spark.eventLog"),
    ("spark.history.fs.logDirectory", "hdfs://okeanos-master:54310/spark.eventLog"),
    ("spark.driver.memory", "1g"),
    ("spark.executor.memory", "1g"),
    ("spark.executor.cores", "1"),
])

In [None]:
# ----- Query 2 | SQL API

# Register the crime DataFrame as a temp view so the SQL text below can read it.
crime_data_df.createOrReplaceTempView("crime_data")

# Count the crimes whose premise is 'STREET' per part of day, busiest first.
# - The CASE tests `TIME OCC` directly instead of the `time` alias declared in
#   the same SELECT list: a sibling alias only resolves on Spark releases with
#   lateral column alias support, so the direct reference is the portable form.
# - The thresholds presumably treat `TIME OCC` as a numeric HHMM value
#   (e.g. 1730) -- TODO confirm against the dataset schema.
# - The two 'Night' ranges are parenthesised so the AND/OR grouping is explicit.
query_2_sql = """

WITH OnlyInStreet AS (
  SELECT
    `TIME OCC` as time,
    CASE
      WHEN `TIME OCC` >= 500 AND `TIME OCC` < 1200 THEN 'Morning'
      WHEN `TIME OCC` >= 1200 AND `TIME OCC` < 1700 THEN 'Noon'
      WHEN `TIME OCC` >= 1700 AND `TIME OCC` < 2100 THEN 'Afternoon'
      WHEN (`TIME OCC` >= 2100 AND `TIME OCC` < 2400)
        OR (`TIME OCC` >= 0 AND `TIME OCC` < 500) THEN 'Night'
    END AS PartOfDay
  FROM
    crime_data
  WHERE
    `Premis Desc`='STREET'
)

SELECT
  PartOfDay,
  COUNT(PartOfDay) as NumberOfCrimes
FROM
  OnlyInStreet
GROUP BY
  PartOfDay
ORDER BY
  NumberOfCrimes DESC;
"""

# show() only prints and returns None, so keep the DataFrame itself in
# crime_data_query and display it as a separate step.
crime_data_query = sc.sql(query_2_sql)
crime_data_query.show()

In [None]:
# ----- Query 3 | SQL API

# Expose the joined DataFrame to Spark SQL under the view name "joined_crime_data".
joined_crime_data_df.createOrReplaceTempView(name="joined_crime_data")

def query_3_sql(num, type):
  """Build the Query 3 SQL text for a single income rank.

  The statement reads the `joined_crime_data` view, drops rows whose
  `descent` is null or blank, dense-ranks the remaining rows by
  `median_income` in the requested direction, keeps only the rows at rank
  `num`, and counts them per `descent` (the count column is named `rnk`),
  largest count first.

  Args:
    num: Rank to keep; spliced unescaped after `rank=`.
    type: Sort direction for the ranking (e.g. 'desc'); spliced unescaped
      after `order by`.

  Returns:
    The SQL statement as a string.
  """
  # One entry per line of the statement; the empty entries reproduce the blank
  # lines at the start and the newline at the end of the text.
  statement_lines = [
    "",
    "",
    "  with ranked as (",
    "    select ",
    "      descent,",
    "      median_income,",
    f"      dense_rank() over (order by `median_income` {type}) as rank",
    "    from joined_crime_data",
    "    where ",
    "      descent is not null and trim(descent) != ''",
    "  ) ",
    "",
    "  select ",
    "    descent,",
    "    count(`median_income`) as rnk",
    "  from ",
    "    ranked",
    "  where",
    f"    rank={num}",
    "  group by descent",
    "  order by rnk desc;",
    "",
  ]
  return "\n".join(statement_lines)

# Print first three: descending order makes ranks 1-3 the highest median incomes
for i in range(1, 4):
  sc.sql(query_3_sql(i, 'desc')).show()

# Print last three: ascending order makes ranks 1-3 the lowest median incomes
# (this loop previously passed 'desc' and so repeated the first three results)
for i in range(1, 4):
  sc.sql(query_3_sql(i, 'asc')).show()