In [None]:
# ---- Query 4 | Dataframe API ----

# Pyspark Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, udf, year, avg
import geopy.distance

# Spark Session | Queries
# Reuse the active session if one exists, otherwise start a new one.
# Despite the conventional SparkContext-style name, `sc` holds a SparkSession.
sc = (
    SparkSession.builder
    .appName("Query 4 - Dataframe API")
    .getOrCreate()
)

In [None]:

# Crime Data DF
# Read the crime CSV from HDFS (header row, inferred column types) and
# change the coordinate column types to double in the same chain.
crime_data_df = (
    sc.read.format('csv')
    .options(header='true', inferSchema=True)
    .load("hdfs://okeanos-master:54310/user/data/primary/crime_data")
    .withColumn('LAT', col('LAT').cast('double'))
    .withColumn('LON', col('LON').cast('double'))
)

# Police Stations DF
# Read the LAPD police stations CSV from HDFS (header row, inferred types).
police_stations_df = (
    sc.read.format('csv')
    .options(header='true', inferSchema=True)
    .load("hdfs://okeanos-master:54310/user/data/secondary/LAPD_Police_Stations.csv")
)

# Change Column Types and Select with aliased names.
# The stations file uses GIS x/y order: X is the longitude and Y is the
# latitude. The previous mapping had them swapped (X -> lat, Y -> lon), which
# hands the distance function a "latitude" outside [-90, 90].
# NOTE(review): confirm against the CSV that X holds values around -118
# (longitude) and Y values around 34 (latitude).
police_stations_df = police_stations_df.select(
    col("Y").cast("double").alias("police_station_lat"),
    col("X").cast("double").alias("police_station_lon"),
    col("PREC").alias("police_station"),
)

# Join Data
# Project the crime rows down to the columns this query needs, deriving the
# report year and an integer station key, then match every crime to its
# police station on the shared "police_station" column (default join type).
joined_police_station_df = (
    crime_data_df
    .select(
        year('Date Rptd').alias('Year'),
        col("Weapon Used Cd").alias("weapon"),
        col("LAT").alias("crime_lat"),
        col("LON").alias("crime_lon"),
        col("AREA").cast('int').alias("police_station"),
    )
    .join(police_stations_df, on="police_station")
)

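# # Distance Function (disabled): hand-written haversine distance in km, kept for reference only; the geopy-based get_distance below is the one in use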
# def haversine(lat1, lon1, lat2, lon2):
#     R = 6371.0
#     lat1_rad, lon1_rad, lat2_rad, lon2_rad = radians(lat1), radians(lon1), radians(lat2), radians(lon2)
#     dlat, dlon = lat2_rad - lat1_rad, lon2_rad - lon1_rad
#     a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
#     c = 2 * atan2(sqrt(a), sqrt(1 - a))
#     return R * c

def get_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two points.

    Args:
        lat1, lon1: latitude and longitude of the first point.
        lat2, lon2: latitude and longitude of the second point.

    Returns:
        The distance in kilometres as reported by geopy's ``geodesic``, or
        ``None`` when any coordinate is missing.
    """
    # Spark passes None for null column values. A missing coordinate cannot
    # produce a meaningful distance, so propagate the null instead of letting
    # geopy guess or raise inside the executor.
    if any(value is None for value in (lat1, lon1, lat2, lon2)):
        return None
    return geopy.distance.geodesic((lat1, lon1), (lat2, lon2)).km

# Expose get_distance as a Spark UDF.
# The return type is declared explicitly: without it `udf` defaults to a
# string return type, so the distances would come back as strings rather
# than numbers that can be averaged directly.
distance = udf(get_distance, returnType="double")

# Keep only the crimes for which a weapon code was recorded.
# Comparing against the literal string 'NULL' is not a null test: a comparison
# that involves a real NULL evaluates to NULL, and if schema inference typed
# the weapon code as a number the literal cannot be cast, so no row passes.
# isNotNull() states the intent directly and works for any column type.
filtered_df = (
    joined_police_station_df
    .filter(col('weapon').isNotNull())
    # Disabled aggregation steps, kept as in the original notebook:
    # .withColumn("Distance", distance(col("crime_lat"), col("crime_lon"), col("police_station_lat"), col("police_station_lon")))
    # .groupBy(col('Year')).agg(count('*').alias('#'), avg('Distance').alias('average_distance'))
    # .orderBy(col('Year').asc())
    # .withColumn('average_distance', (col('average_distance')))
    # .select(col('Year'), col('average_distance'), col('#'))
)

filtered_df.limit(100).show()

In [None]:
# ----- Query 2 | SQL API

# Make the crime DataFrame queryable from Spark SQL as "crime_data".
crime_data_df.createOrReplaceTempView("crime_data")

# Count street crimes per part of day, most frequent first.
# The CASE expression reads `TIME OCC` directly: an alias defined in the same
# SELECT list (`time`) cannot be referenced by a sibling expression on Spark
# versions without lateral column alias support.
# The statement carries no trailing ';' because older Spark parsers reject it.
query_2_sql = """
WITH OnlyInStreet AS (
  SELECT
    `TIME OCC` AS time,
    CASE
      WHEN `TIME OCC` >= 500 AND `TIME OCC` < 1200 THEN 'Morning'
      WHEN `TIME OCC` >= 1200 AND `TIME OCC` < 1700 THEN 'Noon'
      WHEN `TIME OCC` >= 1700 AND `TIME OCC` < 2100 THEN 'Afternoon'
      WHEN (`TIME OCC` >= 2100 AND `TIME OCC` < 2400)
        OR (`TIME OCC` >= 0 AND `TIME OCC` < 500) THEN 'Night'
    END AS PartOfDay
  FROM
    crime_data
  WHERE
    `Premis Desc` = 'STREET'
)

SELECT
  PartOfDay,
  COUNT(PartOfDay) AS NumberOfCrimes
FROM
  OnlyInStreet
GROUP BY
  PartOfDay
ORDER BY
  NumberOfCrimes DESC
"""

# Keep the result DataFrame in the variable; `.show()` only prints and
# returns None, so it is called as a separate step.
crime_data_query = sc.sql(query_2_sql)
crime_data_query.show()

In [None]:
# ----- Query 3 | SQL API

# Register joined_crime_data_df as the temp view "joined_crime_data" so the
# SQL text below can select from it.
# NOTE(review): `joined_crime_data_df` is not defined in any cell visible
# here; it has to come from an earlier cell. Confirm the notebook still runs
# from a fresh kernel (Restart & Run All).
joined_crime_data_df.createOrReplaceTempView("joined_crime_data")

def query_3_sql(num, type):
  """Build the Query 3 SQL text for one median-income rank.

  The query dense-ranks the rows of the `joined_crime_data` view by
  `median_income`, keeps the rows at rank `num` whose `descent` is neither
  null nor blank, and counts them per `descent` (largest count first).

  Args:
    num: rank to select (1 is the first rank in the chosen direction).
      Must be convertible to an integer.
    type: sort direction for the ranking, 'asc' or 'desc' (any letter case,
      surrounding whitespace ignored). The parameter name shadows the
      builtin `type` and is kept only for caller compatibility.

  Returns:
    The SQL statement as a string, without a trailing ';'.

  Raises:
    ValueError: if `type` is not 'asc'/'desc' or `num` is not an integer.
  """
  # Both values are spliced into the SQL text, so validate them here and fail
  # with a clear message instead of an opaque parser error from Spark.
  direction = str(type).strip().lower()
  if direction not in ('asc', 'desc'):
    raise ValueError(f"type must be 'asc' or 'desc', got {type!r}")
  try:
    rank_value = int(num)
  except (TypeError, ValueError):
    raise ValueError(f"num must be an integer rank, got {num!r}") from None

  return f"""

  with ranked as (
    select 
      descent,
      median_income,
      dense_rank() over (order by `median_income` {direction}) as rank
    from joined_crime_data
    where 
      descent is not null and trim(descent) != ''
  ) 

  select 
    descent,
    count(`median_income`) as rnk
  from 
    ranked
  where
    rank = {rank_value}
  group by descent
  order by rnk desc
"""

# Print first three: ranks 1-3 with median income ranked in descending order
for i in range(1, 4):
  sc.sql(query_3_sql(i, 'desc')).show()

# Print last three: ranks 1-3 with median income ranked in ascending order.
# This loop previously passed 'desc' as well, so it repeated the top three.
# NOTE(review): if `median_income` contains nulls, check where the ascending
# ranking places them before trusting rank 1 here.
for i in range(1, 4):
  sc.sql(query_3_sql(i, 'asc')).show()