# Working with Joins

## Imports

In [None]:
import os

import findspark

# findspark has to run before anything is imported from pyspark: init() locates
# the Spark installation and makes its Python libraries importable. Calling it
# after `import pyspark` is too late on machines where pyspark is not already
# on sys.path.
findspark.init()

import matplotlib.pyplot as plt
import pandas as pd
import pyspark

# Set display for scrolling dataframes
from IPython.core.display import HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, dayofweek, lit, lpad, to_timestamp

## SparkSession

In [None]:
# Obtain the SparkSession used by every cell below; getOrCreate() returns the
# already-running session when there is one instead of starting a second.
spark = SparkSession.builder.getOrCreate()

## Load the data

Load the Chicago reported crimes dataset.

In [None]:
# Inject CSS so <pre> output keeps its whitespace instead of wrapping; wide
# DataFrame.show() tables then scroll horizontally rather than folding.
display(HTML("<style>pre {white-space: pre !important; }</style>"))

In [None]:
# Data lives in ./data next to the notebook; Spark is handed a file:// URI.
data_path = f"file:///{os.getcwd()}/data"
file_path = f"{data_path}/reported-crimes.csv"

# Read the CSV with its header row, parse the "Date" strings into timestamps
# and keep only the rows dated on or before the 2018-11-11 cut-off.
crimes_df = (
    spark.read.csv(file_path, header=True)
    .withColumn("Date", to_timestamp("Date", "M/d/y h:m:s a"))
    .where(col("Date") <= lit("2018-11-11"))
)

crimes_df.show(n=5)

## Download police station data

In addition to the reported crimes dataset, we are going to download the police station dataset from the City of Chicago Data Portal. The data can be downloaded on the command line using wget:

`$ wget -O data/police-stations.csv 'https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD'`

In [None]:
# Shell escape: download the police-stations CSV export and write it to
# data/police-stations.csv (-O names the output file).
!wget -O data/police-stations.csv 'https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD'

In [None]:
# Read the downloaded police-stations CSV, treating its first line as the header.
file_path = f"{data_path}/police-stations.csv"

police_station_df = spark.read.option("header", "true").csv(file_path)

police_station_df.show(n=5)

**The reported crimes dataset contains only the district number. Add the district name by joining with the police station dataset.**

In [None]:
# Optional (left disabled): uncomment to mark crimes_df for caching and run
# count() as an action so the cache is populated before the cells below.
# crimes_df.cache()
# crimes_df.count()

In [None]:
# Peek at how district identifiers are written in the police-station data.
police_station_df.select("District").distinct().show(n=5)

In [None]:
# Same check on the crimes side, to compare the two district formats.
crimes_df.select(col("District")).distinct().show(n=5)

In [None]:
# Preview the station district left-padded with "0" to a width of 3.
police_station_df.select(lpad("District", 3, "0")).show(n=30)

In [None]:
# Store the zero-padded district as "Format_district"; it is the join key used
# against crimes_df further down.
police_station_df = police_station_df.withColumn(
    colName="Format_district",
    col=lpad("District", 3, "0"),
)

In [None]:
# Confirm the new Format_district column is present.
police_station_df.show(n=5)

In [None]:
# Left outer join: every crime row is kept, with station details attached where
# the crime's District equals the station's zero-padded Format_district.
crimes_df.join(
    other=police_station_df,
    on=crimes_df["District"] == police_station_df["Format_district"],
    how="left_outer",
).show(n=10)

In [None]:
# List the police-station column names, to decide which ones to drop after the join.
police_station_df.columns

In [None]:
# Station columns that add nothing once the district name has been joined on.
# `drop_columns` was previously referenced without ever being defined, which
# raised NameError on a fresh kernel.
# NOTE(review): these names are assumed from the Chicago police-stations
# export; compare with the `police_station_df.columns` output and adjust.
# DataFrame.drop() ignores names that are not in the schema, so a wrong guess
# leaves the column in place rather than failing.
# Coordinate/location columns are deliberately not listed: a name-based drop
# after the join would presumably also remove same-named crimes_df columns.
drop_columns = [
    "ADDRESS",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "PHONE",
    "FAX",
    "TTY",
]

(
    crimes_df.join(
        police_station_df,
        crimes_df.District == police_station_df.Format_district,
        "left_outer",
    )
    .drop(*drop_columns)
    .show(5)
)

## Challenge Questions

In [None]:
# How many distinct primary crime types are there?
crimes_df.select("Primary Type").distinct().count()

In [None]:
# List the distinct primary types alphabetically, untruncated (up to 36 rows).
(
    crimes_df.select("Primary Type")
    .distinct()
    .orderBy("Primary Type")
    .show(n=36, truncate=False)
)

In [None]:
# The non-criminal category appears under three different spellings; keep the
# rows whose primary type matches any of them.
non_criminal_df = crimes_df.where(
    col("Primary Type").isin(
        "NON - CRIMINAL",
        "NON-CRIMINAL",
        "NON-CRIMINAL (SUBJECT SPECIFIED)",
    )
)

non_criminal_df.show(n=10)

In [None]:
# Count non-criminal reports per description, most frequent first.
(
    non_criminal_df.groupBy("Description")
    .count()
    .orderBy(col("count").desc())
    .show(truncate=False)
)

**Which day of the week has the highest number of reported crimes?**

In [None]:
# Refresher on what the crimes rows look like before deriving the weekday.
crimes_df.show(n=5)

In [None]:
# Numeric day of the week alongside the timestamp it is derived from.
crimes_df.select("Date", dayofweek("Date")).show(n=5)

In [None]:
# Add the "E" date_format output (day-of-week text) next to the numeric value.
(
    crimes_df.select(
        "Date",
        dayofweek("Date"),
        date_format("Date", "E"),
    ).show(n=5)
)

In [None]:
# Reported crimes per day of the week, largest count first.
(
    crimes_df.groupBy(date_format("Date", "E"))
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [None]:
# Collect the per-day counts to the driver as a list of Row objects.
day_counts = (
    crimes_df.groupBy(date_format("Date", "E"))
    .count()
    .collect()
)

day_counts

In [None]:
# First field of each collected row: the day label.
dow = [day_label for day_label, _ in day_counts]
dow

In [None]:
# Second field of each collected row: the number of reported crimes.
cnt = [n_reports for _, n_reports in day_counts]
cnt

In [None]:
# Combine the two parallel lists into a small pandas frame for plotting.
count_df = pd.DataFrame(data=dict(Day_of_week=dow, Count=cnt))
count_df.head()

In [None]:
# Bar chart of reported crimes per day of the week, busiest day first.
# DataFrame.plot returns the Axes it drew on, so labels and title are set on
# that object directly instead of through the pyplot "current axes" state.
ax = count_df.sort_values("Count", ascending=False).plot(
    kind="bar", color="olive", x="Day_of_week", y="Count"
)

ax.set_xlabel("Day of the week")
ax.set_ylabel("No. of reported crimes")

# crimes_df is limited to dates on or before 2018-11-11 when it is loaded, so
# the title names that cut-off instead of claiming "to present".
# y=1.1 lifts the title above the axes; passing it to set_title keeps the
# offset on Matplotlib versions that otherwise position the title automatically.
ax.set_title(
    "No. of reported crimes per day of the week from 2001 to 2018-11-11",
    y=1.1,
)

plt.show()