# Working with Rows

## Import Libraries

In [None]:
import os

import findspark

# findspark.init() locates the Spark installation and adds its Python
# libraries to sys.path. It therefore has to run BEFORE anything from pyspark
# is imported; otherwise the imports below fail on machines where pyspark is
# not already importable.
findspark.init()

import pyspark  # noqa: E402

# Display settings
from IPython.core.display import HTML  # noqa: E402
from pyspark import SparkConf  # noqa: E402
from pyspark.sql import SparkSession  # noqa: E402
from pyspark.sql.functions import col, lit, to_timestamp  # noqa: E402

## SparkSession

In [None]:
# Create (or reuse) a SparkSession that runs on the local master, configured
# from a default SparkConf.
spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

In [None]:
# Inject a CSS rule so <pre> output keeps its whitespace and is not wrapped
# in the notebook (keeps wide DataFrame.show() tables on one line).
display(
    HTML("<style>pre {white-space: pre !important; }</style>")
)

## Load the data

In [None]:
# file:// URI of the "data" directory under the current working directory,
# so Spark reads from the local filesystem.
data_path = f"file:///{os.getcwd()}/data"

file_path = f"{data_path}/reported-crimes.csv"

# Read the CSV using its header row for column names (no schema inference is
# requested, so columns arrive as strings), then parse "Date" into a timestamp.
#
# Keep every crime reported through the END of 11-Nov-2018. The cut-off is
# written as "< 2018-11-12" because a date-only literal compared against a
# timestamp column is treated as midnight of that day: "<= 2018-11-11" would
# silently drop everything reported after 00:00:00 on 11-Nov.
crimes_df = (
    spark.read.csv(file_path, header=True)
    .withColumn("Date", to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a"))
    .filter(col("Date") < lit("2018-11-12"))
)
crimes_df.show(5)

**Add the reported crimes for an additional day, 12-Nov-2018, to our dataset**

In [None]:
# All crimes reported on 12-Nov-2018.
#
# "Date" is a timestamp, so an equality test against "2018-11-12" would only
# match rows stamped exactly 00:00:00. A half-open range
# [2018-11-12 00:00:00, 2018-11-13 00:00:00) captures the whole day.
day_start = lit("2018-11-12")
day_end = lit("2018-11-13")

one_day = (
    spark.read.csv(file_path, header=True)
    .withColumn("Date", to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a"))
    .filter((col("Date") >= day_start) & (col("Date") < day_end))
)
one_day.count()

In [None]:
# Append the extra day's rows to the main dataset and preview the five most
# recent records. The combined result is only displayed, not stored.
(
    crimes_df.union(one_day)
    .orderBy(col("Date").desc())
    .show(5)
)

**What are the top 10 Primary Types by number of reported crimes, in descending order of occurrence?**

In [None]:
# Count crimes per "Primary Type" and show the ten most frequent types.
(
    crimes_df.groupBy("Primary Type")
    .count()
    .orderBy(col("count").desc())
    .show(10)
)

**What percentage of reported crimes resulted in an arrest?**

In [None]:
# List the distinct values present in the "Arrest" column to see how the
# flag is encoded before filtering on it.
(
    crimes_df.select(col("Arrest"))
    .distinct()
    .show()
)

In [None]:
# Print the column names and types of crimes_df. The CSV was read without
# schema inference, so apart from "Date" (converted with to_timestamp) the
# columns are expected to be strings -- which is why "Arrest" is compared
# against the string "true" rather than a boolean.
crimes_df.printSchema()

In [None]:
# Share of reported crimes that led to an arrest, as a percentage.
# "Arrest" holds string values, so the flag is matched against "true".
arrest_flag = crimes_df["Arrest"]
arrests = crimes_df.where(arrest_flag == "true").count()
total_reported = crimes_df.select(arrest_flag).count()
percentage_arrests = (arrests / total_reported) * 100
print(f"Percentage arrests: {percentage_arrests}")

**What are the top 3 locations for reported crimes?**

In [None]:
# Count crimes per "Location Description" and show the three most common
# locations.
(
    crimes_df.groupBy("Location Description")
    .count()
    .sort(col("count").desc())
    .show(3)
)