In [2]:
import os

# Switch to the project root (two levels above the notebook) exactly once.
# Without the guard, re-running this cell would climb two more directories
# each time and silently change every path derived below.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../../")
    os.chdir(PROJECT_ROOT)


# Root of the bronze-layer files, as an absolute path under the project root.
FILES_PATH = f"{PROJECT_ROOT}/data/bronze"

# One sub-directory per bronze dataset.
ARREST_PATH = os.path.join(FILES_PATH, "arrests")
CRIME_PATH = os.path.join(FILES_PATH, "crimes")
CODE_PATH = os.path.join(FILES_PATH, "iucr")



In [3]:
from pyspark.sql import SparkSession


# Configure the session step by step, then create it (or reuse a running one).
# The PostgreSQL JDBC driver jar is registered through "spark.jars"; its path
# is relative to the current working directory.
session_builder = SparkSession.builder.appName("silver-preparation")
session_builder = session_builder.config("spark.jars", "./artifacts/postgresql-42.7.7.jar")
spark = session_builder.getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/27 20:04:01 WARN Utils: Your hostname, Air-M4.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.65 instead (on interface en0)
25/09/27 20:04:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/09/27 20:04:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Crimes and arrests are read with recursive file lookup enabled;
# the IUCR code table is read with the default lookup.
crimes_df = spark.read.option("recursiveFileLookup", "true").parquet(CRIME_PATH)
arrest_df = spark.read.option("recursiveFileLookup", "true").parquet(ARREST_PATH)
code_df = spark.read.parquet(CODE_PATH)

In [7]:
from pyspark.sql.types import IntegerType, DoubleType, BooleanType
from pyspark.sql.functions import col, to_timestamp, coalesce, lit, trim, length, when

# Bronze column name -> silver column name.
crime_column_names = {
    "ID": "crime_id",
    "Case Number": "case_number",
    "Date": "crime_datetime",
    "Block": "block",
    "IUCR": "iucr_code",
    "Primary Type": "primary_type",
    "Description": "description",
    "Location Description": "location_description",
    "Arrest": "is_arrest",
    "Domestic": "is_domestic",
    "Beat": "beat_id",
    "District": "district_id",
    "Ward": "ward_id",
    "Community Area": "community_area_id",
    "FBI Code": "fbi_code",
    "X Coordinate": "x_coordinate",
    "Y Coordinate": "y_coordinate",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Location": "location_geo_point",
}

# Columns kept in the silver table, in output order.
crime_output_columns = [
    "crime_id", "case_number", "crime_datetime", "block", "iucr_code", "primary_type",
    "description", "location_description", "is_arrest", "is_domestic", "beat_id", "ward_id",
    "community_area_id", "fbi_code", "x_coordinate", "y_coordinate", "latitude", "longitude",
    "location_geo_point", "district_id",
]

# Target Spark type for every column that is cast.
crime_column_types = {
    "is_arrest": BooleanType(),
    "is_domestic": BooleanType(),
    "latitude": DoubleType(),
    "longitude": DoubleType(),
    "x_coordinate": DoubleType(),
    "y_coordinate": DoubleType(),
    "beat_id": IntegerType(),
    "district_id": IntegerType(),
    "ward_id": IntegerType(),
    "community_area_id": IntegerType(),
}

# All column replacements applied in a single withColumns call.
crime_column_exprs = {
    # Parse the 12-hour clock text with AM/PM marker into a timestamp.
    "crime_datetime": to_timestamp(col("crime_datetime"), "MM/dd/yyyy hh:mm:ss a"),
    **{
        column_name: col(column_name).cast(spark_type)
        for column_name, spark_type in crime_column_types.items()
    },
    # A missing location description becomes the literal "Unknown".
    "location_description": coalesce(col("location_description"), lit("Unknown")),
}

# Rows must carry coordinates, an IUCR code and a primary type to be kept.
crime_has_required_fields = (
    col("latitude").isNotNull()
    & col("longitude").isNotNull()
    & col("iucr_code").isNotNull()
    & col("primary_type").isNotNull()
)

crimes_renamed_df = crimes_df.withColumnsRenamed(crime_column_names)

crimes_silver_df = (
    crimes_renamed_df
    .select(*crime_output_columns)
    .withColumns(crime_column_exprs)
    .filter(crime_has_required_fields)
    # Keep a single row per crime_id.
    .dropDuplicates(["crime_id"])
)

In [9]:
def to_snake_case(name: str) -> str:
    """Convert a column name to lower-case ``snake_case``.

    An underscore is inserted before a capitalised word that follows another
    character (``CamelCase`` -> ``Camel_Case``) and between a lower-case
    letter or digit and a following upper-case letter (``getHTTP`` ->
    ``get_HTTP``). Spaces are then replaced by underscores and the result is
    lower-cased.

    Args:
        name: Original column name, e.g. ``"ARREST DATE"`` or ``"CamelCase"``.

    Returns:
        The snake_case form, e.g. ``"arrest_date"`` or ``"camel_case"``.
    """
    # Imported here so the helper works on a fresh kernel without relying on
    # some other cell having imported ``re`` beforehand.
    import re

    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
    return name.replace(" ", "_").lower()

# Explicit renames only; every remaining column gets its name from
# to_snake_case further down.
# NOTE(review): "charge_one_statue" looks like a typo for "statute". It is kept
# as-is because the column name is part of the table written downstream.
arrest_explicit_names = {
    "CB_NO": "arrest_id",
    "RACE": "arrestee_race",
    "CHARGE 1 STATUTE": "charge_one_statue",
    "CHARGE 1 DESCRIPTION": "charge_one_description",
    "CHARGE 1 TYPE": "charge_one_type",
}
arrests_renamed_df = arrest_df.withColumnsRenamed(arrest_explicit_names)

# snake_case version of every column name, in the existing column order.
arrest_snake_names = [to_snake_case(column_name) for column_name in arrests_renamed_df.columns]

# "F" -> felony, "M" -> misdemeanor, anything else (including null) -> other.
arrest_charge_code = col("charge_one_type")
arrest_charge_label = (
    when(arrest_charge_code == "F", "felony")
    .when(arrest_charge_code == "M", "misdemeanor")
    .otherwise("other")
)

# A usable case number is non-null and not blank after trimming.
arrest_case_number = col("case_number")
arrest_has_case_number = arrest_case_number.isNotNull() & (length(trim(arrest_case_number)) > 0)

arrests_silver_df = (
    arrests_renamed_df
    .toDF(*arrest_snake_names)
    .select(
        "arrest_id", "case_number", "arrest_date", "arrestee_race",
        "charge_one_statue", "charge_one_description", "charge_one_type",
    )
    .withColumns({
        # Parse the 12-hour clock text with AM/PM marker into a timestamp.
        "arrest_date": to_timestamp(col("arrest_date"), "MM/dd/yyyy hh:mm:ss a"),
        "charge_one_type": arrest_charge_label,
    })
    .filter(arrest_has_case_number)
    # Keep a single row per arrest_id.
    .dropDuplicates(["arrest_id"])
)

In [10]:
# Preview the first five cleaned arrest rows.
arrests_silver_df.show(n=5)



+---------+-----------+-------------------+--------------+--------------------+----------------------+---------------+
|arrest_id|case_number|        arrest_date| arrestee_race|   charge_one_statue|charge_one_description|charge_one_type|
+---------+-----------+-------------------+--------------+--------------------+----------------------+---------------+
| 18812380|   HX100068|2014-01-01 01:05:00|WHITE HISPANIC|720 ILCS 5.0/12-3...|  BATTERY - MAKE PH...|    misdemeanor|
| 18812401|   HX100135|2014-01-01 01:35:00|         BLACK|720 ILCS 5.0/12-2...|  AGG ASSAULT/POLIC...|    misdemeanor|
| 18812508|   HX100292|2014-01-01 04:35:00|         WHITE|720 ILCS 5.0/12-3...|  DOMESTIC BATTERY ...|    misdemeanor|
| 18812509|   HX100178|2014-01-01 02:55:00|         BLACK|720 ILCS 5.0/24-1...|  AGG UUW/VEH/FIR L...|         felony|
| 18812547|   HX100403|2014-01-01 07:45:00|         BLACK|720 ILCS 5.0/31-4...|  OBSTRUCTING IDENT...|    misdemeanor|
+---------+-----------+-------------------+-----

                                                                                

In [11]:
# Only IUCR is renamed explicitly; the other columns get snake_case names below.
codes_renamed_df = code_df.withColumnsRenamed({"IUCR": "iucr_code"})

code_snake_names = [to_snake_case(column_name) for column_name in codes_renamed_df.columns]

# Missing descriptions are replaced by the literal "Unknown".
code_description_defaults = {
    description_column: coalesce(col(description_column), lit("Unknown"))
    for description_column in ("primary_description", "secondary_description")
}

codes_silver_df = (
    codes_renamed_df
    .toDF(*code_snake_names)
    .withColumns(code_description_defaults)
    # Columns are not all cast to string because "active" is compared
    # against a boolean here.
    .filter(col("active") == lit(True))
    # Keep a single row per iucr_code, then project the output columns.
    .dropDuplicates(["iucr_code"])
    .select("iucr_code", "primary_description", "secondary_description", "index_code")
)

In [12]:
# Connection settings for the target PostgreSQL database.
# Each value is taken from the environment when the variable is set, so real
# credentials never have to be committed with the notebook. The fallbacks are
# the previous hardcoded local-development values.
DB_USER = os.environ.get("SILVER_DB_USER", "airflow")
DB_PASSWORD = os.environ.get("SILVER_DB_PASSWORD", "airflow")
JDBC_URL = os.environ.get("SILVER_JDBC_URL", "jdbc:postgresql://localhost:8432/final")
from pyspark.sql import DataFrame


def write_psql(
        writeable_df: DataFrame,
        *,
        table: str,
        connection_str: str,
        username: str,
        password: str,
        mode: str = "overwrite"
) -> None:
    """Write a DataFrame to a PostgreSQL table over JDBC.

    Args:
        writeable_df: DataFrame whose rows are written.
        table: Name of the destination table.
        connection_str: JDBC URL of the database.
        username: Database user passed to the driver.
        password: Password for ``username``.
        mode: Save mode handed to the JDBC writer; defaults to ``"overwrite"``.
    """
    jdbc_properties = dict(
        user=username,
        password=password,
        driver="org.postgresql.Driver",
    )
    jdbc_writer = writeable_df.write
    jdbc_writer.jdbc(
        url=connection_str,
        table=table,
        mode=mode,
        properties=jdbc_properties,
    )

# (DataFrame, destination table, save mode), written in this order.
# Arrests and crimes are appended because pyspark has no "merge" save mode.
# NOTE(review): with "append", re-running this cell presumably inserts the
# same rows again unless the tables enforce uniqueness. Confirm this is intended.
silver_write_targets = [
    (codes_silver_df, "silver_ucr_codes", "overwrite"),
    (arrests_silver_df, "silver_arrests", "append"),
    (crimes_silver_df, "silver_crimes", "append"),
]

for silver_df, table_name, save_mode in silver_write_targets:
    write_psql(
        silver_df,
        table=table_name,
        connection_str=JDBC_URL,
        username=DB_USER,
        password=DB_PASSWORD,
        mode=save_mode,
    )



                                                                                