In [1]:
import pyspark.sql
from pyspark.sql.functions import (
    expr,
    col,
    to_timestamp,
    format_string,
    regexp_extract,
    datediff,
    current_timestamp,
    when,
    max,
    lit,
)

In [2]:
def get_311_data(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """Load the raw 311 case records from ``data/case.csv``.

    The file is read with a header row and Spark's schema inference enabled.
    The ``SLA_due_date`` column is renamed to ``case_due_date``.

    Args:
        spark: The session used to read the CSV file.

    Returns:
        The case DataFrame with the due-date column renamed.
    """
    print("[wrangle.py] reading case.csv")
    raw_cases = spark.read.csv("data/case.csv", header=True, inferSchema=True)
    renamed_cases = raw_cases.withColumnRenamed("SLA_due_date", "case_due_date")
    return renamed_cases

In [3]:
def handle_dtypes(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """Normalize column types on the case DataFrame.

    ``case_closed`` and ``case_late`` are replaced by the result of comparing
    their value to ``"YES"``, and ``council_district`` is cast to a string.

    Args:
        df: Case DataFrame containing the three columns above.

    Returns:
        A new DataFrame with the converted columns.
    """
    print("[wrangle.py] handling data types")

    # Each flag column is overwritten with its "== YES" comparison.
    yes_no_flags = (
        ("case_closed", expr('case_closed == "YES"')),
        ("case_late", expr('case_late == "YES"')),
    )

    converted = df
    for flag_name, is_yes in yes_no_flags:
        converted = converted.withColumn(flag_name, is_yes)

    district_as_text = col("council_district").cast("string")
    return converted.withColumn("council_district", district_as_text)

In [4]:
def handle_dates(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """Parse the case date columns into timestamps.

    ``case_opened_date``, ``case_closed_date`` and ``case_due_date`` are each
    overwritten with ``to_timestamp`` applied using the pattern ``M/d/yy H:mm``.

    Args:
        df: Case DataFrame containing the three date columns.

    Returns:
        A new DataFrame with the date columns parsed.
    """
    print("[wrangle.py] parsing dates")
    fmt = "M/d/yy H:mm"

    parsed = df
    for date_column in ("case_opened_date", "case_closed_date", "case_due_date"):
        parsed = parsed.withColumn(date_column, to_timestamp(date_column, fmt))
    return parsed

In [5]:
def add_features(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """Derive additional columns on the case DataFrame.

    Columns added or overwritten, in order:

    * ``num_weeks_late``: ``num_days_late`` divided by 7.
    * ``council_district``: cast to int and formatted with ``%03d``.
    * ``zipcode``: the run of digits at the end of ``request_address``.
    * ``case_age``: ``datediff`` from ``case_opened_date`` to the latest
      ``case_closed_date`` found in ``df``.
    * ``days_to_closed``: ``datediff`` from ``case_opened_date`` to
      ``case_closed_date``.
    * ``case_lifetime``: ``case_age`` when the case is not closed, otherwise
      ``days_to_closed``.

    Args:
        df: Case DataFrame; ``case_closed`` must already be boolean and the
            date columns must already be parsed.

    Returns:
        A new DataFrame with the columns above.
    """
    print("[wrangle.py] adding features")

    # This runs a Spark job immediately: the latest close date is pulled back
    # to the driver so it can be used as a literal reference date below.
    latest_close = df.select(max("case_closed_date")).first()[0]

    # NOTE(review): a null council_district may come out of format_string as
    # the literal text "null" rather than a null value -- confirm.
    derived_columns = [
        ("num_weeks_late", expr("num_days_late / 7 AS num_weeks_late")),
        (
            "council_district",
            format_string("%03d", col("council_district").cast("int")),
        ),
        ("zipcode", regexp_extract("request_address", r"\d+$", 0)),
        ("case_age", datediff(lit(latest_close), "case_opened_date")),
        ("days_to_closed", datediff("case_closed_date", "case_opened_date")),
        (
            "case_lifetime",
            # Refers to case_age / days_to_closed by name; both are added
            # earlier in this list, so order matters.
            when(expr("! case_closed"), col("case_age")).otherwise(
                col("days_to_closed")
            ),
        ),
    ]

    enriched = df
    for column_name, column_expr in derived_columns:
        enriched = enriched.withColumn(column_name, column_expr)
    return enriched

In [6]:
def join_departments(
    case_df: pyspark.sql.DataFrame, spark: pyspark.sql.SparkSession
) -> pyspark.sql.DataFrame:
    """Left-join department information from ``data/dept.csv`` onto the cases.

    The join key is ``dept_division``. After the join, ``dept_division`` and
    ``dept_name`` are dropped, ``standardized_dept_name`` is renamed to
    ``department``, and ``dept_subject_to_SLA`` is replaced by its comparison
    to ``"YES"``.

    Args:
        case_df: Case DataFrame containing a ``dept_division`` column.
        spark: The session used to read the department CSV file.

    Returns:
        The joined DataFrame.
    """
    print("[wrangle.py] joining departments")
    dept_df = spark.read.csv("data/dept.csv", header=True, inferSchema=True)

    joined = case_df.join(dept_df, "dept_division", "left")

    # Only the standardized name (far fewer distinct values) and the SLA flag
    # are carried forward from the department table.
    trimmed = (
        joined.drop(dept_df.dept_division)
        .drop(dept_df.dept_name)
        .drop(case_df.dept_division)
    )

    renamed = trimmed.withColumnRenamed("standardized_dept_name", "department")

    # Convert the YES/NO text flag to a boolean.
    sla_is_yes = col("dept_subject_to_SLA") == "YES"
    return renamed.withColumn("dept_subject_to_SLA", sla_is_yes)

In [7]:
def wrangle_311(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """Run the full 311 wrangling pipeline.

    Stages, in order: read the cases, fix data types, parse dates, add
    derived features, then join the department information.

    Args:
        spark: The session used for both CSV reads.

    Returns:
        The fully prepared case DataFrame.
    """
    raw_cases = get_311_data(spark)
    typed_cases = handle_dtypes(raw_cases)
    dated_cases = handle_dates(typed_cases)
    featured_cases = add_features(dated_cases)
    return join_departments(featured_cases, spark)

In [10]:
# Reuse the active SparkSession (or start a new one), then run the whole
# wrangle pipeline; each stage prints a progress line as it runs.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
df = wrangle_311(spark)

[wrangle.py] reading case.csv
[wrangle.py] handling data types
[wrangle.py] parsing dates
[wrangle.py] adding features
[wrangle.py] joining departments


In [12]:
# Preview the derived zipcode column (show() prints only the first 20 rows).
# A blank entry means request_address did not end in digits.
df.select('zipcode').show()

+-------+
|zipcode|
+-------+
|  78207|
|  78223|
|  78223|
|  78223|
|  78228|
|       |
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
|  78251|
+-------+
only showing top 20 rows



In [21]:
# head() with no argument returns the first row as a single Row object.
row = df.select('service_request_type', 'department').head()
row

Row(service_request_type='Stray Animal', department='Animal Care Services')

In [22]:
# Row fields can be read as attributes by column name.
row.service_request_type

'Stray Animal'

In [24]:
# `max` is pyspark.sql.functions.max (imported in the first cell), not the
# Python builtin. This also rebinds `row` to a new single-column Row.
row = df.select(max('case_closed_date')).head()
# Read the value by position because the aggregate column is not aliased.
row[0]

datetime.datetime(2018, 8, 8, 10, 38)

In [26]:
# Preview the derived case_lifetime column (show() prints only the first 20 rows).
df.select('case_lifetime').show()

+-------------+
|case_lifetime|
+-------------+
|            0|
|            2|
|            1|
|            1|
|            0|
|            0|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
|            1|
+-------------+
only showing top 20 rows

