# Exploratory Data Analysis (EDA)

Use this notebook to explore the data generated by the pipeline in your preferred programming language.

**Note**: This notebook is not executed as part of the pipeline.

In [0]:
import sys

# Folder added to the module search path. Presumably it holds the pipeline's
# table-definition modules; nothing in this notebook imports from it, so
# TODO confirm the entry is still needed.
TABLES_DIR = "/Workspace/capstone_project/Tables/"

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if TABLES_DIR not in sys.path:
    sys.path.append(TABLES_DIR)

## 1. Bronze Layer Inspection


### 1.1 Load Bronze Table

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType, FloatType
import pyspark.sql.functions as F

In [0]:
# Prefix prepended to table names when loading tables in this notebook.
catalog_name = 'workspace.capstone_project'

# Read the bronze table and preview its first 10 rows.
bronze_table_name = f'{catalog_name}.nyc_fire_incidents_bronze'
df_bronze = spark.read.table(bronze_table_name)
display(df_bronze.limit(10))

In [0]:
# Print the bronze DataFrame's schema (column names and data types).
df_bronze.printSchema()

In [0]:
# Total number of rows in the bronze table.
bronze_row_count = df_bronze.count()
print("Rows:", bronze_row_count)

### 1.2 Basic Profiling Bronze

#### 1.2.1 Check Missing Values

In [0]:
# Column-level missingness: percentage of NULL values in each bronze column.
n = df_bronze.count()
missing = df_bronze.select([
    F.round(
        F.sum(F.col(c).isNull().cast("int")) / F.lit(n) * 100,
        2,  # number of decimal places
    ).alias(c)
    for c in df_bronze.columns
])

# `missing` is a single-row aggregate; collect it to pandas once and transpose
# it into one row per source column for display.
missing_pdf = (
    missing.toPandas().T.reset_index()
    .rename(columns={"index": "column", 0: "percent_missing"})
)
display(missing_pdf)

# Count affected columns from the collected frame. Calling `missing.count()`
# would always return 1 (the aggregate has exactly one row) and would re-run
# the full scan.
# Note: percentages are rounded to 2 decimals, so a column with less than
# 0.005% missing shows as 0 and is not counted here.
print(
    "Columns with missing values:",
    int((missing_pdf["percent_missing"] > 0).sum()),
    "of",
    len(missing_pdf),
)


In [0]:
# NULL counts for the three key columns of the bronze table.
# Maps output column name -> source column to check.
bronze_null_checks = {
    "null_incident_datetime": "INCIDENT_DATETIME",
    "null_on_scene": "FIRST_ON_SCENE_DATETIME",
    "null_response_seconds": "INCIDENT_RESPONSE_SECONDS_QY",
}
bronze_key_nulls = df_bronze.select([
    F.sum(F.col(source_col).isNull().cast("int")).alias(out_name)
    for out_name, source_col in bronze_null_checks.items()
])
display(bronze_key_nulls)

#### 1.2.2 Duplicate Incident ID Check

In [0]:
incident_id_col = "STARFIRE_INCIDENT_ID"

# Compare the total row count with the number of distinct incident IDs.
id_totals = df_bronze.agg(
    F.count("*").alias("total_rows"),
    F.countDistinct(incident_id_col).alias("distinct_incidents"),
)
display(id_totals)

# List every incident ID that occurs more than once, most frequent first.
repeated_ids = (
    df_bronze.groupBy(incident_id_col)
    .count()
    .where(F.col("count") > 1)
    .sort(F.col("count").desc())
)
display(repeated_ids)


## 2. Silver Layer (Clean Incident Table Validation)

### 2.1 Load and Profile Table

In [0]:
# Read the silver table, report its size, and preview its first 5 rows.
silver_table_name = f'{catalog_name}.nyc_fire_incidents_silver'
df_silver = spark.read.table(silver_table_name)

silver_row_count = df_silver.count()
print("Silver Row Count:", silver_row_count)
display(df_silver.limit(5))

### 2.2 Check Missing Values

In [0]:
# Column-level missingness: percentage of NULL values in each silver column.
n = df_silver.count()

missing = df_silver.select([
    F.round(
        F.sum(F.col(c).isNull().cast("int")) / F.lit(n) * 100,
        2,  # number of decimal places
    ).alias(c)
    for c in df_silver.columns
])

# `missing` is a single-row aggregate; collect it to pandas once and transpose
# it into one row per source column for display.
missing_pdf = (
    missing.toPandas().T.reset_index()
    .rename(columns={"index": "column", 0: "percent_missing"})
)
display(missing_pdf)

# Count affected columns from the collected frame. Calling `missing.count()`
# would always return 1 (the aggregate has exactly one row) and would re-run
# the full scan.
# Note: percentages are rounded to 2 decimals, so a column with less than
# 0.005% missing shows as 0 and is not counted here.
print(
    "Silver columns with missing values:",
    int((missing_pdf["percent_missing"] > 0).sum()),
    "of",
    len(missing_pdf),
)

In [0]:
# NULL counts for the three key columns of the silver table.
# Maps output column name -> source column to check.
silver_null_checks = {
    "null_incident_datetime": "INCIDENT_DATETIME",
    "null_on_scene": "FIRST_ON_SCENE_DATETIME",
    "null_response_seconds": "INCIDENT_RESPONSE_SECONDS_QY",
}
silver_key_nulls = df_silver.select([
    F.sum(F.col(source_col).isNull().cast("int")).alias(out_name)
    for out_name, source_col in silver_null_checks.items()
])
display(silver_key_nulls)

Missing vs valid response-time count


In [0]:
# Count rows with and without a recorded response time.
response_seconds = F.col("INCIDENT_RESPONSE_SECONDS_QY")
response_time_counts = df_silver.select(
    F.sum(response_seconds.isNotNull().cast("int")).alias("valid_response_time"),
    F.sum(response_seconds.isNull().cast("int")).alias("missing_response_time"),
)
display(response_time_counts)

### 2.3 Response Time Sanity Check

In [0]:
# Min, approximate p50/p90/p95, and max of the response-time column.
resp_col = "INCIDENT_RESPONSE_SECONDS_QY"
percentile_aliases = [
    (0.5, "p50_resp_sec"),
    (0.9, "p90_resp_sec"),
    (0.95, "p95_resp_sec"),
]

df_resp_sec = df_silver.agg(
    F.min(resp_col).alias("min_resp_sec"),
    *[
        F.expr(f"percentile_approx({resp_col}, {quantile})").alias(alias)
        for quantile, alias in percentile_aliases
    ],
    F.max(resp_col).alias("max_resp_sec"),
)

# Transpose the single result row so each statistic is shown on its own line.
resp_sec_summary = df_resp_sec.toPandas().T.reset_index()
display(resp_sec_summary)

### 2.4 Check validity flags

In [0]:
# Row count for each value of the response-time validity flag.
validity_flag_counts = df_silver.groupBy("VALID_INCIDENT_RSPNS_TIME_INDC").count()
display(validity_flag_counts)

### 2.5 Borough distribution

In [0]:
# Row count per borough, largest first.
borough_counts = (
    df_silver.groupBy("INCIDENT_BOROUGH")
    .count()
    .sort(F.col("count").desc())
)
display(borough_counts)

### 2.6 Unique incident category inspection (for harmonization)

In [0]:
# Row count per incident classification group, largest first.
classification_group_counts = (
    df_silver.groupBy("INCIDENT_CLASSIFICATION_GROUP")
    .count()
    .sort(F.col("count").desc())
)
display(classification_group_counts)

In [0]:
# Row count per incident classification, largest first.
classification_counts = (
    df_silver.groupBy("INCIDENT_CLASSIFICATION")
    .count()
    .sort(F.col("count").desc())
)
display(classification_counts)

## 3. Gold Layer (Feature Table Check)

In [0]:
# Build the table name from `catalog_name` (set in the bronze-load cell), as
# the silver load does, so all three layers are read from the same namespace
# and it only has to be changed in one place.
df_gold = spark.read.table(f'{catalog_name}.nyc_fire_incidents_gold')


### 3.1 Basic Sanity Check

In [0]:
# Report the size of the gold table and preview its first 10 rows.
gold_row_count = df_gold.count()
print("Gold Row Count:", gold_row_count)

gold_preview = df_gold.limit(10)
display(gold_preview)

In [0]:
# Print the gold DataFrame's schema (column names and data types).
df_gold.printSchema()

### 3.2 Censoring structure (survival readiness)
Purpose: verify that the Gold table has the event/censoring structure required for survival analysis.

`event_indicator` is stored as a boolean in the Gold table (`True` = arrival observed, `False` = right-censored). For some survival libraries, it may be cast to 0/1 integers at modeling time without changing its meaning.


In [0]:
# Row count for each value of `event_indicator`.
event_indicator_counts = df_gold.groupBy("event_indicator").count()
display(event_indicator_counts)

Interpretation:

True → arrival observed

False → right-censored

### 3.3 Final missingness check (Gold validation)

In [0]:
# Column-level missingness: percentage of NULL values in each gold column.
n = df_gold.count()

missing = df_gold.select([
    F.round(
        F.sum(F.col(c).isNull().cast("int")) / F.lit(n) * 100,
        2,  # number of decimal places
    ).alias(c)
    for c in df_gold.columns
])

# `missing` is a single-row aggregate; collect it to pandas once and transpose
# it into one row per source column for display.
missing_pdf = (
    missing.toPandas().T.reset_index()
    .rename(columns={"index": "column", 0: "percent_missing"})
)
display(missing_pdf)

# Count affected columns from the collected frame. Calling `missing.count()`
# would always return 1 (the aggregate has exactly one row) and would re-run
# the full scan. The label names the Gold table, which is what this cell checks.
# Note: percentages are rounded to 2 decimals, so a column with less than
# 0.005% missing shows as 0 and is not counted here.
print(
    "Gold columns with missing values:",
    int((missing_pdf["percent_missing"] > 0).sum()),
    "of",
    len(missing_pdf),
)