# Auto reload modules

In [None]:
%load_ext autoreload
%autoreload 2

# Directories

In [3]:
# import findspark
# findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import fs

In [4]:
spark = (SparkSession.builder
  .master('local[*]')
  .appName('hello_world_spark')
  .getOrCreate())

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

24/10/23 20:27:12 WARN Utils: Your hostname, ThinkPad-X1-Nano resolves to a loopback address: 127.0.1.1; using 192.168.68.130 instead (on interface wlp0s20f3)
24/10/23 20:27:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/23 20:27:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


You are working with 1 core(s)


In [5]:
sc = spark.sparkContext

In [6]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [7]:
RAW_DIR = fs.open_fs("../../data/raw")
CSV_DIR = RAW_DIR.getsyspath("sf-fire-calls.csv")

In [8]:
filePath = CSV_DIR

rawDF = spark.read.csv(filePath,
                       header="true",
                       inferSchema="true",
                       multiLine="true",
                       escape='"')

rawDF.show(3, truncate=False)

24/10/23 20:27:28 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+---------------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+---------------------+-------------------------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|CallType        |CallDate  |WatchDate |CallFinalDisposition|AvailableDtTm         |Address                    |City|Zipcode|Battalion|StationArea|Box |OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood         |Location                             |RowID        |Delay    |
+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+---------------------------+----+-------+--

1. ¿Cuáles fueron los diferentes tipos de llamadas que hubo en 2018?

In [9]:
%%time
rawDF = rawDF.withColumn('CallDate', F.to_date(F.col('CallDate'), 'dd/MM/yyyy'))
df_calltype = (
    rawDF
    .withColumn('year', F.year(F.col('CallDate')))
    .filter(F.col('year') == 2018)
    .select("CallType")
    .distinct()
)

CPU times: user 7.91 ms, sys: 2.97 ms, total: 10.9 ms
Wall time: 452 ms


In [10]:
df_calltype.show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|        Vehicle Fire|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
|   Electrical Hazard|
|      Structure Fire|
|    Medical Incident|
|          Fuel Spill|
|Smoke Investigati...|
|Train / Rail Inci...|
+--------------------+



                                                                                

In [11]:
df_calltype.count()

                                                                                

18

2. ¿Qué mes de 2018 fue el que tuvo un número mayor de llamadas?

In [12]:
df_month = (
    rawDF
    .withColumn('year', F.year(F.col('CallDate')))
    .withColumn('month', F.month(F.col('CallDate')))
    .filter(F.col('year') == 2018)
    .groupBy('month')
    .count()
    .orderBy(F.desc('count')) 
)

In [13]:
df_month.show()

[Stage 12:>                                                         (0 + 1) / 1]

+-----+-----+
|month|count|
+-----+-----+
|    1|  416|
|    2|  384|
|    4|  379|
|    3|  355|
|    7|  351|
|    6|  350|
|    5|  349|
|   12|  329|
|   11|  320|
|    8|  316|
|    9|  314|
|   10|  307|
+-----+-----+



                                                                                

Por tanto el mes 10 (Octubre) fue cuando se tuvieron más llamadas.

3. ¿Cuál vecindario de San Francisco fue el que realizó el mayor número de llamadas en 2018?

In [14]:
df_neighborhood = (
    rawDF
    .withColumn('year', F.year(F.col('CallDate')))
    .filter(F.col('year') == 2018)
    .groupBy('Neighborhood')
    .count()
    .orderBy(F.desc('count')) 
)

In [20]:
df_neighborhood.show(1)

[Stage 24:>                                                         (0 + 1) / 1]

+------------+-----+
|Neighborhood|count|
+------------+-----+
|  Tenderloin|  583|
+------------+-----+
only showing top 1 row



                                                                                

4. ¿Cuáles vecindarios tuvieron el peor tiempo de respuesta a las llamadas en 2018?

In [21]:
df_delay = (
    rawDF
    .withColumn('year', F.year(F.col('CallDate')))
    .filter(F.col('year') == 2018)
    .groupBy('Neighborhood')
    .agg(F.avg('Delay').alias('avg_delay'))
    .orderBy(F.desc('avg_delay'))
)

In [22]:
df_delay.show(1)

[Stage 27:>                                                         (0 + 1) / 1]

+------------+----------------+
|Neighborhood|       avg_delay|
+------------+----------------+
|    Presidio|6.45308641111111|
+------------+----------------+
only showing top 1 row



                                                                                

5. ¿Cuál semana del 2018 tuvo el mayor número de llamadas?

In [25]:
df_week = (
    rawDF
    .withColumn('year', F.year(F.col('CallDate')))
    .filter(F.col('year') == 2018)
    .withColumn('week', F.weekofyear(F.col('CallDate')))
    .groupBy('week')
    .count()
    .orderBy(F.desc('count'))
)

In [27]:
df_week.show(1)

[Stage 36:>                                                         (0 + 1) / 1]

+----+-----+
|week|count|
+----+-----+
|   1|  279|
+----+-----+
only showing top 1 row



                                                                                

6. ¿Existe alguna correlación entre los vecindarios, códigos postales y número de llamadas?

In [30]:
rawDF.show(1)

+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+---------------+--------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|      CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|   Neighborhood|            Location|        RowID|Delay|
+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+

In [31]:
df_relation = (
    rawDF
    .groupBy('Neighborhood', 'Zipcode', 'IncidentNumber')
    .count()
    .orderBy(F.desc('count'))
)

In [32]:
df_relation.show()

[Stage 41:>                                                         (0 + 1) / 1]

+--------------------+-------+--------------+-----+
|        Neighborhood|Zipcode|IncidentNumber|count|
+--------------------+-------+--------------+-----+
|             Mission|  94110|      15010877|    7|
|           Chinatown|  94111|       1035424|    5|
|     South of Market|  94107|       6077383|    5|
|      Outer Richmond|  94121|       1022600|    5|
|    Western Addition|  94115|      15011972|    5|
|           Lakeshore|  94132|      15017455|    4|
|       Outer Mission|  94112|       2028136|    4|
|            Presidio|  94129|      17054697|    4|
|      Inner Richmond|  94118|       3046001|    4|
|     Pacific Heights|  94109|       2048543|    4|
|              Marina|  94123|      15028880|    4|
|            Presidio|  94129|      16007202|    4|
|          Noe Valley|  94110|       7005510|    4|
|      Haight Ashbury|  94117|       1011468|    4|
|     Treasure Island|  94130|       5003090|    4|
|     South of Market|  94103|         87458|    4|
|Bayview Hun

                                                                                

In [33]:
df_relation2 = (
    rawDF
    .groupBy('Zipcode', 'IncidentNumber')
    .count()
    .orderBy(F.desc('count'))
)

In [34]:
df_relation2.show()

[Stage 44:>                                                         (0 + 1) / 1]

+-------+--------------+-----+
|Zipcode|IncidentNumber|count|
+-------+--------------+-----+
|  94110|      15010877|    7|
|  94111|       1035424|    5|
|  94107|       6077383|    5|
|  94115|      15011972|    5|
|  94121|       1022600|    5|
|  94133|       7054787|    4|
|  94110|       7005510|    4|
|  94132|      15017455|    4|
|  94115|      12081781|    4|
|  94118|       3046001|    4|
|  94115|       3090733|    4|
|  94109|      15019147|    4|
|  94112|      14094492|    4|
|  94129|      16007202|    4|
|  94124|       7005596|    4|
|  94114|      14037815|    4|
|  94103|         87458|    4|
|  94124|      15096938|    4|
|  94112|       2028136|    4|
|  94130|       5003090|    4|
+-------+--------------+-----+
only showing top 20 rows



                                                                                

Tanto en los agrupados de las 3 variables; vecindario, código postar y número de inciede, como en los correspondientes únicamente el código postal y el número de incidente. 

Se puede obser var que prácticamente el resto de las zonas están con la misma cantidad de incidentes, pero que hay zonas con una mayor ocurrencia. Por lo que probablementeloso servicios de dichos lugares no sean eficientes.

Si se entiende por correlación que aumente la cantidad de ellos, probablemente si. Habría que ver si por las coordenadas se enuentran cerca.