# 스파크 환경설정

In [14]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("24120901_sf_fire_calls_App").getOrCreate() 

In [11]:
from pyspark.sql.functions import *

# 데이터 로드

In [18]:
df = spark.read.format('csv').load('src/data/sf-fire-calls.csv', inferSchema = True, header = True)

                                                                                

In [19]:
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [22]:
df.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

# 데이터 탐색

In [23]:
#175296 행 
df.count()

175296

In [130]:
df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'Delay']

In [150]:
for i in df.columns:
    num_count = df.filter(col(i).isNull()).count()
    if num_count > 0 :
        print(f'{i} 의 Null 개수: {num_count}개')

AvailableDtTm 의 Null 개수: 1794개
City 의 Null 개수: 207개
Zipcode 의 Null 개수: 142개
StationArea 의 Null 개수: 75개
Box 의 Null 개수: 13개
OriginalPriority 의 Null 개수: 922개
CallTypeGroup 의 Null 개수: 99517개
UnitSequenceInCallDispatch 의 Null 개수: 1개


In [152]:
df.select("CallTypeGroup").distinct().show(10)

+--------------------+
|       CallTypeGroup|
+--------------------+
|               Alarm|
|                null|
|Potentially Life-...|
|Non Life-threatening|
|                Fire|
+--------------------+



### 1. 화재 신고로 기록된 calltype의 종류

-총 30개

In [24]:
df.select('calltype').where(col('calltype').isNotNull()).distinct().show()

                                                                                

+--------------------+
|            calltype|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|Confined Space / ...|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
|           Explosion|
|           Oil Spill|
|        Vehicle Fire|
|  Suspicious Package|
|Extrication / Ent...|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
+--------------------+
only showing top 20 rows



                                                                                

In [25]:
df.select('calltype').where(col('calltype').isNotNull()).distinct().count()

                                                                                

30

## 2. 5분 이상 걸린 응답시간 

In [26]:
new_fire_df = df.withColumnRenamed('Delay', 'ResponseDelayedMins')
new_fire_df.select('ResponseDelayedMins').where('ResponseDelayedMins > 5').show(5)

+-------------------+
|ResponseDelayedMins|
+-------------------+
|               5.35|
|               6.25|
|                5.2|
|                5.6|
|               7.25|
+-------------------+
only showing top 5 rows



In [27]:
new_fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

## 3. 날짜 데이터 타입 변경

- 'CallDate','WatchDate','AvailableDtTm' 데이터 타입 변경

In [28]:
df.select('CallDate','WatchDate','AvailableDtTm').show(5)

+----------+----------+--------------------+
|  CallDate| WatchDate|       AvailableDtTm|
+----------+----------+--------------------+
|01/11/2002|01/10/2002|01/11/2002 01:51:...|
|01/11/2002|01/10/2002|01/11/2002 03:01:...|
|01/11/2002|01/10/2002|01/11/2002 02:39:...|
|01/11/2002|01/10/2002|01/11/2002 04:16:...|
|01/11/2002|01/10/2002|01/11/2002 06:01:...|
+----------+----------+--------------------+
only showing top 5 rows



In [29]:
fire_ts_df = new_fire_df.withColumn('IncidentDate', to_timestamp(col('CallDate'), 'MM/dd/yyyy'))\
    .drop('CallDate')\
    .withColumn('OnWatchDate', to_timestamp(col('WatchDate'), 'MM/dd/yyyy'))\
    .drop('WatchDate')\
    .withColumn('AvailableDtTS', to_timestamp(col('AvailableDtTm'), 'MM/dd/yyyy HH:mm:ss a'))\
    .drop('AvailableDtTm')

fire_ts_df.select('IncidentDate', 'OnWatchDate', 'AvailableDtTS').show(5)

+-------------------+-------------------+-------------------+
|       IncidentDate|        OnWatchDate|      AvailableDtTS|
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



## 4. 소방서 호출 년수 

- 2000년 ~ 2019년 (총 19년) 

In [30]:
fire_ts_df\
    .select(year('IncidentDate'))\
    .distinct()\
    .orderBy(year('IncidentDate'))\
    .show()

                                                                                

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [31]:
fire_ts_df\
    .select(year('IncidentDate'))\
    .distinct()\
    .orderBy(year('IncidentDate'))\
    .count()

                                                                                

19

## 5. 가장 흔한 형태의 신고는 무엇인가?

1. Medical Incident
2. Structure Fire
3. Traffic Collision
4. Citizen Assist
5. Other

In [32]:
fire_ts_df\
    .select('CallType')\
    .where(col('CallType').isNotNull())\
    .groupBy('CallType')\
    .count()\
    .orderBy('count', ascending = False)\
    .show()

+--------------------+------+
|            CallType| count|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
|Odor (Strange / U...|   490|
|   Electrical Hazard|   482|
|Elevator / Escala...|   453|
|Smoke Investigati...|   391|
|          Fuel Spill|   193|
|              HazMat|   124|
|Industrial Accidents|    94|
|           Explosion|    89|
|Train / Rail Inci...|    57|
|  Aircraft Emergency|    36|
+--------------------+------+
only showing top 20 rows



                                                                                

## 6. 2018년도의 신고 전화들의 유형은 어떤 것들이었나? 

In [107]:
fire_ts_df.createOrReplaceTempView("fire_ts_df_view")

In [108]:
spark.sql(
    '''
    SELECT Distinct CallType
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    '''
).show()

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|           Explosion|
|        Vehicle Fire|
|  Suspicious Package|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
|   Electrical Hazard|
|      Structure Fire|
|    Medical Incident|
|          Fuel Spill|
|Smoke Investigati...|
|Train / Rail Inci...|
+--------------------+



## 7. 2018년에 신고 전화가 가장 많았던 달은 언제인가?

- 12월달의 기록은 없다.
- 10월달의 신고 전화가 가장 많다.

In [51]:
spark.sql(
    '''
    SELECT Month(IncidentDate), count(Month(IncidentDate))
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    group by Month(IncidentDate)
    order by count(Month(IncidentDate)) desc
    '''
).show()

+---------------------------------+----------------------------------------+
|month(CAST(IncidentDate AS DATE))|count(month(CAST(IncidentDate AS DATE)))|
+---------------------------------+----------------------------------------+
|                               10|                                    1068|
|                                5|                                    1047|
|                                3|                                    1029|
|                                8|                                    1021|
|                                1|                                    1007|
|                                7|                                     974|
|                                6|                                     974|
|                                9|                                     951|
|                                4|                                     947|
|                                2|                                     919|

                                                                                

## 8. 2018년도에 가장 많은 신고가 들어온 샌프란시스코 지역은 어디인가?

- 샌프란시스코의 Tenderloin 지역이 총 신고 건수가 1193건으로 가장 많았다. 

In [124]:
# 24개의 city
spark.sql(
    '''
    SELECT distinct city
    FROM fire_ts_df_view
    '''
).show(24)

spark.sql(
    '''
    SELECT distinct city
    FROM fire_ts_df_view
    '''
).count()

+---------------+
|           city|
+---------------+
|     FORT MASON|
|     Fort Mason|
|            OAK|
|             DC|
|             TI|
|TREASURE ISLAND|
|           null|
|  San Francisco|
|             HP|
|             YB|
|             BN|
|       Brisbane|
|      Daly City|
|    Yerba Buena|
|Treasure Island|
|             SF|
|       PRESIDIO|
|       Presidio|
|             FM|
|            SFO|
|  Hunters Point|
|             PR|
|  SAN FRANCISCO|
|  Treasure Isla|
+---------------+



24

In [125]:
spark.sql(
    '''
    SELECT distinct Neighborhood
    FROM fire_ts_df_view
    WHERE city = 'San Francisco'
    '''
).count()

42

In [126]:
spark.sql(
    '''
    SELECT Neighborhood, count(Neighborhood) as count_by_area
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
        AND City = 'San Francisco'
    group by Neighborhood
    order by count_by_area desc
    limit 5
    '''
).show()

+--------------------+-------------+
|        Neighborhood|count_by_area|
+--------------------+-------------+
|          Tenderloin|         1393|
|     South of Market|         1052|
|             Mission|          911|
|Financial Distric...|          764|
|Bayview Hunters P...|          513|
+--------------------+-------------+



## 9. 2018년에 가장 응답 시간이 늦었던 지역은 어디인가?

- 2018년에 가장 응답 시간이 늦었던 지역은 san francisco이나 평균적으로 가장 응답 시간이 늦었던 지역은 Daly City다. 

In [111]:
spark.sql(
    '''
    SELECT ResponseDelayedMins
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    '''
).show(2)

+-------------------+
|ResponseDelayedMins|
+-------------------+
|          2.8833334|
|          6.3333335|
+-------------------+
only showing top 2 rows



In [93]:
spark.sql(
    '''
    SELECT city, round(avg(ResponseDelayedMins),2) as avg_ResponseDelayedMins
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    group by city
    order by avg_ResponseDelayedMins desc
    '''
).show(10)

+-------------+-----------------------+
|         city|avg_ResponseDelayedMins|
+-------------+-----------------------+
|    Daly City|                   5.63|
|     Presidio|                   5.53|
|  Yerba Buena|                   5.03|
|Treasure Isla|                   4.63|
|   Fort Mason|                   4.52|
|     Brisbane|                   4.18|
|Hunters Point|                    4.1|
|San Francisco|                   3.94|
|         null|                   3.06|
+-------------+-----------------------+



In [92]:
spark.sql(
    '''
    SELECT city, round(max(ResponseDelayedMins),2) as Max_ResponseDelayedMins
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    group by city 
    order by Max_ResponseDelayedMins desc
    '''
).show(10)

+-------------+-----------------------+
|         city|Max_ResponseDelayedMins|
+-------------+-----------------------+
|San Francisco|                 491.27|
|     Presidio|                  38.05|
|Treasure Isla|                  31.25|
|  Yerba Buena|                  10.57|
|         null|                   9.65|
|   Fort Mason|                    8.8|
|Hunters Point|                    5.8|
|    Daly City|                   5.63|
|     Brisbane|                   4.18|
+-------------+-----------------------+



## 10. 2018년에 어떤 주에서 신고가 가장 많았는가?

- San Francisco가 9967건으로 신고가 가장 많았다.

In [105]:
spark.sql(
    '''
    SELECT city, count(city) as count
    FROM fire_ts_df_view
    WHERE year(IncidentDate) = '2018'
    group by city
    order by count desc
    '''
).show(10)

+-------------+-----+
|         city|count|
+-------------+-----+
|San Francisco| 9967|
|     Presidio|   63|
|Treasure Isla|   58|
|  Yerba Buena|   10|
|Hunters Point|    9|
|   Fort Mason|    8|
|     Brisbane|    1|
|    Daly City|    1|
|         null|    0|
+-------------+-----+



## 11. 지역, 우편번호, 신고 숫자 간에 상관관계가 있는가?

In [153]:
spark.sql(
    '''
    SELECT city, zipcode, count(IncidentDate)
    FROM fire_ts_df_view
    group by city, zipcode
    '''
).show(10)

+-------------+-------+-------------------+
|         city|zipcode|count(IncidentDate)|
+-------------+-------+-------------------+
|           SF|  94124|               6554|
|Treasure Isla|  94130|                225|
|           PR|  94123|                 15|
|           FM|  94118|                  6|
|SAN FRANCISCO|  94130|                  2|
|         null|  94109|                  9|
|           HP|  94124|                 31|
|     PRESIDIO|  94108|                  2|
|San Francisco|  94121|               1300|
|  Yerba Buena|  94105|                  8|
+-------------+-------+-------------------+
only showing top 10 rows



In [13]:
spark.stop()