<a href="https://colab.research.google.com/github/Avipsa1/UPPP275-Notebooks/blob/main/Boston_Crimes_Heatmap_using_Folium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Initialize a folium map
You can find additional leaflet basemap tiles here: http://leaflet-extras.github.io/leaflet-providers/preview

In [None]:
import folium
boston = [42.3202, -71.1500]
map = folium.Map(
    location = boston,
    zoom_start = 7,
    tiles = 'https://{s}.tile.opentopomap.org/{z}/{x}/{y}.png',
    attr = 'Map data: &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors, <a href="http://viewfinderpanoramas.org">SRTM</a> | Map style: &copy; <a href="https://opentopomap.org">OpenTopoMap</a> (<a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA</a>)'
)
map

Read crime data in a pandas dataframe

In [None]:
import pandas as pd
crimes = pd.read_csv("./sample_data/boston_crime.csv",encoding= 'unicode_escape')

Access dataframe values

In [None]:
crimes.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,9/2/2018 13:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,8/21/2018 0:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,9/3/2018 19:27,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,9/3/2018 21:16,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,9/3/2018 21:05,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [None]:
crimes['DISTRICT']

0         D14
1         C11
2          D4
3          D4
4          B3
         ... 
319068     D4
319069    E18
319070    E18
319071    E13
319072     D4
Name: DISTRICT, Length: 319073, dtype: object

In [None]:
crimes.DISTRICT

0         D14
1         C11
2          D4
3          D4
4          B3
         ... 
319068     D4
319069    E18
319070    E18
319071    E13
319072     D4
Name: DISTRICT, Length: 319073, dtype: object

In [None]:
type(crimes.DISTRICT)

pandas.core.series.Series

In [None]:
crimes.DISTRICT[3]

'D4'

In [None]:
crimes.iloc[0,4]

'D14'

In [None]:
crimes.DISTRICT.value_counts()

B2     49945
C11    42530
D4     41915
A1     35717
B3     35442
C6     23460
D14    20127
E13    17536
E18    17348
A7     13544
E5     13239
A15     6505
Name: DISTRICT, dtype: int64

In [None]:
crimes.Lat.describe()

count    299074.000000
mean         42.214381
std           2.159766
min          -1.000000
25%          42.297442
50%          42.325538
75%          42.348624
max          42.395042
Name: Lat, dtype: float64

In [None]:
crimes.Lat/5.0

0         8.471558
1         8.461364
2         8.469318
3         8.466836
4         8.455073
            ...   
319068    8.467390
319069    8.451185
319070    8.451185
319071    8.460467
319072    8.466768
Name: Lat, Length: 319073, dtype: float64

In [None]:
crimes.DAY_OF_WEEK == "Tuesday"

0         False
1          True
2         False
3         False
4         False
          ...  
319068    False
319069    False
319070    False
319071     True
319072    False
Name: DAY_OF_WEEK, Length: 319073, dtype: bool

Rename or drop unwanted columns

In [None]:
renamed_columns = {
    'OCCURRED_ON_DATE': 'date',
    'OFFENSE_CODE_GROUP': 'offense',
    'SHOOTING': 'shooting',
    'Lat': 'lat',
    'Long': 'lon'
}
crimes = crimes.rename(columns = renamed_columns)
crimes = crimes[list(renamed_columns.values())]

In [None]:
crimes.columns

Index(['date', 'offense', 'shooting', 'lat', 'lon'], dtype='object')

Deal with different data types

In [None]:
type(crimes.date[0])

str

In [None]:
#Convert date from String to timestamp format
crimes.date = pd.to_datetime(crimes.date)
#Sort the dataframe by date of occurrence of the crimes
crimes = crimes.sort_values(by = "date")
#Print the first 10 crime incidents from the sorted dataframe
crimes.date[0:10]

129056   2015-06-15 00:00:00
314676   2015-06-15 00:00:00
310350   2015-06-15 00:00:00
253464   2015-06-15 00:00:00
8793     2015-06-15 00:00:00
318414   2015-06-15 00:00:00
317446   2015-06-15 00:00:00
303001   2015-06-15 00:00:00
317447   2015-06-15 00:00:00
318621   2015-06-15 00:01:00
Name: date, dtype: datetime64[ns]

Deal with null values

In [None]:
crimes.shooting

129056    NaN
314676    NaN
310350    NaN
253464    NaN
8793      NaN
         ... 
8         NaN
4         NaN
5         NaN
3         NaN
6         NaN
Name: shooting, Length: 319073, dtype: object

In [None]:
crimes.shooting = (crimes.shooting == 'Y')
crimes = crimes.dropna()
crimes.head()

Unnamed: 0,date,offense,shooting,lat,lon
129056,2015-06-15,Harassment,False,42.291093,-71.065945
314676,2015-06-15,Confidence Games,False,42.300217,-71.080979
310350,2015-06-15,Other,False,42.293606,-71.071887
253464,2015-06-15,Property Lost,False,42.283634,-71.082813
8793,2015-06-15,Property Lost,False,-1.0,-1.0


Group data by month

In [None]:
import datetime
from dateutil.relativedelta import relativedelta

print(crimes.date.min())
print(crimes.date.max())

2015-06-15 00:00:00
2018-09-03 21:25:00


In [None]:
#Create a blank list to store the months
months = []

#Set the start date to June 1, 2015
start = datetime.datetime(year = 2015, month = 6, day = 1)

#Run a loop until the start date is less than the max date
while start < crimes.date.max():
  #Set the end date by adding a month each time
  end = start + relativedelta(months=+1)
  #Create a mask value (True/False) which checks if the the crime occurred between the start and end dates
  mask = (start <= crimes.date) & (crimes.date <end)
  #Pick only those crime that occured between start and end dates
  crimes_month = crimes[mask]
  #Only select the lat and lon values for those crimes selected
  crimes_month = crimes_month[['lat','lon']]
  #Add the location of the crimes to the month list - for each month
  months.append(crimes_month)
  start = end
print(months[0])

              lat        lon
129056  42.291093 -71.065945
314676  42.300217 -71.080979
310350  42.293606 -71.071887
253464  42.283634 -71.082813
8793    -1.000000  -1.000000
...           ...        ...
314737  42.380275 -71.060377
314736  42.380275 -71.060377
314735  42.380275 -71.060377
314708  42.288705 -71.078108
314724  42.280587 -71.074322

[4066 rows x 2 columns]


In [132]:
#try runnning this  in Firefox - if other browsers do not render maps correctly
from folium.plugins import HeatMapWithTime
from folium import plugins
m = folium.Map(boston, zoom_start=11)
m.add_child(plugins.HeatMap(
    data = list(zip(months[0].lat.values,months[0].lon.values)),# [ list(zip(m.lat.values,m.lon.values)) for m in months ],
    radius = 15
))
m

In [149]:
LAT_LON_GRID = 0.005
import datetime
def custom_round(val,res):
  return round(val / res) * res

def cluster(df_interval):
  data = df_interval.copy()
  data = custom_round(data,LAT_LON_GRID)
  data = data.groupby(["lat","lon"]).size().reset_index(name = "weight")
  data.weight = data.weight / data.weight.max()
  return data

start = datetime.datetime(2015, 6, 1)
end = start + relativedelta(months=+1)
mask = (start <= crimes.date) & (crimes.date < end)
df_month = crimes[mask]
df_month = df_month[['lat','lon']]

print(cluster(df_month))

        lat     lon    weight
0    -1.000  -1.000  0.066667
1    42.235 -71.140  0.013333
2    42.235 -71.125  0.013333
3    42.240 -71.140  0.053333
4    42.240 -71.125  0.066667
..      ...     ...       ...
436  42.390 -71.015  0.013333
437  42.390 -71.010  0.093333
438  42.390 -71.005  0.080000
439  42.390 -71.000  0.040000
440  42.395 -71.010  0.080000

[441 rows x 3 columns]


Redraw heatmap

In [162]:
m = folium.Map(boston, zoom_start=11)

m.add_child(plugins.HeatMap(
    data = list(zip(df_month.lat.values,df_month.lon.values,cluster(df_month).weight)),# [ list(zip(m.lat.values,m.lon.values)) for m in months ],
    radius = 8
))
m