In [39]:
library(stringr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)

In [2]:
crime_df <- read.csv("bos_crime_incidents_2021.csv")

In [3]:
head(crime_df)

Unnamed: 0_level_0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
Unnamed: 0_level_1,<chr>,<int>,<lgl>,<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<chr>,<int>,<lgl>,<chr>,<dbl>,<dbl>,<chr>
1,PLTEST005,520,,BURGLARY - RESIDENTIAL,B2,289.0,1,2021-10-13 00:00:00,2021,10,Wednesday,0,,SCHROEDER PLZ,42.33368,-71.09188,"(42.33367921810846, -71.09187754618458)"
2,PLTEST003,3114,,INVESTIGATE PROPERTY,B2,289.0,0,2021-05-12 00:00:00,2021,5,Wednesday,0,,SCHROEDER PLZ,42.33368,-71.09188,"(42.33367921810846, -71.09187754618458)"
3,PLTEST002,3114,,INVESTIGATE PROPERTY,B2,289.0,0,2021-05-12 00:00:00,2021,5,Wednesday,0,,SCHROEDER PLZ,42.33368,-71.09188,"(42.33367921810846, -71.09187754618458)"
4,PLTEST001,3114,,INVESTIGATE PROPERTY,B2,,0,2021-03-29 00:00:00,2021,3,Monday,0,,SCHROEDER PLAZA,0.0,0.0,"(0, 0)"
5,312030979,3201,,PROPERTY - LOST/ MISSING,E13,465.0,0,2021-03-22 00:00:00,2021,3,Monday,0,,BLUE HILL AVE,42.28483,-71.09137,"(42.28482576580488, -71.09137368938802)"
6,292152228,3301,,VERBAL DISPUTE,C11,347.0,0,2021-07-28 20:45:00,2021,7,Wednesday,20,,LYON ST,42.30638,-71.06021,"(42.30638322801941, -71.06021217719662)"


In [6]:
# keeps eco offense
crime_df1 <- crime_df %>%
    filter(OFFENSE_DESCRIPTION == "AUTO THEFT" | 
           OFFENSE_DESCRIPTION == "AUTO THEFT - LEASED/RENTED VEHICLE" | 
           OFFENSE_DESCRIPTION == "AUTO THEFT - MOTORCYCLE / SCOOTER" | 
           OFFENSE_DESCRIPTION == "BURGLARY - COMMERICAL" | 
           OFFENSE_DESCRIPTION == "BURGLARY - RESIDENTIAL" | 
           OFFENSE_DESCRIPTION == "LARCENY ALL OTHERS" | 
           OFFENSE_DESCRIPTION == "LARCENY PICK-POCKET" | 
           OFFENSE_DESCRIPTION == "LARCENY PURSE SNATCH - NO FORCE" | 
           OFFENSE_DESCRIPTION == "LARCENY SHOPLIFTING" | 
           OFFENSE_DESCRIPTION == "LARCENY THEFT FROM BUILDING" | 
           OFFENSE_DESCRIPTION == "LARCENY THEFT FROM COIN-OP MACHINE" | 
           OFFENSE_DESCRIPTION == "LARCENY THEFT FROM MV - NON-ACCESSORY" | 
           OFFENSE_DESCRIPTION == "LARCENY THEFT OF BICYCLE" | 
           OFFENSE_DESCRIPTION == "LARCENY THEFT OF MV PARTS & ACCESSORIES" | 
           OFFENSE_DESCRIPTION == "ROBBERY") %>%
    select (INCIDENT_NUMBER, OFFENSE_DESCRIPTION, OCCURRED_ON_DATE)
head(crime_df1)

Unnamed: 0_level_0,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,PLTEST005,BURGLARY - RESIDENTIAL,2021-10-13 00:00:00
2,222000453,LARCENY THEFT FROM BUILDING,2021-12-30 12:30:00
3,222000583,LARCENY THEFT FROM MV - NON-ACCESSORY,2021-12-28 12:00:00
4,222000575,LARCENY THEFT FROM BUILDING,2021-12-20 00:00:00
5,222000557,LARCENY THEFT FROM BUILDING,2021-12-28 00:00:00
6,222000513,LARCENY THEFT FROM BUILDING,2021-12-31 14:30:00


In [24]:
# turn datetime into date
crime_df1$OCCURRED_ON_DATE <- as_date(crime_df1$OCCURRED_ON_DATE)
head(crime_df1)

Unnamed: 0_level_0,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE,IS_HOLIDAY
Unnamed: 0_level_1,<chr>,<chr>,<date>,<lgl>
1,PLTEST005,BURGLARY - RESIDENTIAL,2021-10-13,False
2,222000453,LARCENY THEFT FROM BUILDING,2021-12-30,False
3,222000583,LARCENY THEFT FROM MV - NON-ACCESSORY,2021-12-28,False
4,222000575,LARCENY THEFT FROM BUILDING,2021-12-20,False
5,222000557,LARCENY THEFT FROM BUILDING,2021-12-28,False
6,222000513,LARCENY THEFT FROM BUILDING,2021-12-31,False


In [8]:
# Date of Major Public Holidays in 2021
# New Year’s Day 2021-01-01
# Martin Luther King’s Birthday 2021-01-18
# Washington’s Birthday 2021-02-15
# Memorial Day 2021-05-31
# Juneteenth National Independence Day 2021-06-19
# Independence Day 2021-07-04
# Labor Day 2021-09-06
# Columbus Day 2021-10-11
# Veterans’ Day 2021-11-11
# Thanksgiving Day 2021-11-25
# Christmas Day 2021-12-25

In [9]:
# set time interval for every holiday
time1 <- as_datetime("2021-01-01 00:00:00")
time2 <- as_datetime("2021-01-01 23:59:59")
dt_intr1 <- interval(time1, time2)

time3 <- as_datetime("2021-01-18 00:00:00")
time4 <- as_datetime("2021-01-18 23:59:59")
dt_intr2 <- interval(time3, time4)

time5 <- as_datetime("2021-02-15 00:00:00")
time6 <- as_datetime("2021-02-15 23:59:59")
dt_intr3 <- interval(time5, time6)

time7 <- as_datetime("2021-05-31 00:00:00")
time8 <- as_datetime("2021-05-31 23:59:59")
dt_intr4 <- interval(time7, time8)

time9 <- as_datetime("2021-06-19 00:00:00")
time10 <- as_datetime("2021-06-19 23:59:59")
dt_intr5 <- interval(time9, time10)

time11 <- as_datetime("2021-07-04 00:00:00")
time12 <- as_datetime("2021-07-04 23:59:59")
dt_intr6 <- interval(time11, time12)

time13 <- as_datetime("2021-09-06 00:00:00")
time14 <- as_datetime("2021-09-06 23:59:59")
dt_intr7 <- interval(time13, time14)

time15 <- as_datetime("2021-10-11 00:00:00")
time16 <- as_datetime("2021-10-11 23:59:59")
dt_intr8 <- interval(time15, time16)

time17 <- as_datetime("2021-11-11 00:00:00")
time18 <- as_datetime("2021-11-11 23:59:59")
dt_intr9 <- interval(time17, time18)

time19 <- as_datetime("2021-11-25 00:00:00")
time20 <- as_datetime("2021-11-25 23:59:59")
dt_intr10 <- interval(time19, time20)

time21 <- as_datetime("2021-12-25 00:00:00")
time22 <- as_datetime("2021-12-25 23:59:59")
dt_intr11 <- interval(time21, time22)

In [10]:
holiday_intr <- c(dt_intr1, dt_intr2, dt_intr3, dt_intr4, dt_intr5, dt_intr6, 
                  dt_intr7, dt_intr8, dt_intr9, dt_intr10, dt_intr11)
print(holiday_intr)

 [1] 2021-01-01 UTC--2021-01-01 23:59:59 UTC
 [2] 2021-01-18 UTC--2021-01-18 23:59:59 UTC
 [3] 2021-02-15 UTC--2021-02-15 23:59:59 UTC
 [4] 2021-05-31 UTC--2021-05-31 23:59:59 UTC
 [5] 2021-06-19 UTC--2021-06-19 23:59:59 UTC
 [6] 2021-07-04 UTC--2021-07-04 23:59:59 UTC
 [7] 2021-09-06 UTC--2021-09-06 23:59:59 UTC
 [8] 2021-10-11 UTC--2021-10-11 23:59:59 UTC
 [9] 2021-11-11 UTC--2021-11-11 23:59:59 UTC
[10] 2021-11-25 UTC--2021-11-25 23:59:59 UTC
[11] 2021-12-25 UTC--2021-12-25 23:59:59 UTC


In [11]:
is_holiday_period <- function(date, holiday_intr) {
    n_incidents <- length(date)
    is_holiday <- rep(FALSE, n_incidents)

    for (i in 1:n_incidents) {
        
        for (j in 1:length(holiday_intr)) {
            
            if (date[i] %within% holiday_intr[j]) {
            
            is_holiday[i] <- TRUE
            
            }
        }
    }
    return(is_holiday)
}

In [25]:
is_holiday_period(as_datetime("2021-11-11"), holiday_intr)

In [26]:
# need around 2 minutes to run nesting if
crime_df1$IS_HOLIDAY <- is_holiday_period(as_date(crime_df1$OCCURRED_ON_DATE), holiday_intr)

In [27]:
head(crime_df1)

Unnamed: 0_level_0,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE,IS_HOLIDAY
Unnamed: 0_level_1,<chr>,<chr>,<date>,<lgl>
1,PLTEST005,BURGLARY - RESIDENTIAL,2021-10-13,False
2,222000453,LARCENY THEFT FROM BUILDING,2021-12-30,False
3,222000583,LARCENY THEFT FROM MV - NON-ACCESSORY,2021-12-28,False
4,222000575,LARCENY THEFT FROM BUILDING,2021-12-20,False
5,222000557,LARCENY THEFT FROM BUILDING,2021-12-28,False
6,222000513,LARCENY THEFT FROM BUILDING,2021-12-31,False


In [28]:
crime_df2 <- crime_df1 %>%
    group_by (OCCURRED_ON_DATE) %>%
    summarise (
        n_OFFENSE = n(),
        IS_HOLIDAY = IS_HOLIDAY
    )

[1m[22m`summarise()` has grouped output by 'OCCURRED_ON_DATE'. You can override using
the `.groups` argument.


In [30]:
head(crime_df2)

OCCURRED_ON_DATE,n_OFFENSE,IS_HOLIDAY
<date>,<int>,<lgl>
2021-01-01,34,True
2021-01-01,34,True
2021-01-01,34,True
2021-01-01,34,True
2021-01-01,34,True
2021-01-01,34,True


In [31]:
# get rid of duplicate rows
crime_df3 <- crime_df2 %>%  distinct()

In [32]:
head(crime_df3)

OCCURRED_ON_DATE,n_OFFENSE,IS_HOLIDAY
<date>,<int>,<lgl>
2021-01-01,34,True
2021-01-02,31,False
2021-01-03,19,False
2021-01-04,26,False
2021-01-05,25,False
2021-01-06,24,False


In [35]:
# t test
result <- t.test (n_OFFENSE ~ IS_HOLIDAY, data = crime_df3)
result


	Welch Two Sample t-test

data:  n_OFFENSE by IS_HOLIDAY
t = 1.505, df = 11.212, p-value = 0.16
alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
95 percent confidence interval:
 -1.115360  5.974631
sample estimates:
mean in group FALSE  mean in group TRUE 
           31.88418            29.45455 


In [41]:
tidy(result)

estimate,estimate1,estimate2,statistic,p.value,parameter,conf.low,conf.high,method,alternative
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
2.429635,31.88418,29.45455,1.505022,0.159959,11.21169,-1.11536,5.974631,Welch Two Sample t-test,two.sided


In [None]:
# P-value = 0.16, cannot reject null hypothesis.
# There is no significant difference between holiday and non-holiday periods.