# **Data Analysis in Python for R Users — R**

# 0. Environment Setup

#### 1. Loading a library

In [None]:
# R uses library() or require() for libraries
# Note: require() only returns FALSE when a package is missing; it does not
# attach the package after install.packages() runs, so we install when needed
# and then always load with library()
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
library(tidyverse)

# We'll also use a library {naniar} for handling missing values
if (!requireNamespace("naniar", quietly = TRUE)) install.packages("naniar")
library(naniar)

#### 2. Defining custom functions

In [None]:
# Define a custom function affiche() for cleanly printing tables
#
# Draws `df` on the console as a box-bordered table. A leading " " column
# holding the row names is added (unless a " " column already exists) and
# missing values are printed as a coloured "NA".
#
# Arguments:
#   df       - data frame (or tibble) to display
#   align    - cell alignment: "left", "center" or "right"
#   na_color - ANSI escape sequence wrapped around "NA" cells
#   theme    - border style; only "newspaper" is supported
#
# Returns `df` invisibly (including the row-name column when one was added).
# An empty table prints a boxed message instead. Stops with an error for an
# unsupported `align` or `theme`.
affiche <- function(df,
                    align = "left",
                    na_color = "\033[91;3m",   # nolint
                    theme = "newspaper") {     # nolint

  # Error handling
  if (ncol(df) == 0 || nrow(df) == 0) { #P
    msg <- "That table doesn't exist!"
    width <- nchar(msg)
    top <- paste0("╔", strrep("═", width + 2), "╗")
    mid <- paste0("║ ", msg, " ║")
    bot <- paste0("╚", strrep("═", width + 2), "╝")
    cat(top, "\n", mid, "\n", bot, "\n", sep = "")
    return(invisible(df))
  }

  # Reject unknown alignments up front: an unmatched switch() returns NULL,
  # which would silently drop every bit of padding and misalign the table
  align <- match.arg(align, c("left", "center", "right"))

  # Handle missing rownames
  if (!is.null(rownames(df)) && !any(names(df) == " ")) {
    df <- tibble::rownames_to_column(df, var = " ")
  }

  # Theme setup
  border <- switch(theme,
    "newspaper" = list(
      h = "═", v = "║",
      tl = "╔", tr = "╗",
      bl = "╚", br = "╝",
      jn = "╬",
      l = "╠", r = "╣",
      t = "╦", b = "╩"
    ),
    stop("Theme not supported. Try 'newspaper'")
  )

  # ANSI helpers
  reset <- "\033[0m"
  color_na <- function(x) paste0(na_color, x, reset)

  # Width calculator (vectorised): printed width with ANSI codes stripped
  display_width <- function(s) {
    clean <- gsub("\033\\[[0-9;]*[mK]", "", as.character(s))
    widths <- nchar(clean, type = "width")
    widths[is.na(s)] <- 2L  # Width of "NA"
    widths
  }

  # --- DATA PREP ---
  # check.names = FALSE keeps the column names exactly as given; the default
  # would rewrite the " " row-label header (and any name with spaces) via
  # make.names(), e.g. " " becomes "X."
  df_display <- as.data.frame(
    lapply(df, function(col) {
      ifelse(is.na(col), color_na("NA"), as.character(col))
    }),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
  col_names <- names(df_display)

  # Column widths: the widest of the header and every cell in the column
  col_widths <- vapply(seq_along(col_names), function(i) {
    max(display_width(col_names[i]),
        display_width(df_display[[i]]),
        na.rm = TRUE)
  }, numeric(1))

  # Border drawing: one horizontal run per column, joined by `cross`
  draw_hline <- function(connector_left, connector_right, cross) {
    paste0(
      connector_left,
      paste(strrep(border$h, col_widths + 2), collapse = cross),
      connector_right
    )
  }

  top_line <- draw_hline(border$tl, border$tr, border$t)
  mid_line <- draw_hline(border$l, border$r, border$jn)
  bot_line <- draw_hline(border$bl, border$br, border$b)

  # Pad each text to `width` according to `align` and close the cell with a
  # vertical border (vectorised over `text`, and over `width` for the header)
  format_cells <- function(text, width) {
    pad_total <- width - display_width(text)
    pad_left <- switch(align,
      "left" = 0,
      "center" = floor(pad_total / 2),
      "right" = pad_total
    )
    paste0(" ",
           strrep(" ", pad_left),
           text,
           strrep(" ", pad_total - pad_left),
           " ", border$v)
  }

  # Header row
  header <- paste0(
    border$v,
    paste0(format_cells(col_names, col_widths), collapse = "")
  )

  # Data rows: format column by column, then glue the cells of each row.
  # unname() stops a column called e.g. "sep" being read as a paste0() argument
  cell_columns <- unname(Map(format_cells, df_display, col_widths))
  data_rows <- paste0(border$v, do.call(paste0, cell_columns))

  # Final assembly
  cat(top_line, "\n")
  cat(header, "\n")
  cat(mid_line, "\n")
  cat(paste0(data_rows, collapse = "\n"), "\n")
  cat(bot_line, "\n")

  invisible(df)
}

In [None]:
# Define a custom function count_table() for generating value counts and percentages
# Returns one row per distinct value of `column`, sorted by descending count,
# with a `count` column and a rounded `percent` label; a value that occurs at
# least once but rounds down to 0% is labelled "<1%"
count_table <- function(df, column) {
  tallied <- summarize(group_by(df, {{ column }}), count = n())
  ranked <- arrange(tallied, desc(count))

  mutate(
    ranked,
    percent = paste0(round(count / sum(count) * 100, 0), "%"),
    percent = if_else(percent == "0%" & count >= 1, true = "<1%", false = percent)
  )
}

In [None]:
# Define a custom function count_na() for counting NA values for each column
# Returns a tibble with one row per column of `df`:
#   col        - the column name
#   na_count   - number of missing values in that column
#   na_percent - share of rows that are missing, as a label ("0%", "<1%", "12%")
# Rows are sorted by descending na_count
count_na <- function(df) {
  n_rows <- nrow(df)

  # Count missing values for every column in one pass
  # (instead of growing the result table one row at a time)
  result <- tibble(
    col = names(df),
    na_count = vapply(df, function(values) sum(is.na(values)),
                      integer(1), USE.NAMES = FALSE)
  )

  # Add percentage labels
  result |>
    mutate(
      na_percent = case_when(
        na_count == 0 ~ "0%",
        (na_count / n_rows) <= 0.0099 ~ "<1%",
        TRUE ~ paste0(round(na_count / n_rows * 100, 0), "%")
      )
    ) |>
    # Sort by missing count
    arrange(desc(na_count))
}

In [None]:
# Define a custom function describe() for displaying summary stats for numeric cols
# Returns a data frame with one column per numeric column of `df` and one row
# per statistic: min, max, median, mean, sd (rounded to 2 decimals) and
# n (the number of non-missing values). Missing values are ignored throughout.
describe <- function(df) {
  numeric_df <- df |>
    select(where(is.numeric))

  # One named vector of statistics per numeric column
  summary_list <- lapply(numeric_df, function(values) {
    c(
      min = min(values, na.rm = TRUE),
      max = max(values, na.rm = TRUE),
      median = median(values, na.rm = TRUE),
      mean = mean(values, na.rm = TRUE),
      sd = round(sd(values, na.rm = TRUE), 2),
      n = sum(!is.na(values))
    )
  })

  # Stack the vectors (one row per column), then flip so that the statistics
  # become the rows and the original columns become the columns
  stacked <- as.data.frame(do.call(rbind, summary_list))
  result <- as.data.frame(t(stacked))

  # Set the column names to the measurements
  colnames(result) <- names(numeric_df)

  result
}

# 1. Simple Data

In [None]:
# We use print for displaying something
print("Hello world!") # Test comment

In [None]:
# Our assignment operator is the arrow (<-)
x <- 10
y <- 20

print(x)
print(y)

In [None]:
# We roughly use the same mathematical operators
z <- x * y
print(z)

a <- x / y
print(a)

b <- x - y
print(b)

In [None]:
# Let's look at some data types
k <- 1000
name <- "Henry"
does_exist <- TRUE

# Note: In R, we don't always have to specify print()
str(k)
str(name)
str(does_exist)

In [None]:
# Missing values operate differently as objects vs as parts of a vector 
# Note: NA_character_ is the NA placeholder for character strings
null_object <- NULL
na_vector <- c("apple", NA_character_, 25) 

null_object
na_vector

# 2. Working with Dataframes

In [None]:
# Create a dataframe
# Note: We use NA_real_ for numeric NAs
df <- data.frame(
  Name = c("Henry", "Bob", "Joanne", "Steven"),
  Age = c(22, NA_real_, 30, 48),
  Birthday = c("2003-12-29", "1980-05-15", "1995-01-12", NA_character_)
)

print(df)

In [None]:
# Display the structure of the df
str(df)

In [None]:
# Access a single column by name
df$Birthday

In [None]:
# Access the first column by position
column_1 <- df[1]

print(column_1)

In [None]:
# Add a new column
df$Fav_Animal <- c("Cat", "Penguin", "Sloth", "Dog")

print(df)

In [None]:
# Drop a column 
df$Age <- NULL

print(df)

In [None]:
# Convert birthday to a datefield with as.Date()
df$Birthday <- as.Date(df$Birthday)

str(df$Birthday)

# 3. More Advanced Manipulation

#### 1. Reading in data

In [None]:
# Read in the calls for service data
# Note in R, naming a library before a function is only for conflicts + clarification
cfs <- readr::read_csv("./data/calls_for_service_2025_demo.csv")

# Preview the first ten rows
# Note: We'll start using R's native pipe ( |> ) from here on
cfs |> head(10) |> print()

In [None]:
# Use our custom affiche() function for better readability
cfs |> head(10) |> affiche()

#### 2. Cleaning column names

In [None]:
# Load the {janitor} package (a separate package, not part of {tidyverse})
library(janitor)

# Apply the clean_names() function from {janitor}
cfs <- janitor::clean_names(cfs)

str(cfs)

In [None]:
# We can select data with select() from {dplyr}
cfs |> select(nopd_item) |> 
  head(10) |> 
  affiche()

In [None]:
# Rename columns (in case they weren't already renamed)
# We'll use rename() from {dplyr} from within {tidyverse}
# Where rename wants "new = old"
# Note: wrapping the lookup in any_of() skips the rename when `zip` is absent,
# so re-running this cell after the rename doesn't raise an error
cfs <- cfs |> rename(any_of(c(zip_code = "zip")))

# Check on our 19th column
colnames(cfs)[19]

In [None]:
# Let's see if our column names match now
str(cfs)

# Note: Looks like there are some differences in data types
# We'll try to get these to line up later

#### 3. Converting character fields to sentence case

In [None]:
# Use str_to_sentence() from {stringr} via the {tidyverse}
cfs <- cfs |> 
  mutate(across(where(is.character), str_to_sentence))

In [None]:
# Now we can look at the dispositions and see what they look like
# We'll use {dplyr} to group_by() a column, summarize() the count, then arrange() by count
cfs |> 
  group_by(disposition_text) |> 
  summarize(count = n()) |> 
  arrange(desc(count)) |> 
  affiche()

#### 4. Re-casting data types

In [None]:
# Let's investigate zipcode:
# From now on, we'll use our custom count_table() function

cfs$zip_code |> str()
cfs |> count_table(zip_code) |> affiche()

In [None]:
# Looks like it's a character in R because of this "None" value
# Let's recast it as a numeric
cfs <- cfs |> 
  mutate(zip_code = as.numeric(zip_code))

# Now let's look at zip_code
# And it coerced the "None" for us!
cfs$zip_code |> str()
cfs |> count_table(zip_code) |> affiche()

In [None]:
# Let's go back and look at the rest of our datatypes
str(cfs)

In [None]:
# Let's investigate the time_create column 
cfs |> 
  count_table(time_create) |> 
  affiche()

In [None]:
# We can tell that R recognizes our date fields correctly (POSIXct)
# But we could re-cast them if they were stored as character strings
time_cols <- c("time_create", "time_dispatch", "time_arrive", "time_closed")

for (col in time_cols) {
  cfs[[col]] <- as.POSIXct(cfs[[col]])
}

# Now let's select() the date columns and see their datatypes
# Note: We'll use a handy argument within select() called all_of()
cfs |> select(all_of(time_cols)) |> str()

In [None]:
# Let's go back to our columns
# And all of our datatypes look pretty good
str(cfs)

#### 5. Replacing missing values

##### A. Replacing 0s

In [None]:
# With some digging, we can find that some columns have 0s 
# They're map_x, map_y, and police_district
# Here's police district 
cfs |> 
  count_table(police_district) |> 
  affiche()

In [None]:
# Replace 0s in map_x, map_y, and police_district 
zero_cols <- c("map_x", "map_y", "police_district")

# In R, we'll use a handy function from {naniar}
cfs <- cfs %>% 
  naniar::replace_with_na_at(.vars = zero_cols, ~.x == 0)

# Now, let's see what police districts look like
cfs |> 
  count_table(police_district) |> 
  affiche()

##### B. Replacing Nones

In [None]:
# After more digging, you'll find that beat also has some "None" values
cfs |> count_table(beat) |> 
  filter(beat == "None") |> 
  affiche()

In [None]:
# We can use the same approach we used with replacing 0s
cfs <- cfs %>% 
  naniar::replace_with_na_at(.vars = "beat", ~.x == "None")

# Now let's see if they were successfully transformed to NAs
cfs |> count_table(beat) |> 
  filter(is.na(beat)) |> 
  affiche()

In [None]:
# Now we can look at our missing values by column:
cfs |> count_na() |> affiche()

#### 6. Using regex to update a column

In [None]:
# Next, let's take a look at other unexpected beat values
# We're looking for strings that don't follow the conventional pattern of ...
# ... 1 digit followed by 1 letter, then 2 more digits (eg 1b01)
# This is a common issue resulting from programs like Excel auto-parsing certain fields
# Basically, we want 1.00e+03 to become 1e03, etc
# We'll create a new column with an if_else() requiring a condition and arguments for true or false
beat_pattern <- "\\d{1}[a-z]\\d{2}"

cfs <- cfs |>
  mutate(
    beat_2 = if_else(
      condition = !(grepl(beat_pattern, beat) | is.na(beat)),
      true = str_replace(beat,
                         pattern = "(\\d{1})\\.00(e)\\+(\\d{2})",
                         replacement = "\\1\\2\\3"),
      false = beat
    )
  )

# Show the non-conforming beats next to their repaired values
cfs |>
  filter(!(grepl(beat_pattern, beat) | is.na(beat))) |>
  distinct(beat, beat_2) |>
  affiche()

In [None]:
# Now we can update beat with our beat_2 column
cfs$beat <- cfs$beat_2

# Then drop the beat_2 column
cfs$beat_2 <- NULL

# And look at the new beat values that were updated
cfs |> 
  count_table(beat) |> 
  filter(grepl("e", beat)) |> 
  affiche()

#### 7. Creating booleans

In [None]:
# It looks like self-initiated uses a Y / N system 
cfs |> 
  count_table(self_initiated) |> 
  affiche()

In [None]:
# So we're going to make it a boolean instead
# The comparison already yields TRUE / FALSE (and NA for missing values)
cfs <- cfs |>
  mutate(self_initiated = self_initiated == "Y")

In [None]:
# Let's see how that turned out
str(cfs$self_initiated)

cfs |> 
  count_table(self_initiated) |> 
  affiche()

#### 8. Extracting coordinate info

In [None]:
# Let's take a look at the location field
str(cfs$location)

cfs |> select(location) |> head(5) |> affiche()

In [None]:
# Let's extract the longitude field from the location column
# And coerce as numeric type

# Key: \\- is a double-escaped search for a hyphen 
# \\d{2} is a double-escaped search for two digits
# \\. is a double-escaped search for a decimal
# and \\d{4,} is a double-escaped search for four or more digits
longitude_pattern <- "(\\-\\d{2}\\.\\d{4,})"
cfs$longitude <- cfs$location |> str_extract(longitude_pattern) |> as.numeric()

cfs |> select(longitude) |> head(5) |> affiche()

In [None]:
# Let's extract the latitude field from the location column
# And coerce as numeric type

# Key: \\s is a double-escaped search for a whitespace
# \\d{2} is a double-escaped search for two digits
# \\. is a double-escaped search for a decimal
# and \\d{4,} is a double-escaped search for four or more digits
latitude_pattern <- "\\s(\\d{2}\\.\\d{4,})"
cfs$latitude <- cfs$location |> str_extract(latitude_pattern) |> as.numeric()

cfs |> select(latitude) |> head(5) |> affiche()

#### 9. Geocoding missing location data

In [None]:
# We'll load {glue} in R for handy string interpolation
# Note: require() does not attach a package it just had to install, so we
# install when missing and then load explicitly with library()
if (!requireNamespace("glue", quietly = TRUE)) install.packages("glue")
library(glue)

# Identify cases with missing location data
missing <- cfs |>
  filter(is.na(longitude) & is.na(latitude)) |>
  filter(!is.na(block_address))

missing |>
  select(block_address, zip_code, location:latitude) |>
  head(10) |>
  affiche()

print(glue("Number of missing location fields: {nrow(missing)}"))

In [None]:
# Load the {tidygeocoder} package
# (install when missing, then attach explicitly; require() alone would leave
# a freshly installed package unloaded)
if (!requireNamespace("tidygeocoder", quietly = TRUE)) install.packages("tidygeocoder")
library(tidygeocoder)

# Let's fill out an address field
# Note: the zip column is zip_code (`missing$zipcode` does not exist, so the
# zip was silently left out); rows without a zip simply omit it.
# The calls are from New Orleans, so that is the city we pass to the geocoder.
missing <- missing |>
  mutate(
    address = trimws(paste(
      block_address,
      "New Orleans, LA",
      if_else(is.na(zip_code), "", as.character(zip_code))
    ))
  )

missing |>
  select(block_address, address) |>
  head(10) |>
  affiche()

missing <- missing |>
  tidygeocoder::geocode(address = address,
                        method = "arcgis",
                        lat = latitude,
                        long = longitude)

In [None]:
# First let's fix the new latitude and longitude columns
# Since R renamed all four of our columns, we'll just use the new columns
missing <- missing |> 
  rename(latitude = latitude...25,
         longitude = longitude...26)

# Now let's look at the geocoded addresses
missing |> 
  select(address:longitude) |> 
  head(10) |> 
  affiche()           

In [None]:
# Let's double-check if there are any missing coordinates
missing |> 
  filter(is.na(latitude) | is.na(longitude)) |> 
  head(10) |> 
  affiche()

In [None]:
# Now let's join them back to the original cfs table
# Note: several calls can share one block_address, so we keep a single
# geocoded row per address; otherwise the join would duplicate rows in cfs
geocoded <- missing |>
  select(block_address,
         latitude_geo = latitude,
         longitude_geo = longitude) |>
  distinct(block_address, .keep_all = TRUE)

# We'll fill the original longitude and latitude values with the new ones via coalesce(), ...
# ... then drop the helper columns
cfs <- cfs |>
  left_join(geocoded, by = "block_address") |>
  mutate(longitude = coalesce(longitude, longitude_geo),
         latitude = coalesce(latitude, latitude_geo)) |>
  select(-c(longitude_geo, latitude_geo))

str(cfs)

#### 10. Calculating response time

In [None]:
# Calculate response time (from when call was created to when the officer arrived)
# Note: subtracting two datetimes lets R pick the units automatically based on
# the data, so we ask difftime() for seconds explicitly
cfs <- cfs |>
  mutate(
    response_time = difftime(time_arrive, time_create, units = "secs") |> round(2),
    .after = time_arrive
  )

# We see some zeroes here — let's make them NA
cfs <- cfs |>
  naniar::replace_with_na_at(.vars = "response_time", ~ .x == 0)

# Look at the longest response times
cfs |>
  select(time_create:response_time) |>
  filter(!is.na(time_arrive)) |>
  arrange(desc(response_time)) |>
  head(10) |>
  affiche()

In [None]:
# Let's look at that case with the longest response time
# Note: filtering on the maximum (rather than a hard-coded value) keeps this
# working when the data changes
cfs |>
  filter(response_time == max(response_time, na.rm = TRUE)) |>
  affiche()

#### 11. Collapsing call priorities

In [None]:
# We'll collapse the call priorities using the descriptions given from the codebook
# The first digit found (checked in the order 0, 1, 2, 3) decides the label;
# priorities containing none of these digits stay NA
cfs <- cfs |>
  mutate(
    priority_desc = case_when(
      str_detect(priority, "0") ~ "Non-police",
      str_detect(priority, "1") ~ "Non-emergency",
      str_detect(priority, "2") ~ "Emergency",
      str_detect(priority, "3") ~ "Officer assistance"
    ),
    .after = priority
  )

In [None]:
# Take a look at the new priority descriptions
cfs |> count_table(priority_desc) |> affiche()

#### 12. Classifying call types

In [None]:
# Define keywords for violent crimes
violent_keywords <- c("assault", "battery", "homicide", "fight", "rape", "carjacking")

# Define keywords for theft crimes
theft_keywords <- c("theft", "burglary", "stolen", "shoplifting", "damage")

# Define keywords for traffic incidents
traffic_keywords <- c("traffic", "driving", "tow", "accident", "stranded")

# Turn each keyword list into a single "a|b|c" regex
violent_pattern <- paste(violent_keywords, collapse = "|")
theft_pattern <- paste(theft_keywords, collapse = "|")
traffic_pattern <- paste(traffic_keywords, collapse = "|")

# Categorize calls
# The first matching pattern wins; calls matching none (or with a missing
# type_text) fall into "Other"
cfs <- cfs |>
  mutate(
    category = case_when(
      str_detect(tolower(type_text), violent_pattern) ~ "Violent",
      str_detect(tolower(type_text), theft_pattern) ~ "Theft",
      str_detect(tolower(type_text), traffic_pattern) ~ "Traffic",
      TRUE ~ "Other"
    )
  )

# 4. Exploratory Data Analysis

#### 1. Summary statistics

In [None]:
# Use our custom describe() function for pulling summary stats
cfs |> describe() |> affiche()

#### 2. What are the most commonly occurring call type categories?

In [None]:
cfs |> count_table(category) |> affiche()

#### 3. When do most calls take place?

In [None]:
# Extract the hour from the time_create field
# We'll also calculate a time period of day for later use
cfs <- cfs |> mutate(
  hour = hour(time_create), 
  time_period = case_when(
    hour >= 0 & hour <= 5 ~ "Night",
    hour >= 6 & hour <= 11 ~ "Morning",
    hour >= 12 & hour <= 17 ~ "Afternoon",
    hour >= 18 & hour <= 23 ~ "Evening"
  ), .after = time_create, 
)

# Generate a summary table by hour
hourly_sum <- cfs |> 
  count_table(hour) |> 
  arrange(hour)

# We can use ggplot() + geom_line()
ggplot(hourly_sum, aes(x = hour, y = count)) +
  geom_line()

In [None]:
# Let's look at those time periods too
cfs |> count_table(time_period) |> affiche()

#### 4. What are the most common call dispositions?

In [None]:
cfs |> count_table(disposition_text) |> affiche()

#### 5. Which police districts receive the most calls?

In [None]:
cfs |> count_table(police_district) |> affiche()

#### 6. What proportion of calls are self-initiated?

In [None]:
cfs |> count_table(self_initiated) |> affiche()

#### 7. Where do most calls originate from?

In [None]:
# Install {sf} when missing, then attach it explicitly
# (require() alone would leave a freshly installed package unloaded)
if (!requireNamespace("sf", quietly = TRUE)) install.packages("sf")
library(sf)

# Read shapefile and filter to Orleans Parish
# As well as filter out water-only polygons
shapes <- sf::st_read("./data/shapes/tl_2024_22_bg.shp") |>
  filter(COUNTYFP == "071") |>
  filter(!grepl("220719900000|220719801001", GEOID))

# Preview the orleans shapes map
ggplot(shapes, aes()) +
  geom_sf() +
  labs(title = "New Orleans shapefile")

In [None]:
# Convert the cfs object to a spatial feature (sf) object
# Note: st_as_sf() stops on missing coordinates, so we first drop the calls
# that still have no longitude / latitude
cfs_sf <- cfs |>
  filter(!is.na(longitude), !is.na(latitude)) |>
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, agr = "constant")

# Look at the shapes map with cfs overlaid
ggplot() +
  geom_sf(data = shapes) +
  geom_sf(data = cfs_sf, alpha = .5, size = .75) +
  labs(title = "New Orleans calls for service")

#### 8. What is the best predictor of response time?

In [None]:
# Note: I only included variables that were statistically significant for readability
# We also convert response time to a true numeric column (in seconds);
# passing units = "secs" makes the conversion explicit instead of relying on
# whatever units the difftime column happens to carry
# We'll also convert character fields and police district to factors
# And drop NAs
data <- cfs |>
  select(c(category, priority_desc, time_period, response_time,
           self_initiated, police_district)) |>
  mutate(response_time = round(as.numeric(response_time, units = "secs"), 0)) |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(police_district = as.factor(police_district)) |>
  drop_na()

str(data)

In [None]:
# Fit a multiple linear regression model
model <- lm(formula = response_time ~ ., data = data)

# TO-DO: Investigate why model outputs differ
model |> summary()