In [None]:
# Display the version string of the running R session
R.version.string


In [None]:
# Packages used below: mgcv (gam, gam.check, concurvity), gratia
# (compare_smooths) and tidyr (unnest)
library(mgcv)
library(gratia)
library(tidyr)
# Show the working directory; the input CSV and every output file written
# below are addressed by relative paths
getwd()

In [None]:
# Load the input table from the working directory into `df`
df <- read.csv(file = "step1_occurrence.csv")

In [None]:

# Flag values lying outside the 1.5 * IQR fences of a numeric vector.
#
# Args:
#   x: numeric vector; may contain NA.
#
# Returns:
#   Logical vector the same length as `x`: TRUE where the value is above
#   Q3 + 1.5 * IQR or below Q1 - 1.5 * IQR, FALSE otherwise. Missing values
#   are never flagged.
detect_outlier <- function(x) {

  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: keeps the "25%"/"75%" labels out of the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)

  # Inter-quartile range (not called `IQR`, to avoid shadowing stats::IQR)
  spread <- quartiles[2] - quartiles[1]

  upper_fence <- quartiles[2] + (spread * 1.5)
  lower_fence <- quartiles[1] - (spread * 1.5)

  flagged <- x > upper_fence | x < lower_fence

  # Comparisons involving NA yield NA; report those as "not an outlier" so
  # the result is always usable as a row mask
  flagged[is.na(flagged)] <- FALSE
  flagged
}

# Drop the rows of a data frame that detect_outlier() flags, one column at a
# time.
#
# Columns are processed sequentially: each column's outliers are computed on
# the rows that survived the previous columns, so the order of `columns`
# affects the result.
#
# Args:
#   dataframe: data frame to filter.
#   columns:   names (or indices) of the columns to screen; defaults to every
#              column of `dataframe`.
#
# Returns:
#   `dataframe` without the flagged rows. A one-line summary of how many rows
#   were dropped is emitted via message().
remove_outlier <- function(dataframe,
                           columns=names(dataframe)) {

  # A misspelled name makes dataframe[[col]] NULL, which yields an empty row
  # mask and silently removes every row - fail loudly instead
  if (is.character(columns)) {
    unknown <- setdiff(columns, names(dataframe))
    if (length(unknown) > 0) {
      stop("Unknown column(s): ", paste(unknown, collapse = ", "),
           call. = FALSE)
    }
  }

  rows_before <- nrow(dataframe)

  for (col in columns) {
    keep <- !detect_outlier(dataframe[[col]])

    # An NA in the mask would insert an all-NA row; keep such rows unchanged
    keep[is.na(keep)] <- TRUE

    # drop = FALSE: a one-column data frame must not collapse to a vector
    dataframe <- dataframe[keep, , drop = FALSE]
  }

  # Report a summary rather than printing the whole data frame on every call
  message("Remove outliers: dropped ", rows_before - nrow(dataframe),
          " of ", rows_before, " rows")
  dataframe
}


In [None]:
# Pooled model over the even years 2012-2022: fit a Gamma (log-link) GAM of the
# event rate `pf` and export its summary, concurvity, smooths and plots.

# Keep only the even years 2012, 2014, ..., 2022
df_year_all <- df[df$accident_year %in% seq(2012, 2022, by = 2), ]

# Keep rows with at least 3 events. which() skips NA comparisons, which plain
# logical indexing would turn into all-NA rows.
df_year_all <- df_year_all[which(df_year_all$Total_Events2 >= 3), ]

# Convert relevant columns to factors
df_year_all$accident_year <- as.factor(df_year_all$accident_year)
df_year_all$Area_Type_y <- as.factor(df_year_all$Area_Type_y)
df_year_all$STATE <- as.factor(df_year_all$STATE)

# Response: events per housing unit, scaled by 100000 * 12
# NOTE(review): presumably an annualised rate per 100,000 units - confirm
df_year_all$pf <-
  df_year_all$Total_Events2 / df_year_all$Total_house_unit * 100000 * 12
df_year_all$build_year_new <- df_year_all$build_time_1980_to_later_list

# Predictors screened for IQR outliers. remove_outlier() filters the columns
# sequentially, in this order, so a single call replaces the per-column loop.
outlier_columns <- c(
  "Black.Alone",
  "Pct_SA_62_and_over_list",
  "Industrial",
  "Transportation.and.storage",
  "Pct_EDU_Bachelor_or_higher_list",
  "POPPCT_URBAN",
  "Pct_HOU_Occupied_units_list",
  "build_year_new",
  "zndx",
  "tmp_c"
)
df_year_all <- remove_outlier(df_year_all, outlier_columns)

# Define the GAM model; k_value is the basis dimension of every smooth.
# factor(Area_Type_y) and factor(accident_year) are intentionally left out.
k_value <- 3
mod_all <- gam(
  pf ~ factor(STATE) +
    s(Pct_HOU_Occupied_units_list, k = k_value) +
    s(build_year_new, k = k_value) +
    s(Pct_EDU_Bachelor_or_higher_list, k = k_value) +
    s(Pct_SA_62_and_over_list, k = k_value) +
    s(Black.Alone, k = k_value) +
    s(Industrial, k = k_value) +
    s(Transportation.and.storage, k = k_value) +
    s(tmp_c, k = k_value) +
    s(zndx, k = k_value) +
    s(POPPCT_URBAN, k = k_value),
  data = df_year_all,
  family = Gamma(link = "log"),
  select = TRUE
)

# Summarize the model
summary_output <- capture.output(summary(mod_all))

# Show the smooths and the concurvity in the notebook output.
# `pages` is spelled out in full instead of relying on partial matching.
plot(mod_all, pages = 1)
concurvity_all <- concurvity(mod_all)
concurvity_all

# Save the summary to a text file
summary_file <- paste0("step_1_results_final_remove_outlier_cm_density_all_year_git", ".txt")
writeLines(summary_output, summary_file)

# Save the concurvity (reusing the value computed above)
concurvity_output <- capture.output(concurvity_all)
concurvity_file <- paste0("concurvity_density_all_year_git", ".txt")
writeLines(concurvity_output, concurvity_file)

# Save the smooth estimates as a flat table.
# NOTE(review): the same model is passed twice to compare_smooths(); confirm
# whether a direct export of this model's smooths was intended.
comp <- compare_smooths(mod_all, mod_all)
rest <- unnest(comp, data)
csv_file <- paste0("step_1_results_final_remove_outlier_cm_density_all_year_git", ".csv")
write.csv(rest, csv_file)

# Plot the GAM results to a PNG file
plot_file <- paste0("plot_mod_density_all_year_git", ".png")
png(plot_file, width = 1200, height = 800)
plot(mod_all, pages = 1)
dev.off()

# GAM diagnostics, written to a PNG file.
# NOTE(review): `page` does not appear to be a named gam.check() argument and
# is forwarded through `...` - confirm it is needed.
gam_check_plot_file <- paste0("gam_check_plots_density_all_year_git", ".png")
png(gam_check_plot_file, width = 800, height = 800)
gam.check(mod_all, rep = 100, page = 1)
dev.off()

In [None]:
# Per-year models: for each even year 2012-2022, fit the same Gamma (log-link)
# GAM on that year's rows and export its summary, concurvity, smooths and
# diagnostic plots to files suffixed with the year.

# Define the years to iterate over
years <- seq(2012, 2022, by = 2)

# Predictors screened for IQR outliers. remove_outlier() filters the columns
# sequentially, in this order, so a single call replaces a per-column loop.
outlier_columns <- c(
  "Black.Alone",
  "Pct_SA_62_and_over_list",
  "Industrial",
  "Transportation.and.storage",
  "Pct_EDU_Bachelor_or_higher_list",
  "POPPCT_URBAN",
  "Pct_HOU_Occupied_units_list",
  "build_year_new",
  "zndx",
  "tmp_c"
)

# Basis dimension shared by every smooth term
k_value <- 3

# Iterate over each year
for (year in years) {

  # Filter the data for the specific year
  df_year <- df[df$accident_year %in% year, ]

  # Keep rows with at least 3 events. which() skips NA comparisons, which
  # plain logical indexing would turn into all-NA rows.
  df_year <- df_year[which(df_year$Total_Events2 >= 3), ]

  # Convert relevant columns to factors
  df_year$accident_year <- as.factor(df_year$accident_year)
  df_year$Area_Type_y <- as.factor(df_year$Area_Type_y)
  df_year$STATE <- as.factor(df_year$STATE)

  # Response: events per housing unit, scaled by 100000 * 12
  # NOTE(review): presumably an annualised rate per 100,000 units - confirm
  df_year$pf <- df_year$Total_Events2 / df_year$Total_house_unit * 100000 * 12
  df_year$build_year_new <- df_year$build_time_1980_to_later_list

  # Remove outliers for the screened columns
  df_year <- remove_outlier(df_year, outlier_columns)

  # Define the GAM model (factor(Area_Type_y) is intentionally left out)
  mod_all <- gam(
    pf ~ factor(STATE) +
      s(Pct_HOU_Occupied_units_list, k = k_value) +
      s(build_year_new, k = k_value) +
      s(Pct_EDU_Bachelor_or_higher_list, k = k_value) +
      s(Pct_SA_62_and_over_list, k = k_value) +
      s(Black.Alone, k = k_value) +
      s(Industrial, k = k_value) +
      s(Transportation.and.storage, k = k_value) +
      s(tmp_c, k = k_value) +
      s(zndx, k = k_value) +
      s(POPPCT_URBAN, k = k_value),
    data = df_year,
    family = Gamma(link = "log"),
    select = TRUE
  )

  # Summarize the model
  summary_output <- capture.output(summary(mod_all))

  # Save the summary to a text file
  summary_file <- paste0("step_1_results_final_remove_outlier_cm_density_", year, ".txt")
  writeLines(summary_output, summary_file)

  # Compute the concurvity once; it is saved here and printed further down
  concurvity_year <- concurvity(mod_all)
  concurvity_output <- capture.output(concurvity_year)
  concurvity_file <- paste0("concurvity_density_", year, ".txt")
  writeLines(concurvity_output, concurvity_file)

  # Save the smooth estimates as a flat table.
  # NOTE(review): the same model is passed twice to compare_smooths(); confirm
  # whether a direct export of this model's smooths was intended.
  comp <- compare_smooths(mod_all, mod_all)
  rest <- unnest(comp, data)
  csv_file <- paste0("step_1_results_final_remove_outlier_cm_density_", year, ".csv")
  write.csv(rest, csv_file)

  # Plot the GAM results to a PNG file; `pages` is spelled out in full instead
  # of relying on partial argument matching
  plot_file <- paste0("plot_mod_all_density_", year, ".png")
  png(plot_file, width = 1200, height = 800)
  plot(mod_all, pages = 1)
  dev.off()

  # GAM diagnostics on the current device (shown in the notebook output).
  # NOTE(review): `page` does not appear to be a named gam.check() argument
  # and is forwarded through `...` - confirm it is needed.
  gam.check(mod_all, rep = 100, page = 1)

  # Print the concurvity to the console
  print(paste0("Concurvity for year ", year, ":"))
  print(concurvity_year)

  # Same diagnostics again, this time written to a PNG file
  gam_check_plot_file <- paste0("gam_check_plots_all_density_", year, ".png")
  png(gam_check_plot_file, width = 800, height = 800)
  gam.check(mod_all, rep = 100, page = 1)
  dev.off()
}




In [None]:
# Map month numbers to season names.
#
# Dec-Feb -> "Winter", Mar-May -> "Spring", Jun-Aug -> "Summer",
# Sep-Nov -> "Autumn".
#
# Args:
#   month: a month number in 1..12, or a vector of them.
#
# Returns:
#   Character vector the same length as `month`. Missing or out-of-range
#   months give NA_character_ (rather than NULL), so the result always has a
#   stable type when collected with sapply()/vapply().
get_season <- function(month) {
  # Season for month 1, 2, ..., 12
  season_by_month <- c(
    "Winter", "Winter",
    "Spring", "Spring", "Spring",
    "Summer", "Summer", "Summer",
    "Autumn", "Autumn", "Autumn",
    "Winter"
  )

  # match() uses the same matching rules as %in% and returns NA when there is
  # no match, which indexes to NA_character_
  season_by_month[match(month, 1:12)]
}

# Per-season models: label every row of `df` with its season, then for each
# season fit the same Gamma (log-link) GAM on that season's rows and export
# its summary, concurvity, smooths and diagnostic plots.

# Add a season column to the dataframe. vapply() guarantees a character
# column; a NULL from get_season() (unrecognised month) is stored as NA
# instead of turning the whole column into a list.
df$season <- vapply(
  df$accident_month,
  function(month) {
    label <- get_season(month)
    if (is.null(label)) NA_character_ else label
  },
  character(1)
)

# Iterate over seasons
seasons <- c("Winter", "Spring", "Summer", "Autumn")

# Predictors screened for IQR outliers. remove_outlier() filters the columns
# sequentially, in this order, so a single call replaces a per-column loop.
outlier_columns <- c(
  "Black.Alone",
  "Pct_SA_62_and_over_list",
  "Industrial",
  "Transportation.and.storage",
  "Pct_EDU_Bachelor_or_higher_list",
  "POPPCT_URBAN",
  "Pct_HOU_Occupied_units_list",
  "build_year_new",
  "zndx",
  "tmp_c"
)

# Basis dimension shared by every smooth term
k_value <- 3

for (season in seasons) {

  # Filter the data for the specific season. which() skips rows whose season
  # is NA, which plain logical indexing would turn into all-NA rows.
  df_season <- df[which(df$season == season), ]

  # Keep rows with at least 3 events (NA-safe for the same reason)
  df_season <- df_season[which(df_season$Total_Events2 >= 3), ]

  # Convert relevant columns to factors
  df_season$accident_year <- as.factor(df_season$accident_year)
  df_season$Area_Type_y <- as.factor(df_season$Area_Type_y)

  # Response: events per housing unit, scaled by 100000 * 12
  # NOTE(review): presumably an annualised rate per 100,000 units - confirm
  df_season$pf <-
    df_season$Total_Events2 / df_season$Total_house_unit * 100000 * 12
  df_season$build_year_new <- df_season$build_time_1980_to_later_list

  # Remove outliers for the screened columns
  df_season <- remove_outlier(df_season, outlier_columns)

  # Define the GAM model (factor(Area_Type_y) is intentionally left out)
  mod_all <- gam(
    pf ~ factor(STATE) +
      s(Pct_HOU_Occupied_units_list, k = k_value) +
      s(build_year_new, k = k_value) +
      s(Pct_EDU_Bachelor_or_higher_list, k = k_value) +
      s(Pct_SA_62_and_over_list, k = k_value) +
      s(Black.Alone, k = k_value) +
      s(Industrial, k = k_value) +
      s(Transportation.and.storage, k = k_value) +
      s(tmp_c, k = k_value) +
      s(zndx, k = k_value) +
      s(POPPCT_URBAN, k = k_value),
    data = df_season,
    family = Gamma(link = "log"),
    select = TRUE
  )

  # Summarize the model
  summary_output <- capture.output(summary(mod_all))

  # Save the summary to a text file
  summary_file <- paste0("step_1_results_final_remove_outlier_cm_density_", season, ".txt")
  writeLines(summary_output, summary_file)

  # Compute the concurvity once; it is saved here and printed further down
  concurvity_season <- concurvity(mod_all)
  concurvity_output <- capture.output(concurvity_season)
  concurvity_file <- paste0("concurvity_density_", season, ".txt")
  writeLines(concurvity_output, concurvity_file)

  # Save the smooth estimates as a flat table.
  # NOTE(review): the same model is passed twice to compare_smooths(); confirm
  # whether a direct export of this model's smooths was intended.
  comp <- compare_smooths(mod_all, mod_all)
  rest <- unnest(comp, data)
  csv_file <- paste0("step_1_results_final_remove_outlier_cm_density_", season, ".csv")
  write.csv(rest, csv_file)

  # Plot the GAM results to a PNG file; `pages` is spelled out in full instead
  # of relying on partial argument matching
  plot_file <- paste0("plot_mod_all_density_", season, ".png")
  png(plot_file, width = 1200, height = 800)
  plot(mod_all, pages = 1)
  dev.off()

  # GAM diagnostics on the current device (shown in the notebook output).
  # NOTE(review): `page` does not appear to be a named gam.check() argument
  # and is forwarded through `...` - confirm it is needed.
  gam.check(mod_all, rep = 100, page = 1)

  # Print the concurvity to the console
  print(paste0("Concurvity for season ", season, ":"))
  print(concurvity_season)

  # Same diagnostics again, this time written to a PNG file
  gam_check_plot_file <- paste0("gam_check_plots_all_density_", season, ".png")
  png(gam_check_plot_file, width = 800, height = 800)
  gam.check(mod_all, rep = 100, page = 1)
  dev.off()
}



In [None]:
# Create a lookup for regions based on state abbreviations
region_lookup <- list(
  "Northeast" = c("CT", "ME", "MA", "NH", "NJ", "NY", "PA", "RI", "VT"),
  "Midwest" = c("IL", "IN", "IA", "KS", "MI", "MN", "MO", "NE", "ND", "OH", "SD", "WI"),
  "South" = c("AL", "AR", "DE", "DC", "FL", "GA", "KY", "LA", "MD", "MS", "NC", "OK", "SC", "TN", "TX", "VA", "WV"),
  "West" = c("AK", "AZ", "CA", "CO", "HI", "ID", "MT", "NV", "NM", "OR", "UT", "WA", "WY")
)

# Invert the lookup into a named vector: state abbreviation -> region name
state_to_region <- setNames(
  rep(names(region_lookup), lengths(region_lookup)),
  unlist(region_lookup, use.names = FALSE)
)

# Add a region column to the dataframe with one vectorised lookup instead of
# a per-row search. as.character() lets a factor STATE index by label; states
# that match no region get NA. The result is always an unnamed character
# vector, whatever the input looks like.
df$region <- unname(state_to_region[as.character(df$STATE)])

# Per-region models: for each region, fit the same Gamma (log-link) GAM on
# that region's rows and export its summary, concurvity, smooths and
# diagnostic plots to files suffixed with the region name.

# Iterate over regions
regions <- c("Northeast", "Midwest", "South", "West")

# Predictors screened for IQR outliers. remove_outlier() filters the columns
# sequentially, in this order, so a single call replaces a per-column loop.
outlier_columns <- c(
  "Black.Alone",
  "Pct_SA_62_and_over_list",
  "Industrial",
  "Transportation.and.storage",
  "Pct_EDU_Bachelor_or_higher_list",
  "POPPCT_URBAN",
  "Pct_HOU_Occupied_units_list",
  "build_year_new",
  "zndx",
  "tmp_c"
)

# Basis dimension shared by every smooth term
k_value <- 3

for (region in regions) {

  # Filter the data for the specific region. Rows whose state matched no
  # region carry NA in df$region; which() skips them, whereas plain logical
  # indexing would insert an all-NA row for each one.
  df_region <- df[which(df$region == region), ]

  # Keep rows with at least 3 events (NA-safe for the same reason)
  df_region <- df_region[which(df_region$Total_Events2 >= 3), ]

  # Convert relevant columns to factors
  df_region$accident_year <- as.factor(df_region$accident_year)
  df_region$Area_Type_y <- as.factor(df_region$Area_Type_y)
  df_region$build_year_new <- df_region$build_time_1980_to_later_list

  # Response: events per housing unit, scaled by 100000 * 12
  # NOTE(review): presumably an annualised rate per 100,000 units - confirm
  df_region$pf <-
    df_region$Total_Events2 / df_region$Total_house_unit * 100000 * 12

  # Remove outliers for the screened columns
  df_region <- remove_outlier(df_region, outlier_columns)

  # Define the GAM model (factor(Area_Type_y) is intentionally left out)
  mod_all <- gam(
    pf ~ factor(STATE) +
      s(Pct_HOU_Occupied_units_list, k = k_value) +
      s(build_year_new, k = k_value) +
      s(Pct_EDU_Bachelor_or_higher_list, k = k_value) +
      s(Pct_SA_62_and_over_list, k = k_value) +
      s(Black.Alone, k = k_value) +
      s(Industrial, k = k_value) +
      s(Transportation.and.storage, k = k_value) +
      s(tmp_c, k = k_value) +
      s(zndx, k = k_value) +
      s(POPPCT_URBAN, k = k_value),
    data = df_region,
    family = Gamma(link = "log"),
    select = TRUE
  )

  # Summarize the model
  summary_output <- capture.output(summary(mod_all))

  # Save the summary to a text file
  summary_file <- paste0("step_1_results_final_remove_outlier_cm_density_", region, ".txt")
  writeLines(summary_output, summary_file)

  # Compute the concurvity once; it is saved here and printed further down
  concurvity_region <- concurvity(mod_all)
  concurvity_output <- capture.output(concurvity_region)
  concurvity_file <- paste0("concurvity_density_", region, ".txt")
  writeLines(concurvity_output, concurvity_file)

  # Save the smooth estimates as a flat table.
  # NOTE(review): the same model is passed twice to compare_smooths(); confirm
  # whether a direct export of this model's smooths was intended.
  comp <- compare_smooths(mod_all, mod_all)
  rest <- unnest(comp, data)
  csv_file <- paste0("step_1_results_final_remove_outlier_cm_density_", region, ".csv")
  write.csv(rest, csv_file)

  # Plot the GAM results to a PNG file; `pages` is spelled out in full instead
  # of relying on partial argument matching
  plot_file <- paste0("plot_mod_all_density_", region, ".png")
  png(plot_file, width = 1200, height = 800)
  plot(mod_all, pages = 1)
  dev.off()

  # GAM diagnostics on the current device (shown in the notebook output).
  # NOTE(review): `page` does not appear to be a named gam.check() argument
  # and is forwarded through `...` - confirm it is needed.
  gam.check(mod_all, rep = 100, page = 1)

  # Print the concurvity to the console
  print(paste0("Concurvity for region ", region, ":"))
  print(concurvity_region)

  # Same diagnostics again, this time written to a PNG file
  gam_check_plot_file <- paste0("gam_check_plots_all_density_", region, ".png")
  png(gam_check_plot_file, width = 800, height = 800)
  gam.check(mod_all, rep = 100, page = 1)
  dev.off()
}

