adding more beaches for modeling

Chicago · Sep 12, 2017 · 5a4c44f · 5a4c44f
1 parent c0c180a
commit 5a4c44f
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 17 deletions.
diff --git a/Data/df.Rds b/Data/df.Rds
diff --git a/Master.R b/Master.R
@@ -37,12 +37,20 @@ rm(list=ls()[!ls() %in% keep])
 #  Comment out the predictors that you do not want to use
 #-------------------------------------------------------------------------------
 
+# keep only the 2012-2016 seasons (drop older years)
+df <- df[df$Year %in% c("2012", "2013", "2014", "2015", "2016"),]
+
 # set predictors
 df_model <- df[, c("Escherichia.coli", #dependent variable
                    "Client.ID",
                    # "precipProbability",
                    # "Water.Level",
-                   "Rogers_Escherichia.coli",
+                   "n12th_Escherichia.coli",
+                   "Foster_Escherichia.coli",
+                   "North_Avenue_Escherichia.coli",
+                   "n39th_Escherichia.coli",
+                   "Albion_Escherichia.coli",
+                   # "Rogers_Escherichia.coli",
                    # "Howard_Escherichia.coli",
                    # "n57th_Escherichia.coli", 
                    # "n63rd_Escherichia.coli",
@@ -52,16 +60,16 @@ df_model <- df[, c("Escherichia.coli", #dependent variable
                    # "Rainbow_Escherichia.coli",
                    # "Ohio_DNA.Geo.Mean",
                    # "North_Avenue_DNA.Geo.Mean",
-                   "n63rd_DNA.Geo.Mean",
-                   "South_Shore_DNA.Geo.Mean",
-                   "Montrose_DNA.Geo.Mean",
-                   "Calumet_DNA.Geo.Mean",
-                   "Rainbow_DNA.Geo.Mean",
-                   "Date", #Must use for splitting data, not included in model
-                   "Predicted.Level" #Must use for USGS model comparison, not included in model
+                   # "n63rd_DNA.Geo.Mean",
+                   # "South_Shore_DNA.Geo.Mean",
+                   # "Montrose_DNA.Geo.Mean",
+                   # "Calumet_DNA.Geo.Mean",
+                   # "Rainbow_DNA.Geo.Mean",
+                   "Date" #Must use for splitting data, not included in model
+                   # "Predicted.Level" #Must use for USGS model comparison, not included in model
                    )]
 # to run without USGS for comparison, comment out "Predicted.Level" above and uncomment next line
-# df_model$Predicted.Level <- 1 #meaningless value
+df_model$Predicted.Level <- 1 #placeholder (meaningless value) used when running without the USGS comparison
 
 #-------------------------------------------------------------------------------
 #  CHOOSE TEST/TRAIN SETS
@@ -70,8 +78,8 @@ df_model <- df[, c("Escherichia.coli", #dependent variable
 #  If you set kFolds to FALSE, the model will use trainStart, trainEnd, etc. (see below)
 #-------------------------------------------------------------------------------
 
-kFolds <- TRUE #If TRUE next 4 lines will not be used but cannot be commented out
-trainStart <- "2006-01-01"
+kFolds <- FALSE #If TRUE, the next 4 lines are not used, but they must not be commented out
+trainStart <- "2012-01-01"
 trainEnd <- "2015-12-31"
 testStart <- "2016-01-01"
 testEnd <- "2016-12-31"
@@ -107,25 +115,25 @@ downsample <- FALSE #If FALSE comment out the next 3 lines
 #-------------------------------------------------------------------------------
 
 excludeBeaches <- c(
-                    # "12th",
+                    "12th",
                     # "31st",
-                    # "39th",
+                    "39th",
                     # "57th",
                     "63rd",
-                    # "Albion",
+                    "Albion",
                     "Calumet",
-                    # "Foster",
+                    "Foster",
                     # "Howard",
                     # "Jarvis",
                     # "Juneway",
                     # "Leone",
                     "Montrose",
-                    # "North Avenue",
+                    "North Avenue",
                     # "Oak Street",
                     # "Ohio",
                     # "Osterman",
                     "Rainbow",
-                    "Rogers",
+                    # "Rogers",
                     "South Shore"
                     )
 

diff --git a/R/20_Clean.R b/R/20_Clean.R
@@ -129,6 +129,8 @@ names(df)[names(df) == "North Avenue_DNA.Geo.Mean"] <- "North_Avenue_DNA.Geo.Mea
 names(df)[names(df) == "Oak Street_DNA.Geo.Mean"] <- "Oak_Street_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_DNA.Geo.Mean"] <- "South_Shore_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_Escherichia.coli"] <- "South_Shore_Escherichia.coli"
+names(df)[names(df) == "North Avenue_Escherichia.coli"] <- "North_Avenue_Escherichia.coli"
+
 
 
 #remove times from Date variable