/
01_DataCleaning.R
157 lines (109 loc) · 5.2 KB
/
01_DataCleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#######################################
## Data cleaning
## Author: Francesca Mancini
## Date created: 2019-08-08
## Date modified: 2019-08-19
#######################################
library(BRCmap)
library(RODBC)
library(dplyr)
library(ggplot2)
library(sparta)
# load anc clean the hoverfly recording scheme data ----
hrs <- read.csv("..\\Data\\HRS\\RD_HRS.csv", header = T, stringsAsFactors = F)
str(hrs)
summary(hrs$DT_ID)
# I should delete everything that has DT_ID = 8, no date provided
# hrs <- subset(hrs, DT_ID != 8)
hrs <- subset(hrs, DT_ID == 1)
# convert to date format
hrs$startdate <- as.Date(hrs$TO_STARTDATE, format = "%d/%m/%Y")
hrs$enddate <- as.Date(hrs$TO_ENDDATE, format = "%d/%m/%Y")
# retain only 2017 records
hrs <- subset(hrs, format(hrs$enddate,"%Y")==2017)
# retain only records with precision 1Km or less
hrs <- subset(hrs, TO_PRECISION <= 1000)
# change precision of gridref to 5 Km to increase overlap with PoMS data
hrs$GRIDREF_5KM_PREC <- reformat_gr(hrs$TO_GRIDREF, prec_out = 5000,
precision = hrs$TO_PRECISION)
# obtain species names from concept codes -----
# if(.Platform$GUI =="RStudio"){
# brc_pass = winDialogString("Enter database password","")
# channel = odbcConnect("FRAMAN",pwd=brc_pass, believeNRows = FALSE)
# rm(brc_pass)
# }else{
# channel = odbcConnect("FRAMAN", believeNRows = FALSE)
# }
#
# sp_names <- sqlQuery(channel, paste("select name, concept from brc.taxa_taxon_register where concept in (",
# paste(shQuote(unique(hrs[, "CONCEPT"]), type="sh"), collapse = ","), ") and valid in ('V', 'X')"))
#
# write.csv(sp_names, "../Data/HRS/Sp_Names_all.csv", row.names = FALSE)
sp_names <- read.csv("../Data/HRS/Sp_Names_all.csv")
str(sp_names)
# returns 240 species names, when we only have 238 species codes
which(duplicated(sp_names$CONCEPT) == TRUE)
# species Dip_3430 and Dip_3434 have two associated names: Eristalis abusivus/abusiva and Eristalis intricarius/intricaria
# in PoMS these species are referred as abusivus and intricarius, therefore we are using same names here
sp_names <- sp_names[-c(117, 122), ]
hrs_sp_names <- left_join(hrs, sp_names, by = "CONCEPT")
## find the mistakes ----
## the only possible erroneous combinations are
## those where the day is reported as 1 to 12
## Also, the errors that are likely to cause problems are
## those in the winter months, because that will
## make the Julian date estimation difficult to converge
## If the 6th of July has been reported as the 7Th of June
## it won't make much of a difference because in both months
## we expect similar numbers of hoverflies recorded
# extract the observation with those characteristics
poss_errors <- subset(hrs_sp_names,
format(hrs_sp_names$startdate,"%d") < 13)
summary(poss_errors)
summary(hrs_sp_names)
# group the observations by site and date
# to calculate a list length for every
# possibly wrong visit and plot the LL by data
poss_errors_grouped <- poss_errors %>%
group_by(TO_GRIDREF, startdate) %>%
summarise(list_length = n_distinct(NAME)) %>%
ungroup()
plot(poss_errors_grouped$list_length ~ poss_errors_grouped$startdate)
# from the plot it is clear that there are some potentially wrong observations
# in January, February and December, we can extract them from the plot
poss_outliers <- identify(poss_errors_grouped$startdate, poss_errors_grouped$list_length)
poss_outliers <- poss_errors_grouped[outliers,]
# Following Kath's observation that some of these records
# may be duplicates of correct records, where the date has been transposed
# we can try to extract from the original dataset all the records
# from the sites in outliers and the same and trasnposed date
poss_duplicates <- hrs_sp_names %>%
filter(TO_GRIDREF %in% outliers$TO_GRIDREF) %>%
filter(startdate %in% outliers$startdate |
startdate %in% as.Date(outliers$startdate, format = "%Y-d%-m%")) #%>%
# group_by(TO_GRIDREF, CONCEPT, ABUNDANCE_COMMENT) %>%
# filter(n() > 1) #%>%
# do(filter(as.Date(TO_STARTDATE, format = "%d/%m/%Y")))
# write.csv(poss_duplicates, "./Data/HRS_2017_possible_duplicates.csv",
# row.names = FALSE)
# after manuallly looking at the csv file
# it appears that the records are not duplicated,
# therefore the best thing to do at the moment
# is to exclude all those records that are very unlikely
# from the dataset
# these dates will have to be revisited and
# the accuracy f the dates verified from the original data files
plot(poss_errors_grouped$list_length ~ poss_errors_grouped$startdate)
outliers <- identify(poss_errors_grouped$startdate, poss_errors_grouped$list_length)
outliers <- poss_errors_grouped[outliers,]
# filter the original data using site and date from outliers
hrs_sp_names_filtered <- hrs_sp_names %>%
anti_join(outliers, by = c("TO_GRIDREF", "startdate"))
# make sure all outliers have been removed
hrs_sp_names_filtered_grouped <- hrs_sp_names_filtered %>%
group_by(TO_GRIDREF, startdate) %>%
summarise(list_length = n_distinct(NAME)) %>%
ungroup()
plot(hrs_sp_names_filtered_grouped$list_length ~ hrs_sp_names_filtered_grouped$startdate)
write.csv(hrs_sp_names_filtered, "./Data/HRS_2017_filtered.csv",
row.names = FALSE)