In [None]:
# Size used by the notebook's repr machinery when rendering plots.
options(
    repr.plot.width = 6,
    repr.plot.height = 4
)

In [None]:
# Install tidyverse and data.table only when they are missing, then attach them.
# requireNamespace() tests availability without attaching the package, so the
# check has no side effect on the search path; library() then attaches the
# package and stops with an error if it still cannot be loaded.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse")
}
library(tidyverse)

if (!requireNamespace("data.table", quietly = TRUE)) {
    install.packages("data.table")
}
library(data.table)

In [None]:
# Load the expense report rows from the CSV file in the working directory.
expenseData <- readr::read_csv(file = "ExpenseReports.csv")

In [None]:
# Preview the first six rows.
head(x = expenseData, n = 6L)

In [None]:
# Convert Date using the month/day/four-digit-year format, then derive an
# integer Year column from the converted dates.
expenseData$Date <- as.Date(expenseData$Date, format = "%m/%d/%Y")
expenseData$Year <- expenseData$Date %>%
    lubridate::year() %>%
    as.integer()

In [None]:
# Show the last ten rows.
tail(x = expenseData, n = 10)

In [None]:
# Number of rows in the data set.
nrow(x = expenseData)

In [None]:
# Compact display of the column types and leading values.
str(object = expenseData)

In [None]:
# Column-per-line overview of the same data.
glimpse(x = expenseData)

In [None]:
# Per-column summary statistics.
summary(object = expenseData)

In [None]:
# One row per Year: row count plus the sum, mean and median of Amount.
aggregates <- expenseData %>%
    dplyr::group_by(Year) %>%
    dplyr::summarise(
        n = dplyr::n(),
        total = sum(Amount),
        meanAmt = mean(Amount),
        medianAmt = median(Amount)
    )

In [None]:
# Line chart of the yearly mean Amount, with one x-axis break per year 2011-2019.
aggregates %>%
    ggplot(mapping = aes(x = Year, y = meanAmt)) +
    geom_line() +
    scale_x_continuous(breaks = 2011:2019) +
    theme_minimal()

In [None]:
# Line chart of the yearly median Amount, with one x-axis break per year 2011-2019.
aggregates %>%
    ggplot(mapping = aes(x = Year, y = medianAmt)) +
    geom_line() +
    scale_x_continuous(breaks = 2011:2019) +
    theme_minimal()

In [None]:
# One row per (Year, CityType) pair: row count plus the sum, mean and median
# of Amount.
cityTypeAggregates <- expenseData %>%
    dplyr::group_by(Year, CityType) %>%
    dplyr::summarise(
        n = dplyr::n(),
        total = sum(Amount),
        meanAmt = mean(Amount),
        medianAmt = median(Amount)
    )

In [None]:
# Yearly median Amount as one coloured line per CityType.
cityTypeAggregates %>%
    ggplot(mapping = aes(x = Year, y = medianAmt, color = CityType)) +
    geom_line() +
    scale_x_continuous(breaks = 2011:2019) +
    scale_color_brewer(type = "qual", palette = "Dark2") +
    theme_minimal()

In [None]:
# Linear model of Amount with EmployeeName as the only predictor.
reg <- lm(Amount ~ EmployeeName, data = expenseData)

In [None]:
# Coefficients and fit statistics for the model currently stored in reg.
summary(object = reg)

In [None]:
# Refit reg with CityType and Year as predictors of Amount.
reg <- lm(Amount ~ CityType + Year, data = expenseData)

In [None]:
# Coefficients and fit statistics for the model currently stored in reg.
summary(object = reg)

In [None]:
# Refit reg with CityType, EmployeeName and Year as predictors of Amount,
# then show the fit summary.
reg <- lm(
    Amount ~ CityType + EmployeeName + Year,
    data = expenseData
)
summary(object = reg)

In [None]:
# Distinct CityType values, in order of first appearance.
unique(x = expenseData$CityType)

In [None]:
# How many distinct CityType values there are.
expenseData$CityType %>%
    unique() %>%
    length()

In [None]:
# Number of distinct values in each column of expenseData.
rapply(
    expenseData,
    f = function(column) length(unique(column))
)

In [None]:
# Convert expenseData to a data.table by reference, then count rows per
# EmployeeName and list the counts from largest to smallest.
data.table::setDT(expenseData)
expenseData[, .N, keyby = EmployeeName] %>%
    dplyr::arrange(dplyr::desc(N))

In [None]:
# Row count per CityType, keyed (and therefore sorted) by CityType.
data.table::setDT(expenseData)
expenseData[, .N, keyby = CityType]

In [None]:
# Wide table with one row per EmployeeName and one column per CityType; each
# cell is the number of Year values (i.e. rows) for that combination.
data.table::dcast(
    expenseData,
    EmployeeName ~ CityType,
    fun.aggregate = length,
    value.var = "Year"
)

In [None]:
# Wide table with one row per EmployeeName and one column per CityType; each
# cell is the summed Amount for that combination.
data.table::dcast(
    expenseData,
    EmployeeName ~ CityType,
    fun.aggregate = sum,
    value.var = "Amount"
)

In [None]:
# The twenty most frequent exact Amount values with their row counts.
data.table::setDT(expenseData)
expenseData[, .N, keyby = Amount] %>%
    dplyr::arrange(dplyr::desc(N)) %>%
    head(20)

In [None]:
# The ten most frequent Amount values after truncation to an integer.
data.table::setDT(expenseData)
expenseData[, .N, keyby = as.integer(Amount)] %>%
    dplyr::arrange(dplyr::desc(N)) %>%
    head(10)

In [None]:
# Scatter plot of every expense Amount over time, coloured by CityType.
# alpha is set as a fixed geom parameter rather than inside aes(): inside
# aes() the constant 0.1 is treated as data and mapped through the alpha
# scale, which does not draw the points at 10% opacity and adds a
# meaningless "alpha" legend.
ggplot(data = expenseData, aes(x = Date, y = Amount, color = CityType)) +
    geom_point(alpha = 0.1) +
    scale_color_brewer(type = "qual", palette = "Dark2") +
    theme_minimal()

In [None]:
# Split the data into one subset per CityType label.
inexpensiveCities <- expenseData %>%
    dplyr::filter(CityType == "Inexpensive City")
moderatelyExpensiveCities <- expenseData %>%
    dplyr::filter(CityType == "Moderately Expensive City")
expensiveCities <- expenseData %>%
    dplyr::filter(CityType == "Expensive City")

In [None]:
# Distribution of Amount per year for the inexpensiveCities subset.
inexpensiveCities %>%
    ggplot(mapping = aes(x = factor(Year), y = Amount)) +
    geom_boxplot() +
    theme_minimal()

In [None]:
# Distribution of Amount per year for the moderatelyExpensiveCities subset.
moderatelyExpensiveCities %>%
    ggplot(mapping = aes(x = factor(Year), y = Amount)) +
    geom_boxplot() +
    theme_minimal()

In [None]:
# Distribution of Amount per year for the expensiveCities subset.
expensiveCities %>%
    ggplot(mapping = aes(x = factor(Year), y = Amount)) +
    geom_boxplot() +
    theme_minimal()

In [None]:
# Histogram of Amount for the expensiveCities subset, in bins 3 units wide.
expensiveCities %>%
    ggplot(mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 3) +
    theme_minimal()

In [None]:
# Histogram of Amount for the moderatelyExpensiveCities subset, in bins
# 2 units wide.
moderatelyExpensiveCities %>%
    ggplot(mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 2) +
    theme_minimal()

In [None]:
# Histogram of Amount for the inexpensiveCities subset, in bins 1 unit wide.
inexpensiveCities %>%
    ggplot(mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 1) +
    theme_minimal()

In [None]:
# Same 1-unit histogram of the inexpensiveCities subset, split into one
# panel per year and laid out three panels per row.
inexpensiveCities %>%
    ggplot(mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~factor(Year), ncol = 3) +
    theme_minimal()

In [None]:
# Per-year 1-unit histograms of Amount for a single employee within the
# inexpensiveCities subset.
inexpensiveCities %>%
    dplyr::filter(EmployeeName == "Sophia Aubrey") %>%
    ggplot(mapping = aes(x = Amount)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~Year, ncol = 3) +
    theme_minimal()

In [None]:
# Integer part of each Amount (as.integer() truncates toward zero).
expenseData$AmountInt <- with(expenseData, as.integer(Amount))

In [None]:
# Number of trailing zero digits in each AmountInt.
# The digits are rendered with sprintf("%d") instead of relying on the
# implicit as.character() inside nchar()/sub(): as.character() may switch to
# scientific notation (100000L becomes "1e+05"), which would report zero
# trailing zeroes for such values. Missing amounts yield NA.
# Note that 0 is rendered as "0" and therefore counts as one trailing zero.
expenseData$TrailingZeroes <- local({
    digits <- sprintf("%d", expenseData$AmountInt)
    zeroCount <- nchar(digits) - nchar(sub("0*$", "", digits))
    zeroCount[is.na(expenseData$AmountInt)] <- NA_integer_
    zeroCount
})

In [None]:
# Row count for each distinct TrailingZeroes value.
expenseData %>%
    dplyr::group_by(TrailingZeroes) %>%
    dplyr::summarise(n = dplyr::n())

In [None]:
# Row count for each value of AmountInt modulo 10 (the last digit).
expenseData %>%
    dplyr::group_by(AmountInt %% 10) %>%
    dplyr::summarise(n = dplyr::n())

In [None]:
# Last-digit counts again, after dropping rows whose AmountInt is exactly 39.
expenseData %>%
    dplyr::filter(AmountInt != 39) %>%
    dplyr::group_by(AmountInt %% 10) %>%
    dplyr::summarise(n = dplyr::n())

In [None]:
# Last-digit counts restricted to AmountInt values from 30 up to, but not
# including, 39.
expenseData %>%
    dplyr::filter(AmountInt >= 30, AmountInt < 39) %>%
    dplyr::group_by(AmountInt %% 10) %>%
    dplyr::summarise(n = dplyr::n())

In [None]:
# Install benford.analysis from the given CRAN mirror only when it is missing,
# then attach it. requireNamespace() tests availability without attaching the
# package; library() attaches it and stops with an error if it still cannot
# be loaded.
if (!requireNamespace("benford.analysis", quietly = TRUE)) {
    install.packages("benford.analysis", repos = "http://cran.us.r-project.org")
}
library(benford.analysis)

In [None]:
# Benford analysis of the first digit of every Amount.
cp <- benford(
    data = expenseData$Amount,
    number.of.digits = 1,
    sign = "positive",
    discrete = TRUE,
    round = 3
)

In [None]:
# Plot the analysis, leaving out the panels listed in `except`.
plot(
    cp,
    except = c(
        "second order", "summation", "mantissa",
        "chi square", "abs diff", "ex summation"
    )
)

In [None]:
# Repeat the first-digit analysis with every amount whose integer part is 39
# replaced by 19; all other rows keep their original Amount.
mungedInts <- dplyr::case_when(
    expenseData$AmountInt == 39 ~ 19,
    TRUE ~ expenseData$Amount
)
cp2 <- benford(
    data = mungedInts,
    number.of.digits = 1,
    sign = "positive",
    discrete = TRUE,
    round = 3
)
plot(
    cp2,
    except = c(
        "second order", "summation", "mantissa",
        "chi square", "abs diff", "ex summation"
    )
)

In [None]:
# First-digit Benford plot for amounts dated before 2017-01-01.
plot(
    benford(
        data = expenseData %>%
            dplyr::filter(Date < "2017-01-01") %>%
            dplyr::pull(Amount),
        number.of.digits = 1,
        discrete = TRUE
    ),
    except = c(
        "second order", "summation", "mantissa",
        "chi square", "abs diff", "ex summation"
    )
)

In [None]:
# First-digit Benford plot for amounts dated on or after 2017-01-01.
plot(
    benford(
        data = expenseData %>%
            dplyr::filter(Date >= "2017-01-01") %>%
            dplyr::pull(Amount),
        number.of.digits = 1,
        discrete = TRUE
    ),
    except = c(
        "second order", "summation", "mantissa",
        "chi square", "abs diff", "ex summation"
    )
)