From c200b10e6809a099e4c42b3f0bc3cd143adb9e3a Mon Sep 17 00:00:00 2001 From: NateRByers Date: Sat, 27 Apr 2024 13:34:33 -0400 Subject: [PATCH 1/3] intro --- R/build_all_lessons.R | 2 +- docs/1-Introduction/readme.md | 64 +++++++-------- docs/2-Functions-and-Importing-Data/readme.md | 58 +++++++------- .../readme.md | 78 +++++++++---------- .../readme.md | 46 +++++------ docs/5-Plotting/readme.md | 48 ++++++------ docs/6-Basic-Statistics/readme.md | 46 +++++------ docs/7-Quality-Assurance/readme.md | 14 ++-- inst/tutorials/1-Introduction/lesson.Rmd | 60 +++++++------- .../2-Functions-and-Importing-Data/lesson.Rmd | 58 +++++++------- .../lesson.Rmd | 78 +++++++++---------- .../lesson.Rmd | 46 +++++------ inst/tutorials/5-Plotting/lesson.Rmd | 48 ++++++------ inst/tutorials/6-Basic-Statistics/lesson.Rmd | 46 +++++------ inst/tutorials/7-Quality-Assurance/lesson.Rmd | 14 ++-- source/1-Introduction/lesson1.yaml | 37 +++++---- 16 files changed, 375 insertions(+), 368 deletions(-) diff --git a/R/build_all_lessons.R b/R/build_all_lessons.R index af6bca1..553dfa4 100644 --- a/R/build_all_lessons.R +++ b/R/build_all_lessons.R @@ -10,7 +10,7 @@ #' @export build_lessons <- function(source_dir = file.path(getwd(), "source"), learnr = TRUE, github = TRUE) { # Validate base directory - if (!dir.exists(base_dir)) { + if (!dir.exists(source_dir)) { stop("The specified base directory does not exist.") } diff --git a/docs/1-Introduction/readme.md b/docs/1-Introduction/readme.md index a6b70cd..093fa37 100644 --- a/docs/1-Introduction/readme.md +++ b/docs/1-Introduction/readme.md @@ -12,7 +12,7 @@ This lesson is a part of the Introduction to R for Air Quality Data Science. The - [Basic Math](#basic-math) - [Order of Operations](#order-of-operations) -- [Note on Comments and Code Blocks](#note-on-comments-and-code-blocks) +- [Comments and Code Blocks](#comments-and-code-blocks) - [Variables](#variables) @@ -55,7 +55,7 @@ To download R, [see this page](https://cran.r-project.org/). 
You will need to se system (PC or Mac). Accept the default options during the installation. -Once you have installed R, you can open the program itself. On PC, if you have selected the desktop shortcut during installation, the R icon +Once you have installed R, you can open the program itself. On a PC, if you have selected the desktop shortcut during installation, the R icon will look like this: @@ -108,7 +108,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. -```{r ex-eiHaz-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-bXHav-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -135,17 +135,17 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. -```{r ex-C58km-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-oWkbF-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-2P9sT-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-tpqM5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` -## Note on Comments and Code Blocks +## Comments and Code Blocks To write a comment in your script that will not be evaluated, type `#` in front of your comment. The text after `#` will not be evaluated. There is no multi-line commenting in R, so every comment line must begin with the `#` character. @@ -154,17 +154,17 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). 
-```{r ex-7hEfM-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-vnjMK-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment ``` -In the example above and the previous section, you have seen the R code and its output. The code blocks with output look like this, with `1+1` being +In the example above and the previous section, you can see the R code and its output. The code blocks with output look like this, with `1+1` being the R code and `## [1] 2` being the output: -```{r ex-l629B-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Code and Output Example'} +```{r ex-ygOvs-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -172,8 +172,9 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-hkQIj-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Console Code Execution'} -1 + 1 +```{r ex-sjM7J-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +> 1 + 1 +[1] 2 ``` @@ -183,7 +184,7 @@ A variable is a letter or combination of alphanumeric characters that is used to with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. -```{r ex-JADmU-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-6kwt4-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -198,12 +199,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value. 
-```{r ex-Y1soB-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-QTvM3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-O0ucr-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-licIL-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -220,9 +221,9 @@ There are 3 important rules to remember when creating variable names: Try running the following code and you will see that in your global environment there are two different objects listed. -```{r ex-5emEn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} -x <- 5 -X <- 5 +```{r ex-E0eQ3-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +y <- 5 +Y <- 10 ``` @@ -237,13 +238,14 @@ R has three main data types: | numeric | numbers | `1`, `3.14`, `log(10)` | | logical | binary | `TRUE`, `FALSE` | -The `character` type requires single or double quotes. The logical values -`TRUE` and `FALSE` should not be quoted and require full caps. +The `character` type requires single or double quotes. The numeric type +must be unquoted numbers, and the full-caps logical values `TRUE` and +`FALSE` must also be unquoted. ## Grouping Data -There are several ways to group data to make them easier to work with: +There are several ways to store groups of data to make them easier to work with: ## Vectors @@ -251,13 +253,13 @@ There are several ways to group data to make them easier to work with: A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. 
-```{r ex-7qwhu-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-4EGWf-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-f4koA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-HSgMf-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -267,7 +269,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-kAAyQ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-BKUZJ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -277,7 +279,7 @@ find them and it returns an error. The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-cYSCq-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-zbMt9-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -287,7 +289,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-3RJPl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-oQfOX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -298,7 +300,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-NeQlK-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-fn20s-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -306,7 +308,7 @@ x[[2]] Lists can also contain vectors and other lists. 
-```{r ex-ZvztJ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-ws3nw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -318,7 +320,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-w7rur-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-TuTmm-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -329,7 +331,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. -```{r ex-w74nJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-SWmKd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -342,7 +344,7 @@ The output above shows a table with the vector variable names as column names, a create a data frame where the vectors are not all the same length, you will see the error shown below. -```{r ex-ExWhT-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-kWXeH-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -463,7 +465,7 @@ v <- c(1, 2, 3, 4, 5) ### Exercise 4 -Create a list `l` that contains a number (e.g., 5), a string (e.g., 'apple'), and a logical value (e.g., TRUE). +Create a list `l` that contains the number 5, the string 'apple', and the logical value `TRUE`.
Click for Hint diff --git a/docs/2-Functions-and-Importing-Data/readme.md b/docs/2-Functions-and-Importing-Data/readme.md index 200fbc9..098eedd 100644 --- a/docs/2-Functions-and-Importing-Data/readme.md +++ b/docs/2-Functions-and-Importing-Data/readme.md @@ -41,7 +41,7 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. -```{r ex-pTqZj-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-eLRXQ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) @@ -50,22 +50,22 @@ mean(x) As you would expect, R has many built-in math functions. Below are a series of examples. 
-```{r ex-yND37-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-Q8bC4-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-jU7Am-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-ZaVwW-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-ZQ32x-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-IKBqr-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-ebfqc-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-CUuAz-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -76,7 +76,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. -```{r ex-pAyr5-6, eval = FALSE} +```{r ex-C7zVQ-6, eval = FALSE} ?mean() ``` @@ -92,17 +92,17 @@ On the help page, under `Usage`, you see `mean(x, ...)`. This means that the onl Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. 
-```{r ex-vmvgZ-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-E9a5s-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-OvuGT-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-tOKoE-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-HlH14-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-I6qbQ-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` @@ -117,7 +117,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-SYHY6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-K4ObJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -127,7 +127,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-MJ7Pe-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-q812b-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -137,7 +137,7 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector. -```{r ex-LzDZL-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +```{r ex-hfNKR-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} max(c(6, 9, 3, 11, -2)) ``` @@ -147,7 +147,7 @@ max(c(6, 9, 3, 11, -2)) Create a numeric vector with a certain sequence. 
The example below creates a vector of integers from 1 to 5. -```{r ex-jl489-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-9JUH0-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -155,7 +155,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-ynkCB-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-0u96f-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -165,7 +165,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-OvWyo-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-mPcrl-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -175,7 +175,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-Eo96b-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-Pm6eO-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" @@ -191,7 +191,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. 
If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-KU9pI-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-eHDmM-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -206,14 +206,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-dRWxJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-T8pZq-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-akXzu-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-Kj09K-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -230,7 +230,7 @@ Most of the statistical summary functions in R have the argument `na.rm`. This s For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. 
-```{r ex-bMl16-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-9R5U8-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -239,7 +239,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-6qias-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-zcdjA-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -247,7 +247,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-fNT1X-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-7SSID-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -269,7 +269,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-xPpYc-1, error = TRUE} +```{r ex-qhfaw-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -291,12 +291,12 @@ A window will pop up. Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. 
-```{r ex-GsKlT-2, message = FALSE} +```{r ex-iDDVR-2, message = FALSE} library(EnvStats) ``` -```{r ex-0rqDo-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} +```{r ex-k8gbf-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -318,7 +318,7 @@ R can import data from just about any format, including CSV, Excel, Databases, G R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-a5NTs-1, eval = FALSE} +```{r ex-CSl8K-1, eval = FALSE} getwd() ``` @@ -326,7 +326,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. -```{r ex-Wu9LI-2, eval = FALSE} +```{r ex-Ibwgs-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -337,7 +337,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-Tus5Q-1, eval = FALSE} +```{r ex-3ilbk-1, eval = FALSE} install.packages("readxl") ``` @@ -353,7 +353,7 @@ library(readxl) Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). 
Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-SL0nd-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-ZKauj-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) diff --git a/docs/3-Subsetting-Sorting-and-Combining/readme.md b/docs/3-Subsetting-Sorting-and-Combining/readme.md index 9970e98..c3d78e1 100644 --- a/docs/3-Subsetting-Sorting-and-Combining/readme.md +++ b/docs/3-Subsetting-Sorting-and-Combining/readme.md @@ -31,7 +31,7 @@ The example data for exercises in this lesson is available directly from this pa To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. -```{r ex-V0UKu-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-2Gbuo-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +42,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. -```{r ex-m65s2-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-KBDaW-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +55,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. 
-```{r ex-jS5BI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-cXEro-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,7 +64,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. -```{r ex-eYeSa-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} +```{r ex-jSptn-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} data(chicago_air) head(chicago_air, n = 3) @@ -73,7 +73,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. -```{r ex-iNyBl-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} +```{r ex-DbIdT-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} data(chicago_air) tail(chicago_air) @@ -82,7 +82,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. -```{r ex-ztSGB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} +```{r ex-YNSwn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} data(chicago_air) View(chicago_air) @@ -93,7 +93,7 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in from the name of the data frame variable in the console. 
-```{r ex-yaDLj-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-quVoO-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` @@ -119,7 +119,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-ZlKMC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-rUpwS-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +130,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-tZkMp-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-lfswa-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +138,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. -```{r ex-TeTjy-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-w3DNF-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +146,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. 
-```{r ex-aUPaG-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-SXN7m-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +155,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-ieIcZ-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-W7JCa-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -163,7 +163,7 @@ x[5] Now that we understand indexing we can subset the `chicago_air` data frame by using the brackets `[ , ]` function. (This is a rare example of a function in R that does not have the form `function_name()`.) -```{r ex-vzNTE-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +```{r ex-gz6TG-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -172,7 +172,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector access the 1st, 2nd, and 5th rows of the data frame. -```{r ex-eCOIn-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} +```{r ex-bziTN-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} data(chicago_air) chicago_air[c(1, 2, 5), ] @@ -181,7 +181,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. By leaving the row value blank, you are telling it to return all rows associated with column 1. Below, we wrap the output in the `head()` function to limit the number of rows printed. 
-```{r ex-HpAgp-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} +```{r ex-b9oJu-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} data(chicago_air) head(chicago_air[, 1]) @@ -193,7 +193,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-Vdbe5-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} +```{r ex-j9Z0m-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} data(chicago_air) head(chicago_air[, c(3, 4, 6)]) @@ -205,7 +205,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-5imUG-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} +```{r ex-k1uUZ-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} data(chicago_air) head(chicago_air[, "solar"]) @@ -214,7 +214,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. -```{r ex-9HNMT-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} +```{r ex-8qN6e-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} data(chicago_air) head(chicago_air[, c("ozone", "temp", "month")]) @@ -223,7 +223,7 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. 
-```{r ex-zVLsd-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-lI3Jg-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` @@ -233,7 +233,7 @@ chicago_air[1:5, c("temp", "solar")] In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we would use `chicago_air$temp`. -```{r ex-xxkpq-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-NnbKI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -267,12 +267,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. -```{r ex-KUVEy-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-1sHEg-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-sW9E7-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-kO2Er-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -280,7 +280,7 @@ The result of a logical expression is a logical data type, a boolean value `TRUE Vectors can also be used in a logical expression. A vector of values on the left hand side of a logical operator will return a vector of the same length with boolean values. 
-```{r ex-rGePa-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +```{r ex-rODK2-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -291,7 +291,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-q6eZv-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-gHaZ4-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -302,7 +302,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-jUWa9-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-x978o-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -313,7 +313,7 @@ A data frame is returned. The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "color" column as a vector using the `$` operator. Then we create a logical vector using a logical expression. -```{r ex-Vgi5s-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-hYfZH-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -327,7 +327,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow. 
-```{r ex-V1e3J-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-iiUPn-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -335,7 +335,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-Q5XvW-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-ybdNp-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -351,7 +351,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. -```{r ex-b0WgF-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-TJFSC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -367,7 +367,7 @@ The benefit of using `filter()` is that it works the way other functions in R ty If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-hY8PL-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-O4IYp-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -378,7 +378,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. 
-```{r ex-lw1Jw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-hXWfn-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -389,7 +389,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma. -```{r ex-oaDbu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-Kp4Gx-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -405,7 +405,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. The default is ascending order. -```{r ex-ebGDs-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-OnGT8-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -419,7 +419,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-0dILl-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-dQB5v-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -430,7 +430,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. 
-```{r ex-X4ytE-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-0nc9T-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -446,7 +446,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. -```{r ex-jPLYk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-ZoezH-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -458,7 +458,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-A3T3G-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-jhgMN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -474,7 +474,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. 
-```{r ex-HLCSY-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-iKU5j-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -482,7 +482,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. -```{r ex-Wn3Wu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-7dCVE-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) diff --git a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md index 7fb433d..16255f2 100644 --- a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md +++ b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md @@ -31,7 +31,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". -```{r ex-tCCqN-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-CTluV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -48,7 +48,7 @@ the thing that's saved is not a data object but lines of R code. 
To save your own function, use this construction: -```{r ex-u9JJY-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-LXwfs-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -61,7 +61,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. -```{r ex-9UHUQ-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-oJcqC-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -70,7 +70,7 @@ print_hello <- function() { ``` -```{r ex-HvwPT-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-fkz5A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -86,7 +86,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. -```{r ex-Xwz1u-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-esx9e-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -97,7 +97,7 @@ print_hello <- function(text) { ``` -```{r ex-EoMGY-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-XsG8h-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -110,7 +110,7 @@ a criteria pollutant standard. We could make simple function that takes two argu one for the measurement value, and one for the standard value. 
-```{r ex-WfnOi-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +```{r ex-LGi3P-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} check_standard <- function(measurement, standard) { measurement > standard @@ -119,7 +119,7 @@ check_standard <- function(measurement, standard) { ``` -```{r ex-6z3K3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} +```{r ex-NbLk3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} check_standard(measurement = 84, standard = 70) ``` @@ -133,7 +133,7 @@ then flexibility to use a different value. To set a default value, we use `= 70` when we create the function. -```{r ex-RDVGD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +```{r ex-f8iTW-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} check_standard <- function(measurement, standard = 70) { measurement > standard @@ -142,7 +142,7 @@ check_standard <- function(measurement, standard = 70) { ``` -```{r ex-1dri8-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} +```{r ex-QMHje-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} check_standard(measurement = 50) ``` @@ -163,12 +163,12 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-yRNTd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +```{r ex-LLJD7-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} check_standard(60, 70) ``` -```{r ex-uSbY3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +```{r ex-i1zJN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} check_standard(70, 60) ``` @@ -185,7 +185,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-JqqhA-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-TcrFf-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -194,7 +194,7 @@ if(logical expression>) { ``` -```{r ex-IR16v-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-zgNbe-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -207,7 +207,7 @@ if() { ``` -```{r ex-zeNFr-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-b9XAw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -222,7 +222,7 @@ if(ozone > 0.065) { ``` -```{r ex-KlrX8-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-P4dii-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -237,12 +237,12 @@ if(ozone > 0.065) { ``` -```{r ex-IxhE3-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-SWlRF-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-8LMYz-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} +```{r 
ex-t7oAn-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -261,7 +261,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-zfyff-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-L8axJ-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -270,7 +270,7 @@ for(i in sequence){ ``` -```{r ex-ERYIB-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-ybMKy-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -288,7 +288,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-Tc6UH-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-icnkn-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -297,7 +297,7 @@ monitors ``` -```{r ex-cObbo-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +```{r ex-WNnr2-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -328,7 +328,7 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. 
-```{r ex-G0Hie-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-pr0eu-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max diff --git a/docs/5-Plotting/readme.md b/docs/5-Plotting/readme.md index f34f77b..71aa968 100644 --- a/docs/5-Plotting/readme.md +++ b/docs/5-Plotting/readme.md @@ -28,7 +28,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-VHjmi-1, eval = FALSE} +```{r ex-jjPLB-1, eval = FALSE} install.packages("ggplot2") ``` @@ -36,7 +36,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. -```{r ex-6xdwP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-wgpUZ-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -48,7 +48,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. -```{r ex-kloVP-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-oYDPX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -61,7 +61,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. -```{r ex-yI1m8-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-QIAYA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -70,7 +70,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. 
-```{r ex-8t8ki-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-uzSQI-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -78,7 +78,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-gBt3L-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-t8iMQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -88,7 +88,7 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. -```{r ex-UC8gH-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-jdAhG-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, @@ -107,7 +107,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. -```{r ex-AfjSo-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-yKUas-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -117,7 +117,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. -```{r ex-GKiyT-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-Z3Z3l-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -134,7 +134,7 @@ argument. 
If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-LQRTa-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-pMLrR-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -157,7 +157,7 @@ column in the data frame is being plotted, and which column is used to do the grouping. We can make a plot of ozone by month using the `chicago_air` data frame. -```{r ex-wASHb-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} +```{r ex-xOp8M-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -183,7 +183,7 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. -```{r ex-wWAYi-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-ii28k-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` @@ -195,7 +195,7 @@ is short for aesthetic. The primary arguments in the `aes( )` function are `x` and `y`. These determine which column from the data frame is used on the x and y axes. -```{r ex-GZ5TT-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-EvIUn-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -206,14 +206,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. -```{r ex-A9uPW-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-9SCpO-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. 
Customize it by adding color, title, and labels. -```{r ex-4gRln-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-TfF6b-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -228,7 +228,7 @@ on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-obXRx-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-2SiKx-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -247,7 +247,7 @@ To create a line plot of ozone over time, we use the `as.Date()` function on the column and replace the `geom_point( )` function with the `geom_line( )` function. -```{r ex-nraMo-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-F1gzi-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -257,7 +257,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. -```{r ex-iWQ2u-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-7i93G-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -266,7 +266,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. 
-```{r ex-9VkHH-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-jOxh0-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -275,7 +275,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-WhQ7D-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-I5X47-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -291,7 +291,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-ktjSQ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-1LcZl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -307,7 +307,7 @@ represented as a shaded area. Below, the argument `method` is given the value `lm` which stands for a linear model. -```{r ex-OJjWb-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-1gxvv-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -319,7 +319,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. 
-```{r ex-X0bua-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-sp9SY-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -337,7 +337,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-o54Qy-1, eval = FALSE} +```{r ex-SsPB0-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) diff --git a/docs/6-Basic-Statistics/readme.md b/docs/6-Basic-Statistics/readme.md index 59edde7..81ab583 100644 --- a/docs/6-Basic-Statistics/readme.md +++ b/docs/6-Basic-Statistics/readme.md @@ -26,7 +26,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio Statistical functions are used in this lesson that require installation of the following packages. -```{r ex-kEu82-1, eval = FALSE} +```{r ex-mrYgt-1, eval = FALSE} install.packages("envstats") ``` @@ -40,7 +40,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. -```{r ex-sZibE-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-pgJHO-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -53,17 +53,17 @@ These functions let us know the range of the data values, i.e., the highest and lowest values. 
-```{r ex-QIVup-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-hpFYT-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-6Ct7O-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-y5aaF-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-ulsae-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-PX97c-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -71,7 +71,7 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. -```{r ex-RGwdi-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-nqZNH-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` @@ -81,7 +81,7 @@ the spread is for the values in the central range of the distribution, i.e., bet the 1st quartile and the 3rd quartile. -```{r ex-tWmQ7-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-iaLj1-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` @@ -91,7 +91,7 @@ of the box itself shows the first and third quartile, while the line in the midd of the box shows the median. -```{r ex-kMf08-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-BwSRl-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -101,12 +101,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. -```{r ex-jHChv-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-SppiP-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-gpKV1-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-y80m5-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -115,12 +115,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. 
-```{r ex-LtqaD-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-gFKd7-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-DBhEl-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-8sPJi-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -133,7 +133,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-6PxiC-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-v3dCg-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -143,7 +143,7 @@ difference in concentrations. Below is a plot of those two groups side by side. -```{r ex-0vyqv-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-jD3VH-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -154,7 +154,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. -```{r ex-qKmkV-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-0b2AA-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -168,14 +168,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. 
-```{r ex-fmFSx-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-duUuX-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-0MQAR-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-qu9LC-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -192,7 +192,7 @@ datasets are from the same distribution or not. The assumption, or null hypothes is that they are, in fact, mean values from the same distribution. -```{r ex-1KzfZ-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-XBtU4-6, exercise = TRUE, exercise.cap = "Student's t-test between two groups"} t.test(group1_data$value, group2_data$value) ``` @@ -217,7 +217,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. -```{r ex-zhTAo-1, eval = FALSE} +```{r ex-F9Tl0-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -236,7 +236,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations. -```{r ex-4sCkU-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-pWrru-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -253,7 +253,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-I2kAN-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-unIry-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -267,7 +267,7 @@ do not reject the null hypothesis.
We conclude there is no correlation between these two variables. -```{r ex-PgGD2-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-5rmSI-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -278,7 +278,7 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. -```{r ex-YxGjd-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-MeoBl-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` diff --git a/docs/7-Quality-Assurance/readme.md b/docs/7-Quality-Assurance/readme.md index cbfb91d..81a4cd5 100644 --- a/docs/7-Quality-Assurance/readme.md +++ b/docs/7-Quality-Assurance/readme.md @@ -21,7 +21,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-eepns-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-mLauk-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -34,7 +34,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. 
-```{r ex-uGfyE-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-8Z93E-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -42,7 +42,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. -```{r ex-pYBYE-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-UHgmY-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -56,7 +56,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. -```{r ex-jeoAo-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-j5i0t-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -72,7 +72,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-IvYci-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-iUtMe-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -86,7 +86,7 @@ values Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. 
-```{r ex-Q1tvb-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +```{r ex-ZvmnZ-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} # Example of identifying and handling outliers data("example_dataset") boxplot(example_dataset$value) @@ -98,7 +98,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. -```{r ex-ps7vH-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-ME5GN-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") diff --git a/inst/tutorials/1-Introduction/lesson.Rmd b/inst/tutorials/1-Introduction/lesson.Rmd index c14245d..81983eb 100644 --- a/inst/tutorials/1-Introduction/lesson.Rmd +++ b/inst/tutorials/1-Introduction/lesson.Rmd @@ -47,7 +47,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. -```{r ex-gspio-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-MTiVW-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -74,17 +74,17 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. 
-```{r ex-LkNe4-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-ebpxn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-7QN0K-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-ufpL9-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` -## Note on Comments and Code Blocks +## Comments and Code Blocks To write a comment in your script that will not be evaluated, type `#` in front of your comment. The text after `#` will not be evaluated. There is no multi-line commenting in R, so every comment line must begin with the `#` character. @@ -93,17 +93,17 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). -```{r ex-N32ao-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-cyWBd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment ``` -In the example above and the previous section, you have seen the R code and its output. The code blocks with output look like this, with `1+1` being +In the example above and the previous section, you can see the R code and its output. 
The code blocks with output look like this, with `1+1` being the R code and `## [1] 2` being the output: -```{r ex-pot7N-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Code and Output Example'} +```{r ex-byZ3W-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -111,8 +111,9 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-fensd-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Console Code Execution'} -1 + 1 +```{r ex-9gHIa-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +> 1 + 1 +[1] 2 ``` @@ -122,7 +123,7 @@ A variable is a letter or combination of alphanumeric characters that is used to with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. -```{r ex-V9kEg-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-mHvvD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -137,12 +138,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value. -```{r ex-G9E1a-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-kUDcq-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-cr3rg-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-5d8sV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -159,9 +160,9 @@ There are 3 important rules to remember when creating variable names: Try running the following code and you will see that in your global environment there are two different objects listed. 
-```{r ex-2mgf1-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} -x <- 5 -X <- 5 +```{r ex-Vw1tV-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +y <- 5 +Y <- 10 ``` @@ -176,13 +177,14 @@ R has three main data types: | numeric | numbers | `1`, `3.14`, `log(10)` | | logical | binary | `TRUE`, `FALSE` | -The `character` type requires single or double quotes. The logical values -`TRUE` and `FALSE` should not be quoted and require full caps. +The `character` type requires single or double quotes. The numeric type +must be unquoted numbers, and the full-caps logical values `TRUE` and +`FALSE` must also be unquoted. ## Grouping Data -There are several ways to group data to make them easier to work with: +There are several ways to store groups of data to make them easier to work with: ### Vectors @@ -190,13 +192,13 @@ There are several ways to group data to make them easier to work with: A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. -```{r ex-ZfVqE-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-axso2-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-HzYA5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-ouMaS-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -206,7 +208,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-gNcFG-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-4S8mo-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -216,7 +218,7 @@ find them and it returns an error. 
The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-wLVHd-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-CqAJ8-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -226,7 +228,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-g7gro-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-1xLzn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -237,7 +239,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-nOMMc-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-nhD28-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -245,7 +247,7 @@ x[[2]] Lists can also contain vectors and other lists. -```{r ex-g5Yed-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-9qNwR-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -257,7 +259,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-E83T7-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-4m4IT-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -268,7 +270,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. 
-```{r ex-y4an5-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-MaAmV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -281,7 +283,7 @@ The output above shows a table with the vector variable names as column names, a create a data frame where the vectors are not all the same length, you will see the error shown below. -```{r ex-u7NB0-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-kt2Ak-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -386,7 +388,7 @@ grade_this_code( ### Exercise 4 -Create a list `l` that contains a number (e.g., 5), a string (e.g., 'apple'), and a logical value (e.g., TRUE). +Create a list `l` that contains the number 5, the string 'apple', and the logical value `TRUE`. ```{r exercise4, exercise = TRUE} # Your code here diff --git a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd index c533749..5cc4371 100644 --- a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd +++ b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd @@ -33,7 +33,7 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. 
-```{r ex-cmUTC-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-zjABO-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) @@ -42,22 +42,22 @@ mean(x) As you would expect, R has many built-in math functions. Below are a series of examples. -```{r ex-zaJxk-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-tk9tQ-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-5GbKn-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-q55TG-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-HabsM-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-LgzHK-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-wTInz-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-Ppiy7-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -68,7 +68,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. -```{r ex-aVvdw-6, eval = FALSE} +```{r ex-akxtC-6, eval = FALSE} ?mean() ``` @@ -84,17 +84,17 @@ On the help page, under `Usage`, you see `mean(x, ...)`. 
This means that the onl Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform the calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. -```{r ex-HNwrA-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-rhZdj-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-cSpAU-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-8j1Yx-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-H0L8k-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-d2mMB-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` @@ -109,7 +109,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-E6iwI-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-K0Mih-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -119,7 +119,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-Us8lI-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-NQbSf-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -129,7 +129,7 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector.
-```{r ex-RszAt-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +```{r ex-z2NwD-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} max(c(6, 9, 3, 11, -2)) ``` @@ -139,7 +139,7 @@ max(c(6, 9, 3, 11, -2)) Create a numeric vector with a certain sequence. The example below creates a vector of integers from 1 to 5. -```{r ex-U8l4J-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-kuKw1-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -147,7 +147,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-agVOw-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-yzJw9-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -157,7 +157,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-vVrx6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-zKlrp-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -167,7 +167,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-Oro96-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-CLJeb-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" 
@@ -183,7 +183,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-0nLU3-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-AJsza-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -198,14 +198,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-vtTfG-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-v4Oam-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-nvg8w-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-bbQSh-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -222,7 +222,7 @@ Most of the statistical summary functions in R have the argument `na.rm`. This s For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). 
If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. -```{r ex-pk6V5-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-uOhJx-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -231,7 +231,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-C5qdu-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-n8XjO-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -239,7 +239,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-RrdJ8-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-ERw6X-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -261,7 +261,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-PBrNS-1, error = TRUE} +```{r ex-g63qu-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -283,12 +283,12 @@ A window will pop up. Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. 
-```{r ex-rywe1-2, message = FALSE} +```{r ex-B7VCB-2, message = FALSE} library(EnvStats) ``` -```{r ex-1bVTU-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} +```{r ex-gZ7KC-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -310,7 +310,7 @@ R can import data from just about any format, including CSV, Excel, Databases, G R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-qLsu5-1, eval = FALSE} +```{r ex-GCoXE-1, eval = FALSE} getwd() ``` @@ -318,7 +318,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. -```{r ex-A2dCy-2, eval = FALSE} +```{r ex-2Fbtm-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -329,7 +329,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-RqOGT-1, eval = FALSE} +```{r ex-bCVPT-1, eval = FALSE} install.packages("readxl") ``` @@ -345,7 +345,7 @@ library(readxl) Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). 
Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-nADMe-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-QWZFw-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) diff --git a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd index 774907f..6b91380 100644 --- a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd +++ b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd @@ -31,7 +31,7 @@ The example data for exercises in this lesson is available directly from this pa To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. -```{r ex-MwffZ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-6wiK1-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +42,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. -```{r ex-HEi1w-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-zhXCH-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +55,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. 
-```{r ex-g7dKe-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-KnlRD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,7 +64,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. -```{r ex-u8wVK-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} +```{r ex-5LDbm-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} data(chicago_air) head(chicago_air, n = 3) @@ -73,7 +73,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. -```{r ex-nhtel-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} +```{r ex-1qAUt-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} data(chicago_air) tail(chicago_air) @@ -82,7 +82,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. -```{r ex-v41GA-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} +```{r ex-GCv7f-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} data(chicago_air) View(chicago_air) @@ -93,7 +93,7 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in front of the name of the data frame variable in the console.
-```{r ex-qMoyo-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-JDWqG-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` @@ -119,7 +119,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-eaGnM-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-Vhe21-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +130,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-CDdj4-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-c26CI-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +138,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. -```{r ex-1LtUp-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-bdSdp-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +146,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. 
-```{r ex-Pop2I-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-54xbn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +155,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-AFssX-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-lzp3w-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -163,7 +163,7 @@ x[5] Now that we understand indexing we can subset the `chicago_air` data frame by using the brackets `[ , ]` function. (This is a rare example of a function in R that does not have the form `function_name()`.) -```{r ex-TeYW3-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +```{r ex-dvaSW-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -172,7 +172,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector access the 1st, 2nd, and 5th rows of the data frame. -```{r ex-ZXIMO-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} +```{r ex-m0BhJ-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} data(chicago_air) chicago_air[c(1, 2, 5), ] @@ -181,7 +181,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. By leaving the row value blank, you are telling it to return all rows associated with column 1. Below, we wrap the output in the `head()` function to limit the number of rows printed. 
-```{r ex-uhfbH-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} +```{r ex-g7A32-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} data(chicago_air) head(chicago_air[, 1]) @@ -193,7 +193,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-3JZfl-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} +```{r ex-fMpFj-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} data(chicago_air) head(chicago_air[, c(3, 4, 6)]) @@ -205,7 +205,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-eeM4R-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} +```{r ex-kzV6j-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} data(chicago_air) head(chicago_air[, "solar"]) @@ -214,7 +214,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. -```{r ex-Skj8x-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} +```{r ex-oGEmr-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} data(chicago_air) head(chicago_air[, c("ozone", "temp", "month")]) @@ -223,7 +223,7 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. 
-```{r ex-7gN6S-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-XFSJ4-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` @@ -233,7 +233,7 @@ chicago_air[1:5, c("temp", "solar")] In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we would use `chicago_air$temp`. -```{r ex-EXVkv-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-6xvxH-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -267,12 +267,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. -```{r ex-HKRuI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-5wAI4-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-DLGbi-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-ejfud-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -280,7 +280,7 @@ The result of a logical expression is a logical data type, a boolean value `TRUE Vectors can also be used in a logical expression. A vector of values on the left hand side of a logical operator will return a vector of the same length with boolean values. 
-```{r ex-VzHeJ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +```{r ex-DlaPL-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -291,7 +291,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-gluSH-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-21WYI-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -302,7 +302,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-9Txmz-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-sEk2e-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -313,7 +313,7 @@ A data frame is returned. The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "color" column as a vector using the `$` operator. Then we create a logical vector using a logical expression. -```{r ex-Mff6L-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-MnvL1-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -327,7 +327,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow. 
-```{r ex-YlPvi-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-a5y6T-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -335,7 +335,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-dEH8E-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-2uYLb-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -351,7 +351,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. -```{r ex-CtUY3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-lbWOB-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -367,7 +367,7 @@ The benefit of using `filter()` is that it works the way other functions in R ty If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-zjHQr-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-5JvvD-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -378,7 +378,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. 
-```{r ex-3rCJQ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-ZTpqB-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -389,7 +389,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma. -```{r ex-03OUB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-4F3Dj-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -405,7 +405,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. The default is ascending order. -```{r ex-Idjbr-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-gBIfC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -419,7 +419,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-oRZ8b-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-BHhgd-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -430,7 +430,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. 
-```{r ex-s0Nus-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-S7YuE-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -446,7 +446,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. -```{r ex-4pCHD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-iN7cR-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -458,7 +458,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-vu7bj-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-bFcau-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -474,7 +474,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. 
-```{r ex-v7jyI-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-C466c-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -482,7 +482,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. -```{r ex-XcZR8-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-ZxDYZ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) diff --git a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd index fe05942..c74ede1 100644 --- a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd +++ b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd @@ -32,7 +32,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". -```{r ex-uqM3N-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-TzN8D-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -49,7 +49,7 @@ the thing that's saved is not a data object but lines of R code. 
To save your own function, use this construction: -```{r ex-0KxG9-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-uKJbS-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -62,7 +62,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. -```{r ex-qvWQW-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-hTa1P-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -71,7 +71,7 @@ print_hello <- function() { ``` -```{r ex-7YnpO-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-GyDJs-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -87,7 +87,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. -```{r ex-vn8sm-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-TD71X-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -98,7 +98,7 @@ print_hello <- function(text) { ``` -```{r ex-r0twA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-pzP6b-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -111,7 +111,7 @@ a criteria pollutant standard. We could make simple function that takes two argu one for the measurement value, and one for the standard value. 
-```{r ex-HmsJT-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +```{r ex-dB1MP-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} check_standard <- function(measurement, standard) { measurement > standard @@ -120,7 +120,7 @@ check_standard <- function(measurement, standard) { ``` -```{r ex-53CNb-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} +```{r ex-kpphy-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} check_standard(measurement = 84, standard = 70) ``` @@ -134,7 +134,7 @@ then flexibility to use a different value. To set a default value, we use `= 70` when we create the function. -```{r ex-iWazW-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +```{r ex-VGkJm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} check_standard <- function(measurement, standard = 70) { measurement > standard @@ -143,7 +143,7 @@ check_standard <- function(measurement, standard = 70) { ``` -```{r ex-bCrRC-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} +```{r ex-wET6k-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} check_standard(measurement = 50) ``` @@ -164,12 +164,12 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-ihRr3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +```{r ex-Azdwt-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} check_standard(60, 70) ``` -```{r ex-jSEHz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +```{r ex-CS7VP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} check_standard(70, 60) ``` @@ -186,7 +186,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-UsrG4-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-w4ftE-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -195,7 +195,7 @@ if(logical expression>) { ``` -```{r ex-XKaWg-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-7ZGO9-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -208,7 +208,7 @@ if() { ``` -```{r ex-7jJXi-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-BaGTG-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -223,7 +223,7 @@ if(ozone > 0.065) { ``` -```{r ex-PqKQb-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-cYEUK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -238,12 +238,12 @@ if(ozone > 0.065) { ``` -```{r ex-bMcX0-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-H30oE-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-fisPk-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} +```{r 
ex-kz8FH-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -262,7 +262,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-4kY8A-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-nMEPX-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -271,7 +271,7 @@ for(i in sequence){ ``` -```{r ex-FNkLh-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-5Xu1i-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -289,7 +289,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-aDbOs-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-nq2hb-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -298,7 +298,7 @@ monitors ``` -```{r ex-krXCI-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +```{r ex-Ga6ae-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -329,7 +329,7 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. 
-```{r ex-6BUHA-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-qOJGn-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max diff --git a/inst/tutorials/5-Plotting/lesson.Rmd b/inst/tutorials/5-Plotting/lesson.Rmd index 3dfcd5b..efc6368 100644 --- a/inst/tutorials/5-Plotting/lesson.Rmd +++ b/inst/tutorials/5-Plotting/lesson.Rmd @@ -29,7 +29,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-Fwg8X-1, eval = FALSE} +```{r ex-8sfdI-1, eval = FALSE} install.packages("ggplot2") ``` @@ -37,7 +37,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. -```{r ex-gB1KA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-aXr0c-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -49,7 +49,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. -```{r ex-A2RBs-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-VoMZI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -62,7 +62,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. -```{r ex-rgEB7-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-2ZeNC-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -71,7 +71,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. 
-```{r ex-KSvM5-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-x6rUw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -79,7 +79,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-d6zoh-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-qobUB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -89,7 +89,7 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. -```{r ex-aVnUD-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-IzNJV-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, @@ -108,7 +108,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. -```{r ex-6siGd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-GRaK9-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -118,7 +118,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. -```{r ex-pRpic-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-RO2ag-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -135,7 +135,7 @@ argument. 
If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-e9R4i-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-oqO8i-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -158,7 +158,7 @@ column in the data frame is being plotted, and which column is used to do the grouping. We can make a plot of ozone by month using the `chicago_air` data frame. -```{r ex-B2wt0-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} +```{r ex-C9rP6-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -184,7 +184,7 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. -```{r ex-rxsKI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-y5bMX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` @@ -196,7 +196,7 @@ is short for aesthetic. The primary arguments in the `aes( )` function are `x` and `y`. These determine which column from the data frame is used on the x and y axes. -```{r ex-UCuu5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-rZGGh-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -207,14 +207,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. -```{r ex-sUCGG-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-EMp9A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. 
Customize it by adding color, title, and labels. -```{r ex-RN8sI-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-pUDJf-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -229,7 +229,7 @@ on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-IJnw2-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-HFIjR-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -248,7 +248,7 @@ To create a line plot of ozone over time, we use the `as.Date()` function on the column and replace the `geom_point( )` function with the `geom_line( )` function. -```{r ex-zoGks-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-kqYgY-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -258,7 +258,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. -```{r ex-6VLkg-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-ryF4b-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -267,7 +267,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. 
-```{r ex-f3W71-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-PJz74-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -276,7 +276,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-M0JuT-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-aeCuC-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -292,7 +292,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-MoOh5-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-0Fva3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -308,7 +308,7 @@ represented as a shaded area. Below, the argument `method` is given the value `lm` which stands for a linear model. -```{r ex-uHhcH-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-fRNCC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -320,7 +320,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. 
-```{r ex-AksR0-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-njElP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -338,7 +338,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-IKuqR-1, eval = FALSE} +```{r ex-dlAjr-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) diff --git a/inst/tutorials/6-Basic-Statistics/lesson.Rmd b/inst/tutorials/6-Basic-Statistics/lesson.Rmd index d0e26e3..8083137 100644 --- a/inst/tutorials/6-Basic-Statistics/lesson.Rmd +++ b/inst/tutorials/6-Basic-Statistics/lesson.Rmd @@ -29,7 +29,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio Statistical functions are used in this lesson that require installation of the following packages. -```{r ex-G6vAl-1, eval = FALSE} +```{r ex-oiHQs-1, eval = FALSE} install.packages("envstats") ``` @@ -43,7 +43,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. -```{r ex-mDjei-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-EAgK4-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -56,17 +56,17 @@ These functions let us know the range of the data values, i.e., the highest and lowest values. 
-```{r ex-O3Ass-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-OvzXh-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-19cGC-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-S5tF6-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-tx343-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-fZq9J-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -74,7 +74,7 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. -```{r ex-R8OKn-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-WlGqt-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` @@ -84,7 +84,7 @@ the spread is for the values in the central range of the distribution, i.e., bet the 1st quartile and the 3rd quartile. -```{r ex-tRmi4-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-MrRFk-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` @@ -94,7 +94,7 @@ of the box itself shows the first and third quartile, while the line in the midd of the box shows the median. -```{r ex-QmJoh-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-Zl2MW-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -104,12 +104,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. -```{r ex-GT6qd-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-xtlnV-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-HKOuN-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-FGd5i-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -118,12 +118,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. 
-```{r ex-RjYGl-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-Gymng-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-LmY7H-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-ywSV1-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -136,7 +136,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-ZG8No-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-IZZoZ-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -146,7 +146,7 @@ difference in concentrations. Below is a plot of those two groups side by side. -```{r ex-Ok1Il-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-jRYZK-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -157,7 +157,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. -```{r ex-Il2v8-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-Newy0-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -171,14 +171,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. 
-```{r ex-ify6Z-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-FAgan-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-UHZGF-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-4JfUa-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -195,7 +195,7 @@ datasets are from the same distribution or not. The assumption, or null hypothes is that they are, in fact, mean values from the same distribution. -```{r ex-29Rah-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-4ZJf3-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} t.test(group1_data$value, group2_data$value) ``` @@ -220,7 +220,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. -```{r ex-RK0J7-1, eval = FALSE} +```{r ex-eyXT2-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -239,7 +239,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations. -```{r ex-AnIql-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-RvBN0-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -256,7 +256,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-E5Mem-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-33Nto-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -270,7 +270,7 @@ do not reject the null hypothesis. 
We conclude there is no correlation between these two variables. -```{r ex-uy8vN-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-uUpmT-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -281,7 +281,7 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. -```{r ex-w5c7c-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-lt9er-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` diff --git a/inst/tutorials/7-Quality-Assurance/lesson.Rmd b/inst/tutorials/7-Quality-Assurance/lesson.Rmd index b68eef4..c2558ec 100644 --- a/inst/tutorials/7-Quality-Assurance/lesson.Rmd +++ b/inst/tutorials/7-Quality-Assurance/lesson.Rmd @@ -27,7 +27,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-W3BBr-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-vhRJG-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -40,7 +40,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. 
-```{r ex-KAhqJ-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-n1jfS-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -48,7 +48,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. -```{r ex-hYZgN-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-k9zoX-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -62,7 +62,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. -```{r ex-xn8A6-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-T2bJJ-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -78,7 +78,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-RFyjT-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-nnd6k-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -92,7 +92,7 @@ values Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. 
-```{r ex-dPr5b-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +```{r ex-u3chC-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} # Example of identifying and handling outliers data("example_dataset") boxplot(example_dataset$value) @@ -104,7 +104,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. -```{r ex-pJCSj-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-Swbgo-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") diff --git a/source/1-Introduction/lesson1.yaml b/source/1-Introduction/lesson1.yaml index 1fc28f6..aa46317 100644 --- a/source/1-Introduction/lesson1.yaml +++ b/source/1-Introduction/lesson1.yaml @@ -43,7 +43,7 @@ content: - type: list bullets: - If you cannot access data easily in your BI tool, R can read just about any data source. - - If you need to save a large number of files, R can automate that process in a way that BI tools cannot. + - If you need to download, save, or otherwise process a large number of files, R can automate those tasks in a way that BI tools cannot. - Custom data transformations that are not possible in BI tools can be done with R. - Custom data visualizations that are not available in BI tools can be done with R. - Predictive modeling that is not available in BI tools, or only in a rudimentary way, can be done in R. @@ -62,7 +62,7 @@ content: system (PC or Mac). Accept the default options during the installation. - type: paragraph content: | - Once you have installed R, you can open the program itself. 
On PC, if you have selected the desktop shortcut during installation, the R icon + Once you have installed R, you can open the program itself. On a PC, if you have selected the desktop shortcut during installation, the R icon will look like this: - type: image src: "./images/r_icon.png" @@ -187,7 +187,7 @@ content: content: | (10 - 3) / 5 - type: section - title: Note on Comments and Code Blocks + title: Comments and Code Blocks content: - type: paragraph content: | @@ -207,13 +207,13 @@ content: 5^2 # partial line comment - type: paragraph content: | - In the example above and the previous section, you have seen the R code and its output. The code blocks with output look like this, with `1+1` being + In the example above and the previous section, you can see the R code and its output. The code blocks with output look like this, with `1+1` being the R code and `## [1] 2` being the output: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + eval: true exercise.cap: "Code and Output Example" content: | 1+1 @@ -223,11 +223,13 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + eval: false + echo: true exercise.cap: "Console Code Execution" content: | - 1 + 1 + > 1 + 1 + [1] 2 - type: section title: Variables content: @@ -282,9 +284,9 @@ content: There are 3 important rules to remember when creating variable names: - type: list numbers: - - You can't start your variable with a number. + - You can't start your variable name with a number. - You can't use spaces or special characters ($,%,#,-). Periods `.` and underscores `_` are ok. - - Capitalization __DOES__ matter in R. That is, R will consider `x` and `X` to be different variables. + - Capitalization __DOES__ matter in R. That is, R will consider `y` and `Y` to be different variables. 
- type: paragraph content: | Try running the following code and you will see that in your global environment there are two different objects listed. @@ -295,8 +297,8 @@ content: exercise.eval: false exercise.cap: "Variable Naming Rules" content: | - x <- 5 - X <- 5 + y <- 5 + Y <- 10 - type: section title: Data Types content: @@ -323,14 +325,15 @@ content: - "`TRUE`, `FALSE`" - type: paragraph content: | - The `character` type requires single or double quotes. The logical values - `TRUE` and `FALSE` should not be quoted and require full caps. + The `character` type requires single or double quotes. The numeric type + must be unquoted numbers, and the full-caps logical values `TRUE` and + `FALSE` must also be unquoted. - type: section title: Grouping Data content: - type: paragraph content: | - There are several ways to group data to make them easier to work with: + There are several ways to store groups of data to make them easier to work with: - type: list bullets: - A __vector__ stores multiple values of the same type (e.g. all numeric values). @@ -511,7 +514,7 @@ exercises: We use the `c()` function in R to combine elements into a vector. To create consecutive integers, you can also use the colon `:` operator, which is a convenient way to create sequences of numbers. code: | v <- c(1, 2, 3, 4, 5) - - instructions: "Create a list `l` that contains a number (e.g., 5), a string (e.g., 'apple'), and a logical value (e.g., TRUE)." + - instructions: "Create a list `l` that contains the number 5, the string 'apple', and the logical value `TRUE`." hints: - "# Use the `list()` function to create a list that can contain elements of different types." - "# To create a list with a number, a string, and a logical value, you can use `list(5, 'apple', TRUE)`." 
From 44dd2646ae550095f5cd12e930ff8a22e22d2c0c Mon Sep 17 00:00:00 2001 From: NateRByers Date: Sun, 28 Apr 2024 11:20:28 -0400 Subject: [PATCH 2/3] lessons 3 through 5 --- docs/1-Introduction/readme.md | 40 ++--- docs/2-Functions-and-Importing-Data/readme.md | 69 +++---- .../readme.md | 116 +++++++----- .../readme.md | 93 ++++++---- docs/5-Plotting/readme.md | 170 ++++++++++++++---- docs/6-Basic-Statistics/readme.md | 46 ++--- docs/7-Quality-Assurance/readme.md | 14 +- inst/tutorials/1-Introduction/lesson.Rmd | 40 ++--- .../2-Functions-and-Importing-Data/lesson.Rmd | 69 +++---- .../lesson.Rmd | 116 +++++++----- .../lesson.Rmd | 91 ++++++---- inst/tutorials/5-Plotting/lesson.Rmd | 161 +++++++++++++---- inst/tutorials/6-Basic-Statistics/lesson.Rmd | 46 ++--- inst/tutorials/7-Quality-Assurance/lesson.Rmd | 14 +- .../lesson2.yaml | 11 +- .../lesson3.yaml | 22 ++- .../lesson4.yaml | 52 ++++-- source/5-Plotting/lesson5.yaml | 94 ++++++++-- 18 files changed, 829 insertions(+), 435 deletions(-) diff --git a/docs/1-Introduction/readme.md b/docs/1-Introduction/readme.md index 093fa37..d035b5e 100644 --- a/docs/1-Introduction/readme.md +++ b/docs/1-Introduction/readme.md @@ -108,7 +108,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. -```{r ex-bXHav-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-utrcI-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -135,12 +135,12 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. 
-```{r ex-oWkbF-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-Cvmbu-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-tpqM5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-IKuQk-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` @@ -154,7 +154,7 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). -```{r ex-vnjMK-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-MNLdZ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment @@ -164,7 +164,7 @@ In the example above and the previous section, you can see the R code and its ou the R code and `## [1] 2` being the output: -```{r ex-ygOvs-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} +```{r ex-4ennA-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -172,7 +172,7 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-sjM7J-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +```{r ex-ngfqc-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} > 1 + 1 [1] 2 @@ -184,7 +184,7 @@ A variable is a letter or combination of alphanumeric characters that is used to with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. 
-```{r ex-6kwt4-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-EK419-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -199,12 +199,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value. -```{r ex-QTvM3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-8QJDo-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-licIL-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-GJ5Vc-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -221,7 +221,7 @@ There are 3 important rules to remember when creating variable names: Try running the following code and you will see that in your global environment there are two different objects listed. -```{r ex-E0eQ3-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +```{r ex-YZ7mK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} y <- 5 Y <- 10 @@ -253,13 +253,13 @@ There are several ways to store groups of data to make them easier to work with: A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. 
-```{r ex-4EGWf-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-5fniF-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-HSgMf-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-OeOzA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -269,7 +269,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-BKUZJ-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-FU0tV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -279,7 +279,7 @@ find them and it returns an error. The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-zbMt9-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-pWuHH-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -289,7 +289,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-oQfOX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-KYPJd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -300,7 +300,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-fn20s-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-O0Eld-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -308,7 +308,7 @@ x[[2]] Lists can also contain vectors and other lists. 
-```{r ex-ws3nw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-LRiBO-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -320,7 +320,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-TuTmm-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-J8HgS-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -331,7 +331,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. -```{r ex-SWmKd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-jnPyU-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -344,7 +344,7 @@ The output above shows a table with the vector variable names as column names, a create a data frame where the vectors are not all the same length, you will see the error shown below. 
-```{r ex-kWXeH-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-hJsDp-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) diff --git a/docs/2-Functions-and-Importing-Data/readme.md b/docs/2-Functions-and-Importing-Data/readme.md index 098eedd..786cfdc 100644 --- a/docs/2-Functions-and-Importing-Data/readme.md +++ b/docs/2-Functions-and-Importing-Data/readme.md @@ -41,31 +41,32 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. -```{r ex-eLRXQ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-ldqeB-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) ``` -As you would expect, R has many built-in math functions. Below are a series of examples. +As you would expect, R has many built-in math functions. Below are a few +examples. 
-```{r ex-Q8bC4-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-LJu2c-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-ZaVwW-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-5gmln-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-IKBqr-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-RMHxy-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-CUuAz-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-JAkxX-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -76,7 +77,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. -```{r ex-C7zVQ-6, eval = FALSE} +```{r ex-yDYm5-6, eval = FALSE} ?mean() ``` @@ -92,17 +93,17 @@ On the help page, under `Usage`, you see `mean(x, ...)`. This means that the onl Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. 
-```{r ex-E9a5s-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-tHLQ7-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-tOKoE-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-ycGBk-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-I6qbQ-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-E1DYK-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` @@ -117,7 +118,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-K4ObJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-4sYL0-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -127,7 +128,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-q812b-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-dZn5n-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -137,8 +138,8 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector. -```{r ex-hfNKR-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} -max(c(6, 9, 3, 11, -2)) +```{r ex-3HUXM-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +max(c(15, 2, 8.3, -10, 21)) ``` @@ -147,7 +148,7 @@ max(c(6, 9, 3, 11, -2)) Create a numeric vector with a certain sequence. 
The example below creates a vector of integers from 1 to 5. -```{r ex-9JUH0-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-2OWF6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -155,7 +156,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-0u96f-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-pLwp3-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -165,7 +166,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-mPcrl-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-CSTmi-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -175,7 +176,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-Pm6eO-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-8kS7p-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" @@ -191,7 +192,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. 
If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-eHDmM-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-6Uwcp-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -206,14 +207,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-T8pZq-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-6U2qd-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-Kj09K-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-NJbfk-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -224,13 +225,13 @@ _Note: Typically you don’t want to have too many nested functions because it b ## NA Values -Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. +Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. The `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. 
For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. -```{r ex-9R5U8-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-VgwFX-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -239,7 +240,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-zcdjA-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-YUYny-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -247,7 +248,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-7SSID-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-Rd5ZH-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -269,7 +270,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-qhfaw-1, error = TRUE} +```{r ex-Kdkm9-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -291,12 +292,12 @@ A window will pop up. 
Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. -```{r ex-iDDVR-2, message = FALSE} +```{r ex-ZPGae-2, message = FALSE} library(EnvStats) ``` -```{r ex-k8gbf-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} +```{r ex-SIGlj-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -305,7 +306,7 @@ serialCorrelationTest(x) Here is a link to a page that lists many useful packages for environmental data analysis: https://cran.r-project.org/web/views/Environmetrics.html -Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to load the package to use any function that's not in the R core functionality (this is very easy to forget). +Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to use the `library()` function to load the package before you can use any function that's not in the R core functionality (this is very easy to forget). ## Importing Data @@ -318,7 +319,7 @@ R can import data from just about any format, including CSV, Excel, Databases, G R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-CSl8K-1, eval = FALSE} +```{r ex-KBipD-1, eval = FALSE} getwd() ``` @@ -326,7 +327,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. 
Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. -```{r ex-Ibwgs-2, eval = FALSE} +```{r ex-fRIyU-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -337,7 +338,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-3ilbk-1, eval = FALSE} +```{r ex-vGMD9-1, eval = FALSE} install.packages("readxl") ``` @@ -353,7 +354,7 @@ library(readxl) Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-ZKauj-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-21K7t-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) @@ -366,7 +367,7 @@ head(emissions) ### Exercise 1 -Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and use the documentation. +Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and consult the documentation.
Click for Hint diff --git a/docs/3-Subsetting-Sorting-and-Combining/readme.md b/docs/3-Subsetting-Sorting-and-Combining/readme.md index c3d78e1..94284a4 100644 --- a/docs/3-Subsetting-Sorting-and-Combining/readme.md +++ b/docs/3-Subsetting-Sorting-and-Combining/readme.md @@ -31,7 +31,7 @@ The example data for exercises in this lesson is available directly from this pa To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. -```{r ex-2Gbuo-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-SqovC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +42,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. -```{r ex-KBDaW-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-2kcO9-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +55,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. -```{r ex-cXEro-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-hv5Qx-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,7 +64,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. 
-```{r ex-jSptn-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} +```{r ex-3x2rr-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} data(chicago_air) head(chicago_air, n = 3) @@ -73,7 +73,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. -```{r ex-DbIdT-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} +```{r ex-BjwqU-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} data(chicago_air) tail(chicago_air) @@ -82,7 +82,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. -```{r ex-YNSwn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} +```{r ex-H1DeE-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} data(chicago_air) View(chicago_air) @@ -93,7 +93,7 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in from the name of the data frame variable in the console. -```{r ex-quVoO-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-uojGa-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` @@ -104,7 +104,7 @@ From the `Description` section of the help page, the `chicago_air` data frame is ## Subsetting -If we want to work with a particular subset of a data frame, we need to know how to select particular records. 
We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. +If we want to work with a particular subset of a data frame, we need to know how to select those records. We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. ## Indexing @@ -119,7 +119,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-rUpwS-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-KwyTj-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +130,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-lfswa-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-IAy8F-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +138,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. -```{r ex-w3DNF-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-Xs5Wm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +146,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. 
-```{r ex-SXN7m-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-y7DY6-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +155,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-W7JCa-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-cp1Uc-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -163,7 +163,10 @@ x[5] Now that we understand indexing we can subset the `chicago_air` data frame by using the brackets `[ , ]` function. (This is a rare example of a function in R that does not have the form `function_name()`.) -```{r ex-gz6TG-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +To get one row of the data frame, specify the row number you would like in the brackets, on the left side of the comma. If you leave the column value on the right side of the comma blank, it returns all the columns associated with row number 1. + + +```{r ex-yCmpW-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -172,7 +175,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector access the 1st, 2nd, and 5th rows of the data frame. -```{r ex-bziTN-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} +```{r ex-cOzSF-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} data(chicago_air) chicago_air[c(1, 2, 5), ] @@ -181,7 +184,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. 
By leaving the row value blank, you are telling it to return all rows associated with column 1. Below, we wrap the output in the `head()` function to limit the number of rows printed. -```{r ex-b9oJu-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} +```{r ex-IrR7i-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} data(chicago_air) head(chicago_air[, 1]) @@ -193,7 +196,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-j9Z0m-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} +```{r ex-loxf0-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} data(chicago_air) head(chicago_air[, c(3, 4, 6)]) @@ -205,7 +208,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-k1uUZ-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} +```{r ex-k12Vu-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} data(chicago_air) head(chicago_air[, "solar"]) @@ -214,7 +217,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. -```{r ex-8qN6e-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} +```{r ex-0TavJ-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} data(chicago_air) head(chicago_air[, c("ozone", "temp", "month")]) @@ -223,17 +226,17 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. 
-```{r ex-lI3Jg-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-DACxr-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` ## Access a Column with `$` -In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we would use `chicago_air$temp`. +In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we could use `chicago_air$temp`. -```{r ex-NnbKI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-cvUm8-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -267,12 +270,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. -```{r ex-1sHEg-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-TDHV9-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-kO2Er-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-55zuN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -280,7 +283,10 @@ The result of a logical expression is a logical data type, a boolean value `TRUE Vectors can also be used in a logical expression. 
A vector of values on the left hand side of a logical operator will return a vector of the same length with boolean values. -```{r ex-rODK2-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +Here, we check whether each number in the vector on the left is above 60. A logical vector of the same length is returned. + + +```{r ex-hSELm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -291,7 +297,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-gHaZ4-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-akTxt-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -302,7 +308,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-x978o-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-6PDgP-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -313,7 +319,7 @@ A data frame is returned. The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "color" column as a vector using the `$` operator. Then we create a logical vector using a logical expression.
-```{r ex-hYfZH-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-Z6Ft7-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -327,7 +333,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow. -```{r ex-iiUPn-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-cAyf6-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -335,7 +341,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-ybdNp-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-XqUl2-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -351,7 +357,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. -```{r ex-TJFSC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-S2pIk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -361,13 +367,13 @@ library(dplyr) ``` -The benefit of using `filter()` is that it works the way other functions in R typically work. It used braces with parameters, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. 
It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. +The benefit of using `filter()` is that it works the way other functions in R typically work. It uses parentheses with parameters `( )`, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-O4IYp-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-ZJrGf-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -378,7 +384,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. -```{r ex-hXWfn-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-02QFi-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -389,7 +395,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma.
-```{r ex-Kp4Gx-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-pQZj5-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -405,7 +411,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. The default is ascending order. -```{r ex-OnGT8-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-kqnGo-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -419,7 +425,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-dQB5v-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-Eqs1R-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -430,7 +436,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. -```{r ex-0nc9T-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-MlGEh-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -446,7 +452,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. 
Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. -```{r ex-ZoezH-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-P0z5V-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -458,7 +464,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-jhgMN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-zZOmG-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -474,7 +480,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. -```{r ex-iKU5j-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-vLzB7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -482,7 +488,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. -```{r ex-7dCVE-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-vyQkw-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) @@ -607,6 +613,32 @@ data(chicago_air) descending <- arrange(chicago_air, desc(date)) head(descending) + +``` + +
+ +--- + + +### Exercise 4 + +Create two data frames using the `data.frame()` function. The first data frame should have the columns `monitor_id` and `state` and at least one record. The second data frame should have the same column names and at least one record. Use the `dplyr` function `bind_rows()` to combine the two data frames. + +
Click for Solution + +#### Solution + + + +```r +library(dplyr) + +monitors_1 <- data.frame(monitor_id = c(1, 2, 3), state = c("IL", "IN", "WI")) + +monitors_2 <- data.frame(monitor_id = c(4, 5, 6), state = c("MI", "OH", "MN")) + +bind_rows(monitors_1, monitors_2) ```
diff --git a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md index 16255f2..b09a6ea 100644 --- a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md +++ b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md @@ -17,7 +17,7 @@ function. - [if Functions](#if-functions) -- [For loop](#for-loop) +- [For loops](#for-loops) - [apply function](#apply-function) @@ -31,7 +31,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". -```{r ex-CTluV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-aTV9t-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -48,7 +48,7 @@ the thing that's saved is not a data object but lines of R code. To save your own function, use this construction: -```{r ex-LXwfs-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-PQd95-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -61,7 +61,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. 
-```{r ex-oJcqC-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-LwvrO-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -70,7 +70,7 @@ print_hello <- function() { ``` -```{r ex-fkz5A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-Hfydw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -86,7 +86,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. -```{r ex-esx9e-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-FVuEL-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -97,7 +97,7 @@ print_hello <- function(text) { ``` -```{r ex-XsG8h-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-QOUS5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -105,13 +105,13 @@ print_hello(text = "everybody!") ## Default Values We can create a function with more than one argument, and set default values when -needed. Suppose we would like to make a function that checks if a measurement is below -a criteria pollutant standard. We could make simple function that takes two arguments: -one for the measurement value, and one for the standard value. +needed. Suppose we would like to make a function that checks if a measurement is +greater than a criteria pollutant standard. We could make a simple function that +takes two arguments: one for the measurement value, and one for the standard value. 
-```{r ex-LGi3P-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} -check_standard <- function(measurement, standard) { +```{r ex-orqEk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +standard_violated <- function(measurement, standard) { measurement > standard @@ -119,8 +119,8 @@ check_standard <- function(measurement, standard) { ``` -```{r ex-NbLk3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} -check_standard(measurement = 84, standard = 70) +```{r ex-787G5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} +standard_violated(measurement = 84, standard = 70) ``` @@ -128,13 +128,13 @@ We could write a more specific function for checking a value against the ozone standard. For this function, we want to keep the `standard` parameter but make sure the default is `70`. It may be that we typically want to use this function to check against the current 8-hour ozone standard in parts per billion, but have -then flexibility to use a different value. +the flexibility to use a different value. To set a default value, we use `= 70` when we create the function. 
-```{r ex-f8iTW-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} -check_standard <- function(measurement, standard = 70) { +```{r ex-OSEAX-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +standard_violated <- function(measurement, standard = 70) { measurement > standard @@ -142,8 +142,8 @@ check_standard <- function(measurement, standard = 70) { ``` -```{r ex-QMHje-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} -check_standard(measurement = 50) +```{r ex-OcZtl-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} +standard_violated(measurement = 50) ``` @@ -154,7 +154,7 @@ arguments are important. The user can supply values for the arguments in the ord they appeared in the parentheses of the `function( ){}` call, without writing out the argument names. -For example, we can supply two numbers to the `check_standard()` function that we +For example, we can supply two numbers to the `standard_violated()` function that we created above, without writing out the `measurement` and `standard` arguments. When R executes the function, it will assign the numbers to the arguments based on the position in the parentheses. @@ -163,13 +163,13 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-LLJD7-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} -check_standard(60, 70) +```{r ex-pBarX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +standard_violated(60, 70) ``` -```{r ex-i1zJN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} -check_standard(70, 60) +```{r ex-fkuGs-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +standard_violated(70, 60) ``` @@ -185,7 +185,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-TcrFf-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-t3z5X-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -194,7 +194,7 @@ if(logical expression>) { ``` -```{r ex-zgNbe-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-fOsIR-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -207,7 +207,7 @@ if() { ``` -```{r ex-b9XAw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-jEvVd-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -222,7 +222,7 @@ if(ozone > 0.065) { ``` -```{r ex-P4dii-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-KMFJr-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -237,12 +237,12 @@ if(ozone > 0.065) { ``` -```{r ex-SWlRF-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-9E7jX-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-t7oAn-6, exercise = TRUE, exercise.eval = 
FALSE, exercise.cap = 'ifelse Function Example'} +```{r ex-mzTAI-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -251,7 +251,7 @@ print(message) ``` -## For loop +## For loops Like most programming languages, R has for and while loops. This tutorial will cover just for loops and move on to `apply()` functions, which are more commonly @@ -261,7 +261,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-L8axJ-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-rMRtF-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -270,7 +270,7 @@ for(i in sequence){ ``` -```{r ex-ybMKy-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-9uJFw-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -288,7 +288,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-icnkn-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-8jAA9-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -297,7 +297,13 @@ monitors ``` -```{r ex-WNnr2-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +In the code below, we create an empty vector called max_values. As the +for() function loops through the vector c(1, 2, 3), the data frame columns +are accessed using square brackets [ , i]. Each max value is saved to +the max_values vector using square brackets [i]. 
+ + +```{r ex-ZQtez-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -328,13 +334,28 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. -```{r ex-pr0eu-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-AZOyl-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max ``` +The MARGIN argument is set to 2 because we are applying the max() function +to the columns of the data frame. Also notice that we do not need to create +an initial empty vector, as we did with the for() function. The returned +value is a named vector that is as long as the number of columns in the +data frame. + +We could also find the mean of each row in the `monitors` data frame. +To do this, we would set the `MARGIN` argument to `1`. + + +```{r ex-uDkYC-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} +apply(monitors, MARGIN = 1, FUN = mean) + +``` + ## Exercises diff --git a/docs/5-Plotting/readme.md b/docs/5-Plotting/readme.md index 71aa968..b3e806b 100644 --- a/docs/5-Plotting/readme.md +++ b/docs/5-Plotting/readme.md @@ -21,6 +21,8 @@ to build more complicated, multi-faceted plots. The `ggplot2` package is a power - [Saving Plots](#saving-plots) +- [Factors](#factors) + ## Prerequisites This lesson assumes you are familiar with the material in the lesson on @@ -28,7 +30,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-jjPLB-1, eval = FALSE} +```{r ex-x8DCE-1, eval = FALSE} install.packages("ggplot2") ``` @@ -36,7 +38,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. 
To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. -```{r ex-wgpUZ-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-AHP6c-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -48,7 +50,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. -```{r ex-oYDPX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-HSzdk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -61,7 +63,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. -```{r ex-QIAYA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-O0gbX-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -70,7 +72,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. -```{r ex-uzSQI-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-HWUJ5-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -78,7 +80,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-t8iMQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-zhdKD-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -88,13 +90,13 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. 
-```{r ex-jdAhG-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-7bj5y-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, col = "purple", lwd = 2.5, - xlab="Date", + xlab = "Date", ylab = 'Ozone (ppm)', main = 'Chicago Ozone Data') @@ -107,7 +109,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. -```{r ex-yKUas-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-vOsLC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -117,7 +119,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. -```{r ex-Z3Z3l-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-JH4UK-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -134,7 +136,7 @@ argument. If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-pMLrR-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-TzseJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -144,8 +146,8 @@ median of the ozone values. The horizontal lines at the end of the dotted lines are the min and max ozone values within a certain range from the box (specifically, 1.5 times the IQR). If a value falls outside that range, it will be represented by a point (the default point type is a circle). 
Overall, the plot gives an idea of -where the middle half of the values are, and if there are extreme values beyond -that. +where the middle half of the values are, and if there are extreme values +on either side of the distribution. The benefit of supplying a data frame to the `data` argument is to break the data @@ -154,10 +156,13 @@ the `x` argument must be a _formula_. In R, a formula is a data type that repres an equation like y = x. The way to represent this relationship in R is with the `~` character: `y ~ x`. The `boxplot()` function needs a formula to know which column in the data frame is being plotted, and which column is used to do the -grouping. We can make a plot of ozone by month using the `chicago_air` data frame. +grouping. + +We can make a plot of ozone by month using the `chicago_air` data frame +and the formula `ozone ~ month`. -```{r ex-xOp8M-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} +```{r ex-NI9lL-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -170,10 +175,10 @@ way. To use `ggplot2`, we typically follow this sequence of steps: -1. Start with the `ggplot()` function where we specify the dataset and - map variables to aesthetics (visual properties of objects in the +1. Start with the `ggplot()` function where we specify the dataset, and then we + map variables to "aesthetics" (i.e. visual properties of objects in the plot like shapes or colors). -2. Add `geoms` – geometric objects like points (`geom_point` for +2. Add `geoms` which are geometric objects like points (`geom_point` for scatter plots), bars (`geom_bar` for bar plots), or lines (`geom_line` for line plots) that determine the type of the plot. 3. Finally, customize and refine the plot with additional layers like @@ -183,19 +188,20 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. 
-```{r ex-ii28k-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-XUxiq-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` -Let's recreate the base scatter plot of ozone and temperature from the previous +First we recreate the base scatter plot of ozone and temperature from the previous section, starting with the `ggplot( )` function. The first argument `data` takes -the data frame. The `mapping` argument takes another function named `aes()`, which +the data frame. The `mapping` argument takes a function named `aes()`, which is short for aesthetic. The primary arguments in the `aes( )` function are `x` -and `y`. These determine which column from the data frame is used on the x and y axes. +and `y`. These determine which columns from the data frame are displayed +on the graph. -```{r ex-EvIUn-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-oTzAU-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -206,14 +212,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. -```{r ex-9SCpO-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-JIEo6-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. Customize it by adding color, title, and labels. 
-```{r ex-TfF6b-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-m0aaG-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -223,12 +229,12 @@ ggplot(chicago_air, aes(x = temp, y = ozone)) + ``` The `aes()` function can also map aesthetic properties like color based on other -columns in the data frame. We could want each point to have a different color based +columns in the data frame. We might want each point to have a different color based on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-2SiKx-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-09Sbl-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -238,16 +244,15 @@ ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + ``` The `factor()` function converts the data type of the month column to a `factor` -class. This class represents categorical variables in R. See the -[lesson on regression](../7-Regression-and-Data-Transformation/readme.md) for more -details on factors in R. +class. This class represents categorical variables in R. See the section +on Factors in this lesson for more details. To create a line plot of ozone over time, we use the `as.Date()` function on the date column and replace the `geom_point( )` function with the `geom_line( )` function. 
-```{r ex-F1gzi-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-ot69d-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -257,7 +262,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. -```{r ex-7i93G-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-HQ6Fj-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -266,7 +271,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. -```{r ex-jOxh0-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-aO0gx-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -275,7 +280,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-I5X47-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-v9v5U-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -291,7 +296,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-1LcZl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-EUZmI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -307,7 +312,7 @@ represented as a shaded area. 
Below, the argument `method` is given the value `lm` which stands for a linear model. -```{r ex-1gxvv-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-pCCm7-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -319,7 +324,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. -```{r ex-sp9SY-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-w6Hdx-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -337,7 +342,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-SsPB0-1, eval = FALSE} +```{r ex-SZeDt-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -346,6 +351,38 @@ ggsave(filename = "my_plot.png", plot = my_plot) ``` +## Factors + +Factors are a data structure used in R for categorizing data into a set of levels, +which is particularly useful in statistical modeling and visualizations that involve +categorical variables. They are important in R because they influence how data is +represented and analyzed in statistical models, ensuring the data is treated as +nominal or ordinal rather than continuous. + +To create a factor in R, you use the `factor()` function. This function takes a +vector of alphanumeric values and converts it into a factor, which stores the data +as integers internally while maintaining labels for these integers. 
Here is an +example of converting a character vector into a factor: + + +```{r ex-DfS6P-1, eval = FALSE} +months <- c("January", "February", "March", "January", "February") +months_factor <- factor(months) + +months_factor + +``` + +Factors are particularly useful in data analysis for a few reasons: + +- __Statistical Analysis__: Many statistical models require categorical data to + be provided as factors in order to correctly analyze it. +- __Control Order__: Factors can be ordered or unordered, and you can specify the + order of levels to influence data analysis and visual representation. +- __Efficiency__: Factors store data as integers, which can be more memory efficient + than storing strings, especially for large datasets. + + ## Exercises @@ -519,6 +556,63 @@ By utilizing `ggplot2` to create histograms of barometric pressure values, we ca ggplot(chicago_air, aes(pressure)) + geom_histogram() + facet_wrap(~month) + +``` + +
+ +--- + + +### Exercise 5 + +__CHALLENGING EXERCISE!__ Use `ggplot2` and the data frame `ertac_egu_projections` from the `region5air` package to make a map of facility locations in the CONUS ERTAC region. + +
Click for Hint + +> # Use the `data()` function to load the `ertac_egu_projections` data frame and run `?ertac_egu_projections` to see the documentation. Which columns would be useful for the x and y coordinates? + +
+ +
Click for Hint + +> # Use the `filter()` function from the `dplyr` package to filter down to the 'CONUS' region and look at the documentation for the `geom_polygon()` function: https://ggplot2.tidyverse.org/reference/geom_polygon.html. + +
+ +
Click for Hint + +> # Use the `map_data()` function from the `ggplot2` package (it needs the `maps` package to be installed) to create a base map variable to use as the `data` parameter in the `geom_polygon()` function. + +
+ +
Click for Solution + +#### Solution + +This exercise involves creating a map of facility locations in the CONUS ERTAC region using `ggplot2` and the `ertac_egu_projections` data frame from the `region5air` package. By filtering the data to the "CONUS" region and using the `geom_polygon()` function with a base map variable, we can visualize the facility locations effectively. This exercise provides a challenging opportunity to practice plotting geospatial data in R. + + +```r +library(region5air) +library(ggplot2) +library(dplyr) +library(maps) + +data(ertac_egu_projections) +# filter to CONUS region +conus <- filter(ertac_egu_projections, ertac_region == "CONUS") +states_map <- map_data("state") + +ggplot() + + geom_polygon(data = states_map, aes(x = long, y = lat, group = group), + fill = "white", color = "black") + + geom_point(data = conus, aes(x = longitude, y = latitude), color = "red", + size = 3) + + coord_fixed(1.3) + + labs(title = "Map of the US with Points", x = "Longitude", y = "Latitude") + + theme_minimal() + ```
diff --git a/docs/6-Basic-Statistics/readme.md b/docs/6-Basic-Statistics/readme.md index 81ab583..cd5ea8b 100644 --- a/docs/6-Basic-Statistics/readme.md +++ b/docs/6-Basic-Statistics/readme.md @@ -26,7 +26,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio Statistical functions are used in this lesson that require installation of the following packages. -```{r ex-mrYgt-1, eval = FALSE} +```{r ex-51h4u-1, eval = FALSE} install.packages("envstats") ``` @@ -40,7 +40,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. -```{r ex-pgJHO-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-vVzfW-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -53,17 +53,17 @@ These functions let us know the range of the data values, i.e., the highest and lowest values. -```{r ex-hpFYT-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-VQdWQ-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-y5aaF-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-MsGTt-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-PX97c-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-yK2wX-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -71,7 +71,7 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. -```{r ex-nqZNH-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-6miWt-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` @@ -81,7 +81,7 @@ the spread is for the values in the central range of the distribution, i.e., bet the 1st quartile and the 3rd quartile. 
-```{r ex-iaLj1-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-1vu7t-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` @@ -91,7 +91,7 @@ of the box itself shows the first and third quartile, while the line in the midd of the box shows the median. -```{r ex-BwSRl-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-9l0RF-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -101,12 +101,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. -```{r ex-SppiP-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-068Qu-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-y80m5-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-mT7sT-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -115,12 +115,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. -```{r ex-gFKd7-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-ptbDz-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-8sPJi-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-SSlLM-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -133,7 +133,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-v3dCg-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-lBOr0-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -143,7 +143,7 @@ difference in concentrations. Below is a plot of those two groups side by side. 
-```{r ex-jD3VH-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-SfQkn-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -154,7 +154,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. -```{r ex-0b2AA-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-QohKM-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -168,14 +168,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. -```{r ex-duUuX-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-22UpB-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-qu9LC-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-4jnjG-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -192,7 +192,7 @@ datasets are from the same distribution or not. The assumption, or null hypothes is that they are, in fact, mean values from the same distribution. -```{r ex-XBtU4-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-UOqOH-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} t.test(group1_data$value, group2_data$value) ``` @@ -217,7 +217,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. 
-```{r ex-F9Tl0-1, eval = FALSE} +```{r ex-etfDX-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -236,7 +236,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations. -```{r ex-pWrru-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-uEkBn-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -253,7 +253,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-unIry-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-Ixup6-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -267,7 +267,7 @@ do not reject the null hypothesis. We conclude there is no correlation between these two variables. -```{r ex-5rmSI-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-vhUiL-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -278,7 +278,7 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. 
-```{r ex-MeoBl-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-9Vs8T-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` diff --git a/docs/7-Quality-Assurance/readme.md b/docs/7-Quality-Assurance/readme.md index 81a4cd5..1c083f4 100644 --- a/docs/7-Quality-Assurance/readme.md +++ b/docs/7-Quality-Assurance/readme.md @@ -21,7 +21,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-mLauk-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-t8RUE-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -34,7 +34,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. -```{r ex-8Z93E-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-pHE7v-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -42,7 +42,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. 
-```{r ex-UHgmY-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-KJOyG-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -56,7 +56,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. -```{r ex-j5i0t-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-nDuQ0-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -72,7 +72,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-iUtMe-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-8da09-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -86,7 +86,7 @@ values Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. -```{r ex-ZvmnZ-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +```{r ex-h7kSA-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} # Example of identifying and handling outliers data("example_dataset") boxplot(example_dataset$value) @@ -98,7 +98,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. 
Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. -```{r ex-ME5GN-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-Ufpjn-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") diff --git a/inst/tutorials/1-Introduction/lesson.Rmd b/inst/tutorials/1-Introduction/lesson.Rmd index 81983eb..ae7e245 100644 --- a/inst/tutorials/1-Introduction/lesson.Rmd +++ b/inst/tutorials/1-Introduction/lesson.Rmd @@ -47,7 +47,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. -```{r ex-MTiVW-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-bPZt0-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -74,12 +74,12 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. -```{r ex-ebpxn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-ykH12-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-ufpL9-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-qeOCv-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` @@ -93,7 +93,7 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). 
-```{r ex-cyWBd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-6edAy-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment @@ -103,7 +103,7 @@ In the example above and the previous section, you can see the R code and its ou the R code and `## [1] 2` being the output: -```{r ex-byZ3W-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} +```{r ex-WwgSA-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -111,7 +111,7 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-9gHIa-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +```{r ex-q9e11-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} > 1 + 1 [1] 2 @@ -123,7 +123,7 @@ A variable is a letter or combination of alphanumeric characters that is used to with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. -```{r ex-mHvvD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-mMpFn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -138,12 +138,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value. 
-```{r ex-kUDcq-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-4Aher-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-5d8sV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-6FMuV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -160,7 +160,7 @@ There are 3 important rules to remember when creating variable names: Try running the following code and you will see that in your global environment there are two different objects listed. -```{r ex-Vw1tV-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +```{r ex-rbQdo-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} y <- 5 Y <- 10 @@ -192,13 +192,13 @@ There are several ways to store groups of data to make them easier to work with: A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. -```{r ex-axso2-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-ASwYE-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-ouMaS-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-qXb3N-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -208,7 +208,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-4S8mo-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-2si1E-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -218,7 +218,7 @@ find them and it returns an error. 
The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-CqAJ8-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-icerQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -228,7 +228,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-1xLzn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-X8z2D-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -239,7 +239,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-nhD28-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-EX1dw-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -247,7 +247,7 @@ x[[2]] Lists can also contain vectors and other lists. -```{r ex-9qNwR-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-UJ7Pe-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -259,7 +259,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-4m4IT-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-zi0BA-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -270,7 +270,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. 
-```{r ex-MaAmV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-hhNQ3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -283,7 +283,7 @@ The output above shows a table with the vector variable names as column names, a create a data frame where the vectors are not all the same length, you will see the error shown below. -```{r ex-kt2Ak-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-GHIDP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) diff --git a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd index 5cc4371..d29b445 100644 --- a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd +++ b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd @@ -33,31 +33,32 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. -```{r ex-zjABO-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-KGbwx-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) ``` -As you would expect, R has many built-in math functions. Below are a series of examples. +As you would expect, R has many built-in math functions. 
Below are a few +examples. -```{r ex-tk9tQ-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-N3tac-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-q55TG-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-ul39I-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-LgzHK-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-HWVfX-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-Ppiy7-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-8Inet-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -68,7 +69,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. -```{r ex-akxtC-6, eval = FALSE} +```{r ex-Gw7lg-6, eval = FALSE} ?mean() ``` @@ -84,17 +85,17 @@ On the help page, under `Usage`, you see `mean(x, ...)`. This means that the onl Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. 
-```{r ex-rhZdj-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-z51CK-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-8j1Yx-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-Lt2TR-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-d2mMB-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-VaHN1-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` @@ -109,7 +110,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-K0Mih-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-pv936-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -119,7 +120,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-NQbSf-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-eaXyL-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -129,8 +130,8 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector. -```{r ex-z2NwD-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} -max(c(6, 9, 3, 11, -2)) +```{r ex-NH2bE-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +max(c(15, 2, 8.3, -10, 21)) ``` @@ -139,7 +140,7 @@ max(c(6, 9, 3, 11, -2)) Create a numeric vector with a certain sequence. 
The example below creates a vector of integers from 1 to 5. -```{r ex-kuKw1-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-RM9C8-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -147,7 +148,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-yzJw9-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-eLFnq-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -157,7 +158,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-zKlrp-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-C27oO-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -167,7 +168,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-CLJeb-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-wTyKS-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" @@ -183,7 +184,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. 
If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-AJsza-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-8vFtN-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -198,14 +199,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-v4Oam-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-Pf9AU-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-bbQSh-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-1K8TV-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -216,13 +217,13 @@ _Note: Typically you don’t want to have too many nested functions because it b ## NA Values -Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. +Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. The `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. 
For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. -```{r ex-uOhJx-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-fmSM6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -231,7 +232,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-n8XjO-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-SjMht-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -239,7 +240,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-ERw6X-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-LVWAV-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -261,7 +262,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-g63qu-1, error = TRUE} +```{r ex-xBFkR-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -283,12 +284,12 @@ A window will pop up. 
Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. -```{r ex-B7VCB-2, message = FALSE} +```{r ex-ga6IS-2, message = FALSE} library(EnvStats) ``` -```{r ex-gZ7KC-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} +```{r ex-Ps96A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -297,7 +298,7 @@ serialCorrelationTest(x) Here is a link to a page that lists many useful packages for environmental data analysis: https://cran.r-project.org/web/views/Environmetrics.html -Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to load the package to use any function that's not in the R core functionality (this is very easy to forget). +Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to use the `library()` function to load the package before you can use any function that's not in the R core functionality (this is very easy to forget). ## Importing Data @@ -310,7 +311,7 @@ R can import data from just about any format, including CSV, Excel, Databases, G R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-GCoXE-1, eval = FALSE} +```{r ex-ftJJf-1, eval = FALSE} getwd() ``` @@ -318,7 +319,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. 
Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. -```{r ex-2Fbtm-2, eval = FALSE} +```{r ex-coc7X-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -329,7 +330,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-bCVPT-1, eval = FALSE} +```{r ex-GltGn-1, eval = FALSE} install.packages("readxl") ``` @@ -345,7 +346,7 @@ library(readxl) Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-QWZFw-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-Zus1W-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) @@ -357,7 +358,7 @@ head(emissions) ### Exercise 1 -Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and use the documentation. +Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and consult the documentation. 
```{r exercise1, exercise = TRUE} # Your code here diff --git a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd index 6b91380..d7ca5b4 100644 --- a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd +++ b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd @@ -31,7 +31,7 @@ The example data for exercises in this lesson is available directly from this pa To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. -```{r ex-6wiK1-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-Ff5Sd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +42,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. -```{r ex-zhXCH-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-61y8i-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +55,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. -```{r ex-KnlRD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-XBPSe-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,7 +64,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. 
-```{r ex-5LDbm-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} +```{r ex-W8xk7-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} data(chicago_air) head(chicago_air, n = 3) @@ -73,7 +73,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. -```{r ex-1qAUt-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} +```{r ex-k9SN6-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} data(chicago_air) tail(chicago_air) @@ -82,7 +82,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. -```{r ex-GCv7f-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} +```{r ex-4Q5uw-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} data(chicago_air) View(chicago_air) @@ -93,7 +93,7 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in from the name of the data frame variable in the console. -```{r ex-JDWqG-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-fcXTz-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` @@ -104,7 +104,7 @@ From the `Description` section of the help page, the `chicago_air` data frame is ## Subsetting -If we want to work with a particular subset of a data frame, we need to know how to select particular records. 
We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. +If we want to work with a particular subset of a data frame, we need to know how to select those records. We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. ### Indexing @@ -119,7 +119,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-Vhe21-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-wbzvC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +130,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-c26CI-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-dIfsD-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +138,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. -```{r ex-bdSdp-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-dQYP7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +146,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. 
-```{r ex-54xbn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-crQeB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +155,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-lzp3w-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-0RkJ8-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -163,7 +163,10 @@ x[5] Now that we understand indexing we can subset the `chicago_air` data frame by using the brackets `[ , ]` function. (This is a rare example of a function in R that does not have the form `function_name()`.) -```{r ex-dvaSW-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +To get one row of the data frame, specify the row number you would like in the brackets, on the left side of the comma. If you leave the column value on the right side of the comma blank, it returns all the columns associated with row number 1. + + +```{r ex-pJuzG-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -172,7 +175,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector access the 1st, 2nd, and 5th rows of the data frame. -```{r ex-m0BhJ-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} +```{r ex-rOwjG-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} data(chicago_air) chicago_air[c(1, 2, 5), ] @@ -181,7 +184,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. 
By leaving the row value blank, you are telling it to return all rows associated with column 1. Below, we wrap the output in the `head()` function to limit the number of rows printed. -```{r ex-g7A32-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} +```{r ex-DidXA-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} data(chicago_air) head(chicago_air[, 1]) @@ -193,7 +196,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-fMpFj-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} +```{r ex-lUdte-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} data(chicago_air) head(chicago_air[, c(3, 4, 6)]) @@ -205,7 +208,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-kzV6j-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} +```{r ex-vaAO3-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} data(chicago_air) head(chicago_air[, "solar"]) @@ -214,7 +217,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. -```{r ex-oGEmr-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} +```{r ex-3yZVc-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} data(chicago_air) head(chicago_air[, c("ozone", "temp", "month")]) @@ -223,17 +226,17 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. 
-```{r ex-XFSJ4-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-YRGcl-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` ### Access a Column with `$` -In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we would use `chicago_air$temp`. +In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we could use `chicago_air$temp`. -```{r ex-6xvxH-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-9PZ88-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -267,12 +270,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. -```{r ex-5wAI4-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-ZjgTb-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-ejfud-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-rlN7A-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -280,7 +283,10 @@ The result of a logical expression is a logical data type, a boolean value `TRUE Vectors can also be used in a logical expression. 
A vector of values on the left hand side of a logical operator will return a vector of the same length with boolean values. -```{r ex-DlaPL-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +Here, we check each number in the vector on the left to see whether it is above 60. A logical vector with one value per element is returned. + + +```{r ex-wZ7Fl-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -291,7 +297,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-21WYI-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-9sTMH-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -302,7 +308,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-sEk2e-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-Ld3tB-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -313,7 +319,7 @@ A data frame is returned. The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "color" column as a vector using the `$` operator. Then we create a logical vector using a logical expression. 
-```{r ex-MnvL1-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-NEemJ-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -327,7 +333,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow. -```{r ex-a5y6T-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-uzDCk-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -335,7 +341,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-2uYLb-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-s91s1-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -351,7 +357,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. -```{r ex-lbWOB-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-xdO0G-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -361,13 +367,13 @@ library(dplyr) ``` -The benefit of using `filter()` is that it works the way other functions in R typically work. It used braces with parameters, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. 
It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. +The benefit of using `filter()` is that it works the way other functions in R typically work. It uses parentheses with parameters `( )`, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-5JvvD-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-xGrfr-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -378,7 +384,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. -```{r ex-ZTpqB-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-4GEoD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -389,7 +395,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma. 
-```{r ex-4F3Dj-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-nZCJu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -405,7 +411,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. The default is ascending order. -```{r ex-gBIfC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-dobXr-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -419,7 +425,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-BHhgd-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-oI5Hz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -430,7 +436,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. -```{r ex-S7YuE-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-E6e60-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -446,7 +452,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. 
Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. -```{r ex-iN7cR-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-sS6aV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -458,7 +464,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-bFcau-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-kZtfG-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -474,7 +480,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. -```{r ex-C466c-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-n1L5E-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -482,7 +488,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. 
-```{r ex-ZxDYZ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-ghHy3-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) @@ -585,6 +591,7 @@ data(chicago_air) descending <- arrange(chicago_air, desc(date)) head(descending) + ``` ```{r exercise3-check} @@ -594,6 +601,31 @@ grade_this_code( ``` +### Exercise 4 + +Create two data frames using the `data.frame()` function. The first data frame should have the columns `monitor_id` and `state` and at least one record. The second data frame should have the same column names and at least one record. Use the `dplyr` function `bind_rows()` to combine the two data frames. + +```{r exercise4, exercise = TRUE} +# Your code here +``` + +```{r exercise4-solution} +library(dplyr) + +monitors_1 <- data.frame(monitor_id = c(1, 2, 3), state = c("IL", "IN", "WI")) + +monitors_2 <- data.frame(monitor_id = c(4, 5, 6), state = c("MI", "OH", "MN")) + +bind_rows(monitors_1, monitors_2) +``` + +```{r exercise4-check} +grade_this_code( + correct = c(gradethis::random_praise(), "") +) +``` + + ## Next Lesson diff --git a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd index c74ede1..58e7fd0 100644 --- a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd +++ b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd @@ -32,7 +32,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". 
-```{r ex-TzN8D-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-nrhfD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -49,7 +49,7 @@ the thing that's saved is not a data object but lines of R code. To save your own function, use this construction: -```{r ex-uKJbS-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-leVmm-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -62,7 +62,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. -```{r ex-hTa1P-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-HqNOS-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -71,7 +71,7 @@ print_hello <- function() { ``` -```{r ex-GyDJs-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-fwQ5L-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -87,7 +87,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. 
-```{r ex-TD71X-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-2P8Tn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -98,7 +98,7 @@ print_hello <- function(text) { ``` -```{r ex-pzP6b-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-LRgr3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -106,13 +106,13 @@ print_hello(text = "everybody!") ## Default Values We can create a function with more than one argument, and set default values when -needed. Suppose we would like to make a function that checks if a measurement is below -a criteria pollutant standard. We could make simple function that takes two arguments: -one for the measurement value, and one for the standard value. +needed. Suppose we would like to make a function that checks if a measurement is +greater than a criteria pollutant standard. We could make a simple function that +takes two arguments: one for the measurement value, and one for the standard value. 
-```{r ex-dB1MP-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} -check_standard <- function(measurement, standard) { +```{r ex-0qVRQ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +standard_violated <- function(measurement, standard) { measurement > standard @@ -120,8 +120,8 @@ check_standard <- function(measurement, standard) { ``` -```{r ex-kpphy-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard Function'} -check_standard(measurement = 84, standard = 70) +```{r ex-Qop4d-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} +standard_violated(measurement = 84, standard = 70) ``` @@ -129,13 +129,13 @@ We could write a more specific function for checking a value against the ozone standard. For this function, we want to keep the `standard` parameter but make sure the default is `70`. It may be that we typically want to use this function to check against the current 8-hour ozone standard in parts per billion, but have -then flexibility to use a different value. +the flexibility to use a different value. To set a default value, we use `= 70` when we create the function. 
-```{r ex-VGkJm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} -check_standard <- function(measurement, standard = 70) { +```{r ex-MZLuD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +standard_violated <- function(measurement, standard = 70) { measurement > standard @@ -143,8 +143,8 @@ check_standard <- function(measurement, standard = 70) { ``` -```{r ex-wET6k-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing check_standard with Default Value'} -check_standard(measurement = 50) +```{r ex-3zbVT-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} +standard_violated(measurement = 50) ``` @@ -155,7 +155,7 @@ arguments are important. The user can supply values for the arguments in the ord they appeared in the parentheses of the `function( ){}` call, without writing out the argument names. -For example, we can supply two numbers to the `check_standard()` function that we +For example, we can supply two numbers to the `standard_violated()` function that we created above, without writing out the `measurement` and `standard` arguments. When R executes the function, it will assign the numbers to the arguments based on the position in the parentheses. @@ -164,13 +164,13 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-Azdwt-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} -check_standard(60, 70) +```{r ex-GWBtx-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +standard_violated(60, 70) ``` -```{r ex-CS7VP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} -check_standard(70, 60) +```{r ex-8GIop-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +standard_violated(70, 60) ``` @@ -186,7 +186,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-w4ftE-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-QUyL3-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -195,7 +195,7 @@ if(logical expression>) { ``` -```{r ex-7ZGO9-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-UdUgn-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -208,7 +208,7 @@ if() { ``` -```{r ex-BaGTG-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-48zih-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -223,7 +223,7 @@ if(ozone > 0.065) { ``` -```{r ex-cYEUK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-gDSuK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -238,12 +238,12 @@ if(ozone > 0.065) { ``` -```{r ex-H30oE-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-3aWMJ-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-kz8FH-6, exercise = TRUE, exercise.eval = 
FALSE, exercise.cap = 'ifelse Function Example'} +```{r ex-2YM6L-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -252,7 +252,7 @@ print(message) ``` -## For loop +## For loops Like most programming languages, R has for and while loops. This tutorial will cover just for loops and move on to `apply()` functions, which are more commonly @@ -262,7 +262,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-nMEPX-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-ofECG-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -271,7 +271,7 @@ for(i in sequence){ ``` -```{r ex-5Xu1i-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-dU92z-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -289,7 +289,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-nq2hb-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-0a2Hb-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -298,7 +298,13 @@ monitors ``` -```{r ex-Ga6ae-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +In the code below, we create an empty vector called max_values. As the +for() function loops through the vector c(1, 2, 3), the data frame columns +are accessed using square brackets [ , i]. Each max value is saved to +the max_values vector using square brackets [i]. 
+ + +```{r ex-koVZJ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -329,13 +335,28 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. -```{r ex-qOJGn-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-TQVh0-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max ``` +The MARGIN argument is set to 2 because we are applying the max() function +to the columns of the data frame. Also notice that we do not need to create +an initial empty vector, as we did with the for() function. The returned +value is a named vector that is as long as the number of columns in the +data frame. + +We could also find the mean of each row in the `monitors` data frame. +To do this, we would set the `MARGIN` argument to `1`. + + +```{r ex-bGiYj-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} +apply(monitors, MARGIN = 1, FUN = mean) + +``` + ## Exercises {data-progressive=TRUE} diff --git a/inst/tutorials/5-Plotting/lesson.Rmd b/inst/tutorials/5-Plotting/lesson.Rmd index efc6368..7a0e21e 100644 --- a/inst/tutorials/5-Plotting/lesson.Rmd +++ b/inst/tutorials/5-Plotting/lesson.Rmd @@ -29,7 +29,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-8sfdI-1, eval = FALSE} +```{r ex-wZCMY-1, eval = FALSE} install.packages("ggplot2") ``` @@ -37,7 +37,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. 
-```{r ex-aXr0c-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-WmpIN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -49,7 +49,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. -```{r ex-VoMZI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-mprS0-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -62,7 +62,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. -```{r ex-2ZeNC-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-6RNew-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -71,7 +71,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. -```{r ex-x6rUw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-bluf7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -79,7 +79,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-qobUB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-UjZBQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -89,13 +89,13 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. 
-```{r ex-IzNJV-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-qXCFF-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, col = "purple", lwd = 2.5, - xlab="Date", + xlab = "Date", ylab = 'Ozone (ppm)', main = 'Chicago Ozone Data') @@ -108,7 +108,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. -```{r ex-GRaK9-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-HxDBk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -118,7 +118,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. -```{r ex-RO2ag-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-pn82v-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -135,7 +135,7 @@ argument. If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-oqO8i-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-DqR5T-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -145,8 +145,8 @@ median of the ozone values. The horizontal lines at the end of the dotted lines are the min and max ozone values within a certain range from the box (specifically, 1.5 times the IQR). If a value falls outside that range, it will be represented by a point (the default point type is a circle). 
Overall, the plot gives an idea of -where the middle half of the values are, and if there are extreme values beyond -that. +where the middle half of the values are, and if there are extreme values +on either side of the distribution. The benefit of supplying a data frame to the `data` argument is to break the data @@ -155,10 +155,13 @@ the `x` argument must be a _formula_. In R, a formula is a data type that repres an equation like y = x. The way to represent this relationship in R is with the `~` character: `y ~ x`. The `boxplot()` function needs a formula to know which column in the data frame is being plotted, and which column is used to do the -grouping. We can make a plot of ozone by month using the `chicago_air` data frame. +grouping. +We can make a plot of ozone by month using the `chicago_air` data frame +and the formula `ozone ~ month`. -```{r ex-C9rP6-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} + +```{r ex-9sWx8-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -171,10 +174,10 @@ way. To use `ggplot2`, we typically follow this sequence of steps: -1. Start with the `ggplot()` function where we specify the dataset and - map variables to aesthetics (visual properties of objects in the +1. Start with the `ggplot()` function where we specify the dataset, and then we + map variables to "aesthetics" (i.e. visual properties of objects in the plot like shapes or colors). -2. Add `geoms` – geometric objects like points (`geom_point` for +2. Add `geoms` which are geometric objects like points (`geom_point` for scatter plots), bars (`geom_bar` for bar plots), or lines (`geom_line` for line plots) that determine the type of the plot. 3. Finally, customize and refine the plot with additional layers like @@ -184,19 +187,20 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. 
-```{r ex-y5bMX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-MfDZd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` -Let's recreate the base scatter plot of ozone and temperature from the previous +First we recreate the base scatter plot of ozone and temperature from the previous section, starting with the `ggplot( )` function. The first argument `data` takes -the data frame. The `mapping` argument takes another function named `aes()`, which +the data frame. The `mapping` argument takes a function named `aes()`, which is short for aesthetic. The primary arguments in the `aes( )` function are `x` -and `y`. These determine which column from the data frame is used on the x and y axes. +and `y`. These determine which columns from the data frame are displayed +on the graph. -```{r ex-rZGGh-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-GM7Gb-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -207,14 +211,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. -```{r ex-EMp9A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-QA75M-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. Customize it by adding color, title, and labels. 
-```{r ex-pUDJf-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-4ylyu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -224,12 +228,12 @@ ggplot(chicago_air, aes(x = temp, y = ozone)) + ``` The `aes()` function can also map aesthetic properties like color based on other -columns in the data frame. We could want each point to have a different color based +columns in the data frame. We might want each point to have a different color based on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-HFIjR-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-yBykX-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -239,16 +243,15 @@ ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + ``` The `factor()` function converts the data type of the month column to a `factor` -class. This class represents categorical variables in R. See the -[lesson on regression](../7-Regression-and-Data-Transformation/readme.md) for more -details on factors in R. +class. This class represents categorical variables in R. See the section +on Factors in this lesson for more details. To create a line plot of ozone over time, we use the `as.Date()` function on the date column and replace the `geom_point( )` function with the `geom_line( )` function. 
-```{r ex-kqYgY-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-G0l0I-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -258,7 +261,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. -```{r ex-ryF4b-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-WzjC4-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -267,7 +270,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. -```{r ex-PJz74-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-XFlIb-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -276,7 +279,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-aeCuC-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-bnjIN-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -292,7 +295,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-0Fva3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-fvlLl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -308,7 +311,7 @@ represented as a shaded area. 
Below, the argument `method` is given the value `lm` which stands for a linear model. -```{r ex-fRNCC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-iIqRr-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -320,7 +323,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. -```{r ex-njElP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-XPuES-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -338,7 +341,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-dlAjr-1, eval = FALSE} +```{r ex-jCzeg-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -347,6 +350,38 @@ ggsave(filename = "my_plot.png", plot = my_plot) ``` +## Factors + +Factors are a data structure used in R for categorizing data into a set of levels, +which is particularly useful in statistical modeling and visualizations that involve +categorical variables. They are important in R because they influence how data is +represented and analyzed in statistical models, ensuring the data is treated as +nominal or ordinal rather than continuous. + +To create a factor in R, you use the `factor()` function. This function takes a +vector of alphanumeric values and converts it into a factor, which stores the data +as integers internally while maintaining labels for these integers. 
Here is an +example of converting a character vector into a factor: + + +```{r ex-jUKrV-1, eval = FALSE} +months <- c("January", "February", "March", "January", "February") +months_factor <- factor(months) + +months_factor + +``` + +Factors are particularly useful in data analysis for a few reasons: + +- __Statistical Analysis__: Many statistical models require categorical data to + be provided as factors in order to correctly analyze it. +- __Control Order__: Factors can be ordered or unordered, and you can specify the + order of levels to influence data analysis and visual representation. +- __Efficiency__: Factors store data as integers, which can be more memory efficient + than storing strings, especially for large datasets. + + ## Exercises {data-progressive=TRUE} @@ -485,6 +520,7 @@ Use `ggplot2` to plot histograms of barometric pressure values for each month fr ggplot(chicago_air, aes(pressure)) + geom_histogram() + facet_wrap(~month) + ``` ```{r exercise4-check} @@ -494,6 +530,55 @@ grade_this_code( ``` +### Exercise 5 + +__CHALLENGING EXERCISE!__ Use `ggplot2` and the data frame `ertac_egu_projections` from the `region5air` package to make a map of facility locations in the CONUS ERTAC region. + +```{r exercise5, exercise = TRUE} +# Your code here +``` + +```{r exercise5-hint-1} +# Use the `data()` function to load the `ertac_egu_projections` data frame and ?ertac_egu_projections to see the documentation. Which columns would be useful for the x and y coordinates? +``` + +```{r exercise5-hint-2} +# Use the `filter()` function from the `dplyr` package to filter down to the 'CONUS' region and look at the documentation for the `geom_polygon()` function: https://ggplot2.tidyverse.org/reference/geom_polygon.html. +``` + +```{r exercise5-hint-3} +# Use the `map_data()` function from the `ggplot2` package (it requires the `maps` package to be installed) to create a base map variable to use as the `data` parameter in the `geom_polygon()` function. 
+``` + +```{r exercise5-solution} +library(region5air) +library(ggplot2) +library(dplyr) +library(maps) + +data(ertac_egu_projections) +# filter to CONUS region +conus <- filter(ertac_egu_projections, ertac_region == "CONUS") +states_map <- map_data("state") + +ggplot() + + geom_polygon(data = states_map, aes(x = long, y = lat, group = group), + fill = "white", color = "black") + + geom_point(data = conus, aes(x = longitude, y = latitude), color = "red", + size = 3) + + coord_fixed(1.3) + + labs(title = "Map of the US with Points", x = "Longitude", y = "Latitude") + + theme_minimal() + +``` + +```{r exercise5-check} +grade_this_code( + correct = c(gradethis::random_praise(), "This exercise involves creating a map of facility locations in the CONUS ERTAC region using `ggplot2` and the `ertac_egu_projections` data frame from the `region5air` package. By filtering the data to the 'CONUS' region and using the `geom_polygon()` function with a base map variable, we can visualize the facility locations effectively. This exercise provides a challenging opportunity to practice plotting geospatial data in R. ") +) +``` + + ## Next Lesson diff --git a/inst/tutorials/6-Basic-Statistics/lesson.Rmd b/inst/tutorials/6-Basic-Statistics/lesson.Rmd index 8083137..637a65c 100644 --- a/inst/tutorials/6-Basic-Statistics/lesson.Rmd +++ b/inst/tutorials/6-Basic-Statistics/lesson.Rmd @@ -29,7 +29,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio Statistical functions are used in this lesson that require installation of the following packages. -```{r ex-oiHQs-1, eval = FALSE} +```{r ex-a2k2I-1, eval = FALSE} install.packages("envstats") ``` @@ -43,7 +43,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. 
-```{r ex-EAgK4-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-1qQ2j-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -56,17 +56,17 @@ These functions let us know the range of the data values, i.e., the highest and lowest values. -```{r ex-OvzXh-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-weaEx-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-S5tF6-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-7QIRr-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-fZq9J-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-xgBvB-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -74,7 +74,7 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. -```{r ex-WlGqt-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-UT6Xm-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` @@ -84,7 +84,7 @@ the spread is for the values in the central range of the distribution, i.e., bet the 1st quartile and the 3rd quartile. -```{r ex-MrRFk-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-eXOtH-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` @@ -94,7 +94,7 @@ of the box itself shows the first and third quartile, while the line in the midd of the box shows the median. -```{r ex-Zl2MW-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-Iw1o0-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -104,12 +104,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. 
-```{r ex-xtlnV-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-K7PqR-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-FGd5i-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-bkBnL-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -118,12 +118,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. -```{r ex-Gymng-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-kPlk7-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-ywSV1-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-kmE21-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -136,7 +136,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-IZZoZ-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-qqNT1-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -146,7 +146,7 @@ difference in concentrations. Below is a plot of those two groups side by side. -```{r ex-jRYZK-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-RSizC-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -157,7 +157,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. 
-```{r ex-Newy0-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-5Ltn0-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -171,14 +171,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. -```{r ex-FAgan-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-rKJeL-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-4JfUa-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-gKeKN-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -195,7 +195,7 @@ datasets are from the same distribution or not. The assumption, or null hypothes is that they are, in fact, mean values from the same distribution. -```{r ex-4ZJf3-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-5CJ5g-6, exercise = TRUE, exercise.cap = "Student's t-test between two groups"} t.test(group1_data$value, group2_data$value) ``` @@ -220,7 +220,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. -```{r ex-eyXT2-1, eval = FALSE} +```{r ex-ja2Cp-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -239,7 +239,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations. 
-```{r ex-RvBN0-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-Mt13i-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -256,7 +256,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-33Nto-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-cBslA-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -270,7 +270,7 @@ do not reject the null hypothesis. We conclude there is no correlation between these two variables. -```{r ex-uUpmT-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-hmGTn-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -281,7 +281,7 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. -```{r ex-lt9er-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-3N14X-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` diff --git a/inst/tutorials/7-Quality-Assurance/lesson.Rmd b/inst/tutorials/7-Quality-Assurance/lesson.Rmd index c2558ec..367352d 100644 --- a/inst/tutorials/7-Quality-Assurance/lesson.Rmd +++ b/inst/tutorials/7-Quality-Assurance/lesson.Rmd @@ -27,7 +27,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. 
To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-vhRJG-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-oJlVk-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -40,7 +40,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. -```{r ex-n1jfS-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-l5aYh-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -48,7 +48,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. -```{r ex-k9zoX-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-9P9lB-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -62,7 +62,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. 
-```{r ex-T2bJJ-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-xVxxM-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -78,7 +78,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-nnd6k-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-XvPip-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -92,7 +92,7 @@ values Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. -```{r ex-u3chC-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +```{r ex-RwuAn-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} # Example of identifying and handling outliers data("example_dataset") boxplot(example_dataset$value) @@ -104,7 +104,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. 
-```{r ex-Swbgo-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-LCQYa-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") diff --git a/source/2-Functions-and-Importing-Data/lesson2.yaml b/source/2-Functions-and-Importing-Data/lesson2.yaml index 98c671a..e4e2ff8 100644 --- a/source/2-Functions-and-Importing-Data/lesson2.yaml +++ b/source/2-Functions-and-Importing-Data/lesson2.yaml @@ -38,7 +38,8 @@ content: mean(x) - type: paragraph content: | - As you would expect, R has many built-in math functions. Below are a series of examples. + As you would expect, R has many built-in math functions. Below are a few + examples. - type: code language: r options: @@ -176,7 +177,7 @@ content: exercise.lines: 5 exercise.cap: "Maximum Value" content: | - max(c(6, 9, 3, 11, -2)) + max(c(15, 2, 8.3, -10, 21)) - type: section title: "4. `seq()`" content: @@ -306,7 +307,7 @@ content: content: - type: paragraph content: | - Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. + Most of the statistical summary functions in R have the argument `na.rm`. This stands for `NA` remove. The `NA` value is how R represents a missing value, similar to the NULL value in a SQL database. - type: paragraph content: | For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. 
@@ -407,7 +408,7 @@ content: Here is a link to a page that lists many useful packages for environmental data analysis: https://cran.r-project.org/web/views/Environmetrics.html - type: paragraph content: | - Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to load the package to use any function that's not in the R core functionality (this is very easy to forget). + Remember, when you close down RStudio, then start it up again, you don’t have to download the package again. But you do have to use the `library()` function to load the package before you can use any function that's not in the R core functionality (this is very easy to forget). - type: section title: Importing Data content: @@ -470,7 +471,7 @@ content: emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) exercises: - - instructions: "Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and use the documentation." + - instructions: "Use the `seq()` function to create a vector from 1 to 20 by 2. For help with the parameters, run `?seq()` in the console and consult the documentation." hints: - "# To create a sequence, use the `seq()` function with `from`, `to`, and `by` parameters." - "# `from` specifies the starting number, `to` the ending number, and `by` how much to increment. For example, `seq(from = 1, to = 20, by = 2)`." diff --git a/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml b/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml index 8d96c35..99f8e7f 100644 --- a/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml +++ b/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml @@ -123,7 +123,7 @@ content: content: - type: paragraph content: | - If we want to work with a particular subset of a data frame, we need to know how to select particular records. 
We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. + If we want to work with a particular subset of a data frame, we need to know how to select those records. We will cover how to subset using numeric indexing, logical conditions, and the `filter()` function. - type: section title: Indexing content: @@ -196,6 +196,9 @@ content: - type: paragraph content: | Now that we understand indexing we can subset the `chicago_air` data frame by using the brackets `[ , ]` function. (This is a rare example of a function in R that does not have the form `function_name()`.) + - type: paragraph + content: | + To get one row of the data frame, specify the row number you would like in the brackets, on the left side of the comma. If you leave the column value on the right side of the comma blank, it returns all the columns associated with row number 1. - type: code language: r options: @@ -287,7 +290,7 @@ content: content: - type: paragraph content: | - In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we would use `chicago_air$temp`. + In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we could use `chicago_air$temp`. - type: code language: r options: @@ -345,6 +348,9 @@ content: - type: paragraph content: | Vectors can also be used in a logical expression. A vector of values on the left hand side of a logical operator will return a vector of the same length with boolean values. + - type: paragraph + content: | + Here, we check if any of the integers in the vector on the left are above 60. 
A logical vector is returned. - type: code language: r options: @@ -450,7 +456,7 @@ content: library(dplyr) - type: paragraph content: | - The benefit of using `filter()` is that it works the way other functions in R typically work. It used braces with parameters, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. + The benefit of using `filter()` is that it works the way other functions in R typically work. It uses parentheses with parameters `( )`, and not brackets `[ , ]`. The first parameter is the data frame you want to subset, and the second parameter is a logical expression. It also allows you to reference the columns in the data frame by name, without having to access the column using `$`. - type: paragraph content: | If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. @@ -651,3 +657,13 @@ exercises: descending <- arrange(chicago_air, desc(date)) head(descending) + - instructions: "Create two data frames using the `data.frame()` function. The first data frame should have the columns `monitor_id` and `state` and at least one record. The second data frame should have the same column names and at least one record. Use the `dplyr` function `bind_rows()` to combine the two data frames." 
+ solution: + code: | + library(dplyr) + + monitors_1 <- data.frame(monitor_id = c(1, 2, 3), state = c("IL", "IN", "WI")) + + monitors_2 <- data.frame(monitor_id = c(4, 5, 6), state = c("MI", "OH", "MN")) + + bind_rows(monitors_1, monitors_2) diff --git a/source/4-Writing-Functions-Conditionals-and-Loops/lesson4.yaml b/source/4-Writing-Functions-Conditionals-and-Loops/lesson4.yaml index 03c061d..10a9f49 100644 --- a/source/4-Writing-Functions-Conditionals-and-Loops/lesson4.yaml +++ b/source/4-Writing-Functions-Conditionals-and-Loops/lesson4.yaml @@ -121,9 +121,9 @@ content: - type: paragraph content: | We can create a function with more than one argument, and set default values when - needed. Suppose we would like to make a function that checks if a measurement is below - a criteria pollutant standard. We could make simple function that takes two arguments: - one for the measurement value, and one for the standard value. + needed. Suppose we would like to make a function that checks if a measurement is + greater than a criteria pollutant standard. We could make a simple function that + takes two arguments: one for the measurement value, and one for the standard value. - type: code language: r options: @@ -131,7 +131,7 @@ content: exercise.eval: false exercise.cap: "Function with Two Arguments" content: | - check_standard <- function(measurement, standard) { + standard_violated <- function(measurement, standard) { measurement > standard @@ -141,16 +141,16 @@ content: options: exercise: true exercise.eval: false - exercise.cap: "Testing check_standard Function" + exercise.cap: "Testing standard_violated Function" content: | - check_standard(measurement = 84, standard = 70) + standard_violated(measurement = 84, standard = 70) - type: paragraph content: | We could write a more specific function for checking a value against the ozone standard. For this function, we want to keep the `standard` parameter but make sure the default is `70`. 
It may be that we typically want to use this function to check against the current 8-hour ozone standard in parts per billion, but have - then flexibility to use a different value. + the flexibility to use a different value. To set a default value, we use `= 70` when we create the function. - type: code @@ -160,7 +160,7 @@ content: exercise.eval: false exercise.cap: "Function with Default Value" content: | - check_standard <- function(measurement, standard = 70) { + standard_violated <- function(measurement, standard = 70) { measurement > standard @@ -170,9 +170,9 @@ content: options: exercise: true exercise.eval: false - exercise.cap: "Testing check_standard with Default Value" + exercise.cap: "Testing standard_violated with Default Value" content: | - check_standard(measurement = 50) + standard_violated(measurement = 50) - type: section title: Positional Arguments content: @@ -183,7 +183,7 @@ content: they appeared in the parentheses of the `function( ){}` call, without writing out the argument names. - For example, we can supply two numbers to the `check_standard()` function that we + For example, we can supply two numbers to the `standard_violated()` function that we created above, without writing out the `measurement` and `standard` arguments. When R executes the function, it will assign the numbers to the arguments based on the position in the parentheses. 
@@ -197,7 +197,7 @@ content: exercise.eval: false exercise.cap: "Testing Positional Arguments" content: | - check_standard(60, 70) + standard_violated(60, 70) - type: code language: r options: @@ -205,7 +205,7 @@ content: exercise.eval: false exercise.cap: "Testing Positional Arguments in Reverse Order" content: | - check_standard(70, 60) + standard_violated(70, 60) - type: section title: if Functions content: @@ -304,7 +304,7 @@ content: print(message) - type: section - title: For loop + title: For loops content: - type: paragraph content: | @@ -359,6 +359,12 @@ content: monitor3 = c(70, 62, 68, 71)) monitors + - type: paragraph + content: | + In the code below, we create an empty vector called max_values. As the + for() function loops through the vector c(1, 2, 3), the data frame columns + are accessed using square brackets [ , i]. Each max value is saved to + the max_values vector using square brackets [i]. - type: code language: r options: @@ -404,6 +410,24 @@ content: monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max + - type: paragraph + content: | + The MARGIN argument is set to 2 because we are applying the max() function + to the columns of the data frame. Also notice that we do not need to create + an initial empty vector, as we did with the for() function. The returned + value is a named vector that is as long as the number of columns in the + data frame. + + We could also find the mean of each row in the `monitors` data frame. + To do this, we would set the `MARGIN` argument to `1`. + - type: code + language: r + options: + eval: false + exercise: true + exercise.cap: "apply Function Example with MARGIN = 1" + content: | + apply(monitors, MARGIN = 1, FUN = mean) exercises: - instructions: "Write a function named `ppm_to_ppb` that converts a value from parts per million to parts per billion." 
hints: diff --git a/source/5-Plotting/lesson5.yaml b/source/5-Plotting/lesson5.yaml index 56ab73d..fd66512 100644 --- a/source/5-Plotting/lesson5.yaml +++ b/source/5-Plotting/lesson5.yaml @@ -110,7 +110,7 @@ content: pch = 16, col = "purple", lwd = 2.5, - xlab="Date", + xlab = "Date", ylab = 'Ozone (ppm)', main = 'Chicago Ozone Data') - type: section @@ -170,8 +170,8 @@ content: are the min and max ozone values within a certain range from the box (specifically, 1.5 times the IQR). If a value falls outside that range, it will be represented by a point (the default point type is a circle). Overall, the plot gives an idea of - where the middle half of the values are, and if there are extreme values beyond - that. + where the middle half of the values are, and if there are extreme values + on either side of the distribution. - type: paragraph content: | The benefit of supplying a data frame to the `data` argument is to break the data @@ -180,7 +180,10 @@ content: an equation like y = x. The way to represent this relationship in R is with the `~` character: `y ~ x`. The `boxplot()` function needs a formula to know which column in the data frame is being plotted, and which column is used to do the - grouping. We can make a plot of ozone by month using the `chicago_air` data frame. + grouping. + + We can make a plot of ozone by month using the `chicago_air` data frame + and the formula `ozone ~ month`. - type: code language: r options: @@ -200,10 +203,10 @@ content: - type: paragraph content: | To use `ggplot2`, we typically follow this sequence of steps: 1. Start with the `ggplot()` function where we specify the dataset and - map variables to aesthetics (visual properties of objects in the + 1. Start with the `ggplot()` function where we specify the dataset, and then we + map variables to "aesthetics" (i.e. visual properties of objects in the plot like shapes or colors). - 2. Add `geoms` – geometric objects like points (`geom_point` for + 2.
Add `geoms` which are geometric objects like points (`geom_point` for scatter plots), bars (`geom_bar` for bar plots), or lines (`geom_line` for line plots) that determine the type of the plot. 3. Finally, customize and refine the plot with additional layers like @@ -221,11 +224,12 @@ content: library(ggplot2) - type: paragraph content: | - Let's recreate the base scatter plot of ozone and temperature from the previous + First we recreate the base scatter plot of ozone and temperature from the previous section, starting with the `ggplot( )` function. The first argument `data` takes - the data frame. The `mapping` argument takes another function named `aes()`, which + the data frame. The `mapping` argument takes a function named `aes()`, which is short for aesthetic. The primary arguments in the `aes( )` function are `x` - and `y`. These determine which column from the data frame is used on the x and y axes. + and `y`. These determine which columns from the data frame are displayed + on the graph. - type: code language: r options: @@ -265,7 +269,7 @@ content: - type: paragraph content: | The `aes()` function can also map aesthetic properties like color based on other - columns in the data frame. We could want each point to have a different color based + columns in the data frame. We might want each point to have a different color based on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. - type: code @@ -283,9 +287,8 @@ content: - type: paragraph content: | The `factor()` function converts the data type of the month column to a `factor` - class. This class represents categorical variables in R. See the - [lesson on regression](../7-Regression-and-Data-Transformation/readme.md) for more - details on factors in R. + class. This class represents categorical variables in R. See the section + on Factors in this lesson for more details. 
- type: paragraph content: | To create a line plot of ozone over time, we use the `as.Date()` function on the date @@ -419,6 +422,40 @@ content: geom_smooth(method=loess) ggsave(filename = "my_plot.png", plot = my_plot) + - type: section + title: Factors + content: + - type: paragraph + content: | + Factors are a data structure used in R for categorizing data into a set of levels, + which is particularly useful in statistical modeling and visualizations that involve + categorical variables. They are important in R because they influence how data is + represented and analyzed in statistical models, ensuring the data is treated as + nominal or ordinal rather than continuous. + + To create a factor in R, you use the `factor()` function. This function takes a + vector of alphanumeric values and converts it into a factor, which stores the data + as integers internally while maintaining labels for these integers. Here is an + example of converting a character vector into a factor: + - type: code + language: r + options: + eval: false + content: | + months <- c("January", "February", "March", "January", "February") + months_factor <- factor(months) + + months_factor + - type: paragraph + content: | + Factors are particularly useful in data analysis for a few reasons: + + - __Statistical Analysis__: Many statistical models require categorical data to + be provided as factors in order to correctly analyze it. + - __Control Order__: Factors can be ordered or unordered, and you can specify the + order of levels to influence data analysis and visual representation. + - __Efficiency__: Factors store data as integers, which can be more memory efficient + than storing strings, especially for large datasets. exercises: - instructions: "Make a scatter plot of barometric pressure and ozone from the `chicago_air` data frame. Use the `plot()` function with ozone on the y-axis, and provide helpful labels to the axes." 
hints: @@ -471,3 +508,32 @@ exercises: ggplot(chicago_air, aes(pressure)) + geom_histogram() + facet_wrap(~month) + - instructions: "__CHALLENGING EXERCISE!__ Use `ggplot2` and the data frame `ertac_egu_projections` from the `region5air` package to make a map of facility locations in the CONUS ERTAC region. " + hints: + - "# Use the `data()` function to load the `ertac_egu_projections` data frame and ?ertac_egu_projections to see the documentation. Which columns would be useful for the x and y coordinates?" + - "# Use the `filter()` function from the `dplyr` package to filter down to the 'CONUS' region and look at the documentation for the `geom_polygon()` function: https://ggplot2.tidyverse.org/reference/geom_polygon.html." + - "# Use the `map_data()` function from the `maps` package to create a base map variable to use as the `data` parameter in the `geom_polygon()` function." + solution: + explanation: | + This exercise involves creating a map of facility locations in the CONUS ERTAC region using `ggplot2` and the `ertac_egu_projections` data frame from the `region5air` package. By filtering the data to the "CONUS" region and using the `geom_polygon()` function with a base map variable, we can visualize the facility locations effectively. This exercise provides a challenging opportunity to practice plotting geospatial data in R. 
+ code: | + library(region5air) + library(ggplot2) + library(dplyr) + library(maps) + + data(ertac_egu_projections) + # filter to CONUS region + conus <- filter(ertac_egu_projections, ertac_region == "CONUS") + states_map <- map_data("state") + + ggplot() + + geom_polygon(data = states_map, aes(x = long, y = lat, group = group), + fill = "white", color = "black") + + geom_point(data = conus, aes(x = longitude, y = latitude), color = "red", + size = 3) + + coord_fixed(1.3) + + labs(title = "Map of the US with Points", x = "Longitude", y = "Latitude") + + theme_minimal() + + From e2897b0188406b329e83e2d5754d34a82570f21c Mon Sep 17 00:00:00 2001 From: NateRByers Date: Tue, 30 Apr 2024 14:56:06 -0400 Subject: [PATCH 3/3] yaml 3 --- docs/1-Introduction/readme.md | 82 +++++++++---- docs/2-Functions-and-Importing-Data/readme.md | 101 +++++++++------- .../readme.md | 110 ++++++++++-------- .../readme.md | 48 ++++---- docs/5-Plotting/readme.md | 50 ++++---- docs/6-Basic-Statistics/readme.md | 65 ++++++----- docs/7-Quality-Assurance/readme.md | 38 ++++-- inst/tutorials/1-Introduction/lesson.Rmd | 82 +++++++++---- .../2-Functions-and-Importing-Data/lesson.Rmd | 101 +++++++++------- .../lesson.Rmd | 110 ++++++++++-------- .../lesson.Rmd | 48 ++++---- inst/tutorials/5-Plotting/lesson.Rmd | 50 ++++---- inst/tutorials/6-Basic-Statistics/lesson.Rmd | 65 ++++++----- inst/tutorials/7-Quality-Assurance/lesson.Rmd | 38 ++++-- source/1-Introduction/lesson1.yaml | 83 +++++++------ .../lesson2.yaml | 50 +++++--- .../lesson3.yaml | 107 ++++++++++------- source/6-Basic-Statistics/lesson6.yaml | 19 +-- source/7-Quality-Assurance/lesson7.yaml | 28 ++++- 19 files changed, 762 insertions(+), 513 deletions(-) diff --git a/docs/1-Introduction/readme.md b/docs/1-Introduction/readme.md index d035b5e..75cc98c 100644 --- a/docs/1-Introduction/readme.md +++ b/docs/1-Introduction/readme.md @@ -29,7 +29,16 @@ This lesson is a part of the Introduction to R for Air Quality Data Science. The ## What is R? 
-R is a free, open-source computing language. It was originally written by statisticians for doing statistical analysis in academia. In recent years it has become more widely used in many industries for performing a variety of data science tasks such as: +R is a free, open-source computing language. It was originally written +by statisticians for doing statistical analysis in academia. In recent +years it has become more widely used in many industries for performing +a variety of data science tasks such as: + +- reading and writing files, +- data transformation, +- graphic visualization, +- geographic mapping, +- and predictive modeling. ## Why Use a Programming Language? @@ -44,6 +53,18 @@ Many data analysis tasks can be accomplished with spreadsheets and other busines BI. When should you move beyond BI tools and use a high-level programming language like R? Below are a few scenarios where a language like R is more advantageous than a BI tool. +- If you cannot access data easily in your BI tool, R can read just about any data source. +- If you need to download, save, or otherwise process a large number of files, R can automate those tasks in a way that BI tools cannot. +- Custom data transformations that are not possible in BI tools can be done with R. +- Custom data visualizations that are not available in BI tools can be done with R. +- Predictive modeling that is not available in BI tools, or only in a rudimentary way, can be done in R. + +BI tools are more advantageous if you need enterprise wide dashboards, +or tools that are more easily accessible to a wider audience. If there +are few occasions where you need custom visualizations or transformations, +or if you do not need automation in your work, you may not need to learn +a programming language. + ## Install R and RStudio @@ -108,7 +129,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. 
-```{r ex-utrcI-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-HdgAY-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -135,12 +156,12 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. -```{r ex-Cvmbu-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-xEdMY-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-IKuQk-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-VGcxo-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` @@ -154,7 +175,7 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). 
-```{r ex-MNLdZ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-RC8UP-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment @@ -164,7 +185,7 @@ In the example above and the previous section, you can see the R code and its ou the R code and `## [1] 2` being the output: -```{r ex-4ennA-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} +```{r ex-rNPTd-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -172,19 +193,25 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-ngfqc-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +```{r ex-FA0xU-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} > 1 + 1 [1] 2 ``` +The R code in code blocks does not show the `>` part of the console, called +the prompt, and the output block places two comment marks `##` before the +output. This is to make it possible to copy and paste the text into your +R console and run it without causing an error. + + ## Variables A variable is a letter or combination of alphanumeric characters that is used to store data. To create a variable in R, use the less-than character with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. -```{r ex-EK419-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-1jvTE-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -199,12 +226,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value.
-```{r ex-8QJDo-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-urj0S-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-GJ5Vc-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-VcnkF-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -217,11 +244,15 @@ In the top right panel you can see that the number stored in the variable `x` ha There are 3 important rules to remember when creating variable names: +1. You can't start your variable name with a number. +2. You can't use spaces or special characters ($,%,#,-). Periods `.` and underscores `_` are ok. +3. Capitalization __DOES__ matter in R. That is, R will consider `y` and `Y` to be different variables. + Try running the following code and you will see that in your global environment there are two different objects listed. -```{r ex-YZ7mK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +```{r ex-DdKbR-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Variable Naming Rules'} y <- 5 Y <- 10 @@ -247,19 +278,24 @@ must be unquoted numbers, and the full-caps logical values `TRUE` and There are several ways to store groups of data to make them easier to work with: +- A __vector__ stores multiple values of the same type (e.g. all numeric values). +- A __list__ stores multiple values of different types (e.g. some numbers and character values). +- A __matrix__ is a table of values with only one data type. +- A __data frame__ is a table of values that can have columns with different data types (e.g. a numeric column and a logical column). + ## Vectors A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. 
-```{r ex-5fniF-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-0xxM6-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-OeOzA-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-1IrUs-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -269,7 +305,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-FU0tV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-JT90l-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -279,7 +315,7 @@ find them and it returns an error. The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-pWuHH-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-P8WcO-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -289,7 +325,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-KYPJd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-7M0x4-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -300,7 +336,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-O0Eld-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-xYfvK-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -308,7 +344,7 @@ x[[2]] Lists can also contain vectors and other lists. 
-```{r ex-LRiBO-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-Hwf7U-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -320,7 +356,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-J8HgS-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-bf5fg-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -331,7 +367,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. -```{r ex-jnPyU-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-TuNdl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -341,10 +377,10 @@ my_data_frame ``` The output above shows a table with the vector variable names as column names, and the vector values below the respective column name. If you try to -create a data frame where the vectors are not all the same length, you will see the error shown below. +create a data frame where the vectors are not all the same length, you will get an error. 
-```{r ex-hJsDp-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-R6bar-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -498,7 +534,7 @@ l <- list(5, 'apple', TRUE) ### Exercise 5 -Create a data frame `df` with two columns, `name` and `age`, each containing three entries of your choosing. +Create a data frame `df` with two columns, `name` and `age`. Use the names 'Alice', 'Bob', and 'Charlie' and give them each an age.
Click for Hint diff --git a/docs/2-Functions-and-Importing-Data/readme.md b/docs/2-Functions-and-Importing-Data/readme.md index 786cfdc..b3e9b70 100644 --- a/docs/2-Functions-and-Importing-Data/readme.md +++ b/docs/2-Functions-and-Importing-Data/readme.md @@ -32,7 +32,11 @@ data from CSV text files and Excel documents. ## Functions -In R, there are two main types of objects: variables and functions. We covered variables in the [introductory lesson](../1-Introduction-to-R/readme.md). A variable is used to create and reference data. The data can be a character, numeric, or logical data type. Variables can reference various "containers" for data, such as a __vector__, __list__, or __data frame__. +In R, there are two main types of objects: variables and functions. We +covered variables in the introductory lesson. A variable is used to create +and reference data. The data can be a character, numeric, or logical data +type. Variables can reference various "containers" for data, such as a +__vector__, __list__, or __data frame__. Functions are similar to variables in that they are short names that reference something saved in R. In this case, a function is not referencing data but a piece of code. A function is saved code that can be used to do some operation on data. @@ -41,7 +45,7 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. 
-```{r ex-ldqeB-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-kJ792-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) @@ -51,22 +55,22 @@ As you would expect, R has many built-in math functions. Below are a few examples. -```{r ex-LJu2c-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-A24xw-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-5gmln-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-EyADt-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-RMHxy-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-w5O1f-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-JAkxX-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-Tp4pk-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -77,7 +81,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. 
-```{r ex-yDYm5-6, eval = FALSE} +```{r ex-i0oon-6, eval = FALSE} ?mean() ``` @@ -90,27 +94,42 @@ In RStudio, you will see the help page for `mean()` in the bottom right corner p On the help page, under `Usage`, you see `mean(x, ...)`. This means that the only thing that necessarily has to go into `( )` is `x`. On the help page under `Arguments` you will find a description of what `x` needs to be: a numeric or logical vector. -Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. +Many built-in functions in R have multiple arguments. This allows you +to give the function some more information to perform the calculation you +want. The example below shows how to use the `digits` argument in the +`round( )` function. Providing different values to the `digits` argument +will return different values. -```{r ex-tHLQ7-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-JejZc-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-ycGBk-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-vhN8x-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-E1DYK-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-9dkMc-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` +In the first example, you can see that we did not provide a value for the +`digits` argument.
That's because there is a default value `digits = 0` (see +the `Usage` section on the help page `?round`). If there is a default value, +then that argument does not need to be specified inside `( )`. If there is no +default value for an argument, then the function will error and tell you that +you forgot to supply a value for the argument. + + ## Useful Built-in Functions -When you start an R session there are many built-in functions that are immediately available for you to use. Other functions are available in community developed packages, as explained in a later section of this lesson. Below is a list of a few commonly used built-in functions in R. +When you start an R session there are many built-in functions that are +immediately available for you to use. Other functions are available in +community developed packages, as explained in a later section of this +lesson. Below is a list of a few commonly used built-in functions in R. ## 1. `sum( )` @@ -118,7 +137,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-4sYL0-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-1A5T4-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -128,7 +147,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-dZn5n-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-0GayS-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -138,7 +157,7 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector. 
-```{r ex-3HUXM-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +```{r ex-EIX17-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} max(c(15, 2, 8.3, -10, 21)) ``` @@ -148,7 +167,7 @@ max(c(15, 2, 8.3, -10, 21)) Create a numeric vector with a certain sequence. The example below creates a vector of integers from 1 to 5. -```{r ex-2OWF6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-NLE4H-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -156,7 +175,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-pLwp3-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-onSea-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -166,7 +185,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-CSTmi-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-K1wix-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -176,7 +195,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-8kS7p-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-ldQ2A-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" 
@@ -192,7 +211,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-6Uwcp-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-laxSn-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -207,14 +226,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-6U2qd-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-a0zYo-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-NJbfk-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-QoTJq-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -231,7 +250,7 @@ Most of the statistical summary functions in R have the argument `na.rm`. This s For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). 
If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. -```{r ex-VgwFX-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-EhBQa-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -240,7 +259,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-YUYny-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-gc6EL-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -248,7 +267,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-Rd5ZH-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-7sKgg-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -270,7 +289,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-Kdkm9-1, error = TRUE} +```{r ex-A4IiO-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -292,12 +311,9 @@ A window will pop up. Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. 
-```{r ex-ZPGae-2, message = FALSE} +```{r ex-EyPJy-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} library(EnvStats) -``` - -```{r ex-SIGlj-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -311,15 +327,22 @@ Remember, when you close down RStudio, then start it up again, you don’t have ## Importing Data -R can import data from just about any format, including CSV, Excel, Databases, GIS shapefiles. This section will demonstrate how to import CSV and Excel files. +R can import data from just about any format, including + +- CSV, +- Excel, +- Databases, and +- GIS shapefiles. + +This section will demonstrate how to import CSV and Excel files. ## CSV -R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. +R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-KBipD-1, eval = FALSE} +```{r ex-pOZWD-1, eval = FALSE} getwd() ``` @@ -327,7 +350,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. 
-```{r ex-fRIyU-2, eval = FALSE} +```{r ex-kn4N2-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -338,7 +361,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-vGMD9-1, eval = FALSE} +```{r ex-tF7II-1, eval = FALSE} install.packages("readxl") ``` @@ -351,10 +374,10 @@ library(readxl) ``` -Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. +Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](https://github.com/LADCO/training-r-intro/blob/main/data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-21K7t-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-vRqYb-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) @@ -499,7 +522,7 @@ sum(1:10) ### Exercise 5 -Read in the first 10 rows of the `chicago_daily.csv` file [here](../data/chicago_daily.csv). +Read in the first 10 rows of the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv).
Click for Hint @@ -521,7 +544,7 @@ First, ensure the `chicago_daily.csv` file is saved in your working directory. T ```r -read.csv("../data/chicago_daily.csv", nrows = 10) +read.csv("chicago_daily.csv", nrows = 10) ```
diff --git a/docs/3-Subsetting-Sorting-and-Combining/readme.md b/docs/3-Subsetting-Sorting-and-Combining/readme.md index 94284a4..a7e53a9 100644 --- a/docs/3-Subsetting-Sorting-and-Combining/readme.md +++ b/docs/3-Subsetting-Sorting-and-Combining/readme.md @@ -22,16 +22,27 @@ This lesson covers how to subset data using indexing, logical operators, and the ## Prerequisites -This lesson assumes you are familiar with the material in the previous lesson on [Functions and Importing Data](../2-Functions-and_Importing-Data/readme.md). +This lesson assumes you are familiar with the material in the previous +lesson on Functions and Importing Data. +The data from the R package region5air is used throughout these lessons. +To install the package from GitHub, use the `remotes` package. Run the code +below to install the `remotes` package and install `region5air` from GitHub. -The example data for exercises in this lesson is available directly from this package. It is assumed that this package is already installed and loaded into your R session. +```{r ex-R4uFT-1, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install remotes and region5air'} +# if you have not installed remotes +install.packages("remotes") + +library(remotes) +install_github("FluentData/region5air") + +``` To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. -```{r ex-SqovC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-Btdwr-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +53,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. 
-```{r ex-2kcO9-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-3LpLn-3, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +66,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. -```{r ex-hv5Qx-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-qg3g3-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,8 +75,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. -```{r ex-3x2rr-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} -data(chicago_air) +```{r ex-GKUzG-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Specify Number of Lines with `head()`'} head(chicago_air, n = 3) ``` @@ -73,8 +83,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. -```{r ex-BjwqU-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} -data(chicago_air) +```{r ex-VaUTI-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} tail(chicago_air) ``` @@ -82,8 +91,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. 
-```{r ex-H1DeE-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} -data(chicago_air) +```{r ex-QLU7a-4, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} View(chicago_air) ``` @@ -93,12 +101,13 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in front of the name of the data frame variable in the console. -```{r ex-uojGa-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-qwdyV-5, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` From the `Description` section of the help page, the `chicago_air` data frame is: + > A dataset containing daily values of ozone, temperature, and solar radiation from a Chicago monitor between January 1, 2021 and December 31, 2021. @@ -109,7 +118,13 @@ If we want to work with a particular subset of a data frame, we need to know how ## Indexing -Values in a data frame can be selected, individually or in a group, based on their index values. These are integers that represent the locations in the data frame. If there is a 2 x 2 table, then there are 2 rows and 2 columns. Each cell can be represented by two numbers, like coordinates on a map. For a data frame, the format is `[row, column]`. Below is a table that shows the index values in each cell. +Values in a data frame can be selected, individually or in a group, +based on their index values. These are integers that represent the +locations in the data frame. If there is a 2 x 2 table, then there +are 2 rows and 2 columns. Each cell can be represented by two numbers, +like coordinates on a map. For a data frame, the format is `[row, column]`.
+Below is a table that shows the index values in each cell. + |Column 1 | Column 2| |--- |--- | | `[1, 1]`| `[1, 2]`| @@ -119,7 +134,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-KwyTj-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-azXUw-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +145,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-IAy8F-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-72hcm-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +153,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. -```{r ex-Xs5Wm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-tkWkI-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +161,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. 
-```{r ex-y7DY6-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-N4yR1-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +170,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-cp1Uc-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-yndUU-5, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -166,7 +181,7 @@ Now that we understand indexing we can subset the `chicago_air` data frame by us To get one row of the data frame, specify the row number you would like in the brackets, on the left side of the comma. If you leave the column value on the right side of the comma blank, it returns all the columns associated with row number 1. -```{r ex-yCmpW-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +```{r ex-evuMa-6, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -175,8 +190,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector accesses the 1st, 2nd, and 5th rows of the data frame. -```{r ex-cOzSF-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} -data(chicago_air) +```{r ex-Jy7Sa-7, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} chicago_air[c(1, 2, 5), ] ``` @@ -184,8 +198,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. By leaving the row value blank, you are telling it to return all rows associated with column 1.
Below, we wrap the output in the `head()` function to limit the number of rows printed. -```{r ex-IrR7i-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} -data(chicago_air) +```{r ex-5V01c-8, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Single Column'} head(chicago_air[, 1]) ``` @@ -196,8 +209,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-loxf0-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} -data(chicago_air) +```{r ex-BjIS5-9, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Multiple Columns'} head(chicago_air[, c(3, 4, 6)]) ``` @@ -208,8 +220,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-k12Vu-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} -data(chicago_air) +```{r ex-IDXfV-10, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Column by Name'} head(chicago_air[, "solar"]) ``` @@ -217,8 +228,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. -```{r ex-0TavJ-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} -data(chicago_air) +```{r ex-yp1dl-11, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Subset to Specific Columns by Name'} head(chicago_air[, c("ozone", "temp", "month")]) ``` @@ -226,7 +236,7 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. 
-```{r ex-DACxr-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-DYMDv-12, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` @@ -236,7 +246,7 @@ chicago_air[1:5, c("temp", "solar")] In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we could use `chicago_air$temp`. -```{r ex-cvUm8-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-Sfc9U-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -270,12 +280,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. -```{r ex-TDHV9-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-EmTqn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-55zuN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-eNXIj-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -286,7 +296,7 @@ Vectors can also be used in a logical expression. A vector of values on the left Here, we check if any of the integers in the vector on the left are above 60. A logical vector is returned. 
-```{r ex-hSELm-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +```{r ex-4CKRF-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -297,7 +307,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-akTxt-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-dgAiR-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -308,7 +318,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-6PDgP-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-T1HkS-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -319,7 +329,7 @@ A data frame is returned. The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "colors" column as a vector using the `$` operator. Then we create a logical vector using a logical expression. -```{r ex-Z6Ft7-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-8EdDn-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -333,7 +343,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow.
-```{r ex-cAyf6-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-UBtS1-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -341,7 +351,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-XqUl2-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-I23rU-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -357,7 +367,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. -```{r ex-S2pIk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-AB3eq-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -373,7 +383,7 @@ The benefit of using `filter()` is that it works the way other functions in R ty If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-ZJrGf-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-yjkr3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -384,7 +394,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. 
-```{r ex-02QFi-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-eUZrT-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -395,7 +405,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma. -```{r ex-pQZj5-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-9Iw1o-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -411,7 +421,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. The default is ascending order. -```{r ex-kqnGo-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-JYNWL-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -425,7 +435,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-Eqs1R-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-HM2N6-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -436,7 +446,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. 
-```{r ex-MlGEh-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-BxKWa-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -452,7 +462,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. -```{r ex-P0z5V-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-0nUwV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -464,7 +474,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-zZOmG-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-BhPaz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -480,7 +490,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. 
-```{r ex-vLzB7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-UkNk8-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -488,7 +498,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. -```{r ex-vyQkw-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-QPOc2-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) diff --git a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md index b09a6ea..e4f630c 100644 --- a/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md +++ b/docs/4-Writing-Functions-Conditionals-and-Loops/readme.md @@ -31,7 +31,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". -```{r ex-aTV9t-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-8TTHh-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -48,7 +48,7 @@ the thing that's saved is not a data object but lines of R code. 
To save your own function, use this construction: -```{r ex-PQd95-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-neBnK-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -61,7 +61,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. -```{r ex-LwvrO-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-mQoqe-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -70,7 +70,7 @@ print_hello <- function() { ``` -```{r ex-Hfydw-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-7BLAg-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -86,7 +86,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. -```{r ex-FVuEL-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-BgSKl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -97,7 +97,7 @@ print_hello <- function(text) { ``` -```{r ex-QOUS5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-Q6vpX-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -110,7 +110,7 @@ greater than a criteria pollutant standard. We could make a simple function that takes two arguments: one for the measurement value, and one for the standard value. 
-```{r ex-orqEk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +```{r ex-ALDHL-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} standard_violated <- function(measurement, standard) { measurement > standard @@ -119,7 +119,7 @@ standard_violated <- function(measurement, standard) { ``` -```{r ex-787G5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} +```{r ex-Omj4G-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} standard_violated(measurement = 84, standard = 70) ``` @@ -133,7 +133,7 @@ the flexibility to use a different value. To set a default value, we use `= 70` when we create the function. -```{r ex-OSEAX-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +```{r ex-Aaca0-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} standard_violated <- function(measurement, standard = 70) { measurement > standard @@ -142,7 +142,7 @@ standard_violated <- function(measurement, standard = 70) { ``` -```{r ex-OcZtl-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} +```{r ex-iTVv7-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} standard_violated(measurement = 50) ``` @@ -163,12 +163,12 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-pBarX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +```{r ex-pXUkw-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} standard_violated(60, 70) ``` -```{r ex-fkuGs-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +```{r ex-LmYHt-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} standard_violated(70, 60) ``` @@ -185,7 +185,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-t3z5X-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-e78TJ-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -194,7 +194,7 @@ if(logical expression>) { ``` -```{r ex-fOsIR-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-kqQHu-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -207,7 +207,7 @@ if() { ``` -```{r ex-jEvVd-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-QmpyS-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -222,7 +222,7 @@ if(ozone > 0.065) { ``` -```{r ex-KMFJr-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-4cokn-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -237,12 +237,12 @@ if(ozone > 0.065) { ``` -```{r ex-9E7jX-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-oVA6y-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-mzTAI-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} 
+```{r ex-cxiJ0-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -261,7 +261,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-rMRtF-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-dyBCK-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -270,7 +270,7 @@ for(i in sequence){ ``` -```{r ex-9uJFw-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-B3TG2-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -288,7 +288,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-8jAA9-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-ZU3OM-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -303,7 +303,7 @@ are accessed using square brackets [ , i]. Each max value is saved to the max_values vector using square brackets [i]. -```{r ex-ZQtez-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +```{r ex-o8Fn1-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -334,7 +334,7 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. 
-```{r ex-AZOyl-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-HKdx1-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max @@ -351,7 +351,7 @@ We could also find the mean of each row in the `monitors` data frame. To do this, we would set the `MARGIN` argument to `1`. -```{r ex-uDkYC-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} +```{r ex-7f5j2-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} apply(monitors, MARGIN = 1, FUN = mean) ``` diff --git a/docs/5-Plotting/readme.md b/docs/5-Plotting/readme.md index b3e806b..6c8ee1a 100644 --- a/docs/5-Plotting/readme.md +++ b/docs/5-Plotting/readme.md @@ -30,7 +30,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-x8DCE-1, eval = FALSE} +```{r ex-OMjta-1, eval = FALSE} install.packages("ggplot2") ``` @@ -38,7 +38,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. -```{r ex-AHP6c-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-qZNKf-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -50,7 +50,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. -```{r ex-HSzdk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-OxuJt-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -63,7 +63,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. 
-```{r ex-O0gbX-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-1BXg1-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -72,7 +72,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. -```{r ex-HWUJ5-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-HHuTD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -80,7 +80,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-zhdKD-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-bMaPI-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -90,7 +90,7 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. -```{r ex-7bj5y-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-1nZPP-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, @@ -109,7 +109,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. -```{r ex-vOsLC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-Za4ke-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -119,7 +119,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. 
-```{r ex-JH4UK-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-21yuS-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -136,7 +136,7 @@ argument. If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-TzseJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-xST3J-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -162,7 +162,7 @@ We can make a plot of ozone by month using the `chicag_air` data frame and the formula `ozone ~ month`. -```{r ex-NI9lL-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} +```{r ex-rirN8-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -188,7 +188,7 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. -```{r ex-XUxiq-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-tPuy3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` @@ -201,7 +201,7 @@ and `y`. These determine which columns from the data frame are displayed on the graph. -```{r ex-oTzAU-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-zknS5-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -212,14 +212,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. 
-```{r ex-JIEo6-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-9f8BI-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. Customize it by adding color, title, and labels. -```{r ex-m0aaG-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-6Fz5m-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -234,7 +234,7 @@ on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-09Sbl-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-MiND3-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -252,7 +252,7 @@ To create a line plot of ozone over time, we use the `as.Date()` function on the column and replace the `geom_point( )` function with the `geom_line( )` function. -```{r ex-ot69d-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-bb3P0-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -262,7 +262,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. 
-```{r ex-HQ6Fj-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-sRwjj-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -271,7 +271,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. -```{r ex-aO0gx-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-JhX2Z-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -280,7 +280,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-v9v5U-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-7ZFZW-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -296,7 +296,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-EUZmI-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-wElOy-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -312,7 +312,7 @@ represented as a shaded area. Below, the argument `method` is given the value `lm` which stands for a linear model. 
-```{r ex-pCCm7-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-0AHiX-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -324,7 +324,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. -```{r ex-w6Hdx-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-Nt4cm-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -342,7 +342,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-SZeDt-1, eval = FALSE} +```{r ex-nbmgf-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -365,7 +365,7 @@ as integers internally while maintaining labels for these integers. Here is an example of converting a character vector into a factor: -```{r ex-DfS6P-1, eval = FALSE} +```{r ex-P95zZ-1, eval = FALSE} months <- c("January", "February", "March", "January", "February") months_factor <- factor(months) diff --git a/docs/6-Basic-Statistics/readme.md b/docs/6-Basic-Statistics/readme.md index cd5ea8b..06a5ff0 100644 --- a/docs/6-Basic-Statistics/readme.md +++ b/docs/6-Basic-Statistics/readme.md @@ -23,10 +23,10 @@ R was originally developed as a statistical programming language and its built-i This lesson assumes you are familiar with the material in the lesson on [Functions and Importing Data](../2-Functions-and-Importing-Data/readme.md). -Statistical functions are used in this lesson that require installation of the following packages. 
+Statistical functions are used in this lesson that require installation of the `EnvStats` package. -```{r ex-51h4u-1, eval = FALSE} +```{r ex-pkO4T-1, eval = FALSE} install.packages("envstats") ``` @@ -40,7 +40,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. -```{r ex-vVzfW-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-62gXq-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -49,21 +49,21 @@ Most of the functions we'll be using have an argument named `na.rm` that stands for `NA` remove. If the argument is set to `TRUE` then the function will remove all missing values from the data set. Otherwise, the function will error. -These functions let us know the range of the data values, i.e., the highest and +These functions tell us the range of the data values, i.e., the highest and lowest values. -```{r ex-VQdWQ-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-FGRCe-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-MsGTt-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-C6d4G-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-yK2wX-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-FTKgo-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -71,27 +71,27 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. 
-```{r ex-6miWt-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-0pca4-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` The `IQR()` function gives us the interquartile range, which lets us know how large -the spread is for the values in the central range of the distribution, i.e., between -the 1st quartile and the 3rd quartile. +the spread is for the values in the central range of the distribution, i.e. between +the 25th percentile and the 75th percentile. -```{r ex-1vu7t-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-iqubR-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` We can use the `boxplot()` function to visualize the interquartile range. The outline -of the box itself shows the first and third quartile, while the line in the middle +of the box itself shows the middle 50% of the data, while the line in the middle of the box shows the median. -```{r ex-9l0RF-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-pNj09-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -101,12 +101,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. -```{r ex-068Qu-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-kxHsF-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-mT7sT-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-pE9gV-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -115,12 +115,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. 
-```{r ex-ptbDz-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-ASLr1-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-SSlLM-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-tpyjT-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -133,7 +133,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-lBOr0-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-dvrmp-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -143,7 +143,7 @@ difference in concentrations. Below is a plot of those two groups side by side. -```{r ex-SfQkn-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-NhRQu-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -154,7 +154,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. -```{r ex-QohKM-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-gRI4C-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -168,14 +168,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. 
-```{r ex-22UpB-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-hJHRI-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-4jnjG-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-nTI4H-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -186,13 +186,13 @@ The p-values for the tests are well above 0.05, so we assume the null hypothesis is true. Meaning, we can assume the distributions of values in the two groups are normal. -Now we can do some comparisons between these 2 groups of readings using the -Student's t-test. The test is meant to determine if the two means from the two +Now we can do some comparisons between these 2 months of measurements +using the Student's t-test. The test is meant to determine if the two means from the two datasets are from the same distribution or not. The assumption, or null hypothesis, is that they are, in fact, mean values from the same distribution. -```{r ex-UOqOH-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-LwQE5-6, exercise = TRUE, exercise.cap = "Student's t-test between two groups"} t.test(group1_data$value, group2_data$value) ``` @@ -217,7 +217,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. -```{r ex-etfDX-1, eval = FALSE} +```{r ex-P56P3-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -236,7 +236,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations. 
-```{r ex-uEkBn-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-Rpyub-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -253,7 +253,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-Ixup6-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-urG6l-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -267,7 +267,7 @@ do not reject the null hypothesis. We conclude there is no correlation between these two variables. -```{r ex-vhUiL-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-87sBr-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -278,11 +278,16 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. -```{r ex-9Vs8T-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-1t9vT-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` +You can see from the lower panel plots the increasing slope of the line +for ozone and temp; a decreasing slope for temp and pressure; and a flat +line for ozone and pressure. 
+ + ## Exercises diff --git a/docs/7-Quality-Assurance/readme.md b/docs/7-Quality-Assurance/readme.md index 1c083f4..9f62510 100644 --- a/docs/7-Quality-Assurance/readme.md +++ b/docs/7-Quality-Assurance/readme.md @@ -21,7 +21,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-t8RUE-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-D9BqC-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -34,7 +34,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. -```{r ex-pHE7v-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-ymVyW-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -42,7 +42,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. 
-```{r ex-KJOyG-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-9bKiT-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -56,7 +56,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. -```{r ex-nDuQ0-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-dNzFU-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -72,7 +72,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-8da09-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-Oq8fw-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -83,13 +83,27 @@ values ## Outliers -Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. +Handling outliers is difficult because we do not necessarily want +to remove data that may be uncommon but within the realm of possibility. +The best way to detect extreme values is to look at the summary of +your data and pay attention to min and max values. You can plot the +data to see if you can detect anything weird through visual inspection. +Boxplots with outliers plotted as points are handy for this. Below +is a boxplot of the ozone column in the `chicago_air` data frame. 
-```{r ex-h7kSA-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} -# Example of identifying and handling outliers -data("example_dataset") -boxplot(example_dataset$value) +```{r ex-xH78T-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +boxplot(chicago_air$ozone) + +``` + +We can see that two values are printed as points on the high end of the distribution. +We can use the `boxplot.stats()` function to get the values used in the `boxplot()` +function. The `out` values are the outliers. + + +```{r ex-3nU4U-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Return Outlier Values from Boxplot'} +boxplot.stats(chicago_air$ozone)$out ``` @@ -98,7 +112,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. -```{r ex-Ufpjn-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-qXxiL-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") @@ -192,7 +206,7 @@ monitors ### Exercise 3 -Use the boxplot to check for outliers in the ozone column of the built-in `airquality` data frame. +Use a boxplot to check for outliers in the ozone column of the built-in `airquality` data frame.
Click for Hint diff --git a/inst/tutorials/1-Introduction/lesson.Rmd b/inst/tutorials/1-Introduction/lesson.Rmd index ae7e245..1f2a0db 100644 --- a/inst/tutorials/1-Introduction/lesson.Rmd +++ b/inst/tutorials/1-Introduction/lesson.Rmd @@ -25,7 +25,16 @@ This lesson is a part of the Introduction to R for Air Quality Data Science. The ## What is R? -R is a free, open-source computing language. It was originally written by statisticians for doing statistical analysis in academia. In recent years it has become more widely used in many industries for performing a variety of data science tasks such as: +R is a free, open-source computing language. It was originally written +by statisticians for doing statistical analysis in academia. In recent +years it has become more widely used in many industries for performing +a variety of data science tasks such as: + +- reading and writing files, +- data transformation, +- graphic visualization, +- geographic mapping, +- and predictive modeling. ## Why Use a Programming Language? @@ -40,6 +49,18 @@ Many data analysis tasks can be accomplished with spreadsheets and other busines BI. When should you move beyond BI tools and use a high-level programming language like R? Below are a few scenarios where a language like R is more advantageous than a BI tool. +- If you cannot access data easily in your BI tool, R can read just about any data source. +- If you need to download, save, or otherwise process a large number of files, R can automate those tasks in a way that BI tools cannot. +- Custom data transformations that are not possible in BI tools can be done with R. +- Custom data visualizations that are not available in BI tools can be done with R. +- Predictive modeling that is not available in BI tools, or only in a rudimentary way, can be done in R. + +BI tools are more advantageous if you need enterprise wide dashboards, +or tools that are more easily accessible to a wider audience. 
If there +are few occasions where you need custom visualizations or transformations, +or if you do not need automation in your work, you may not need to learn +a programming language. + ## Basic Math @@ -47,7 +68,7 @@ Open up a script if you haven't already (“File” -> “New File” -> “R Sc the lines into your script. -```{r ex-bPZt0-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} +```{r ex-7nqo2-1, exercise = FALSE, eval = TRUE, exercise.cap = 'Practice Basic Math Operations'} 10 + 5 10 - 5 10 * 5 @@ -74,12 +95,12 @@ R follows the usual order of arithmetical operations and uses parentheses for gr see the different values that are returned. -```{r ex-ykH12-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} +```{r ex-BRp7h-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Understanding Order of Operations'} 10 - 3 / 5 ``` -```{r ex-qeOCv-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} +```{r ex-NM5I0-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Practicing Grouping with Parentheses'} (10 - 3) / 5 ``` @@ -93,7 +114,7 @@ no multi-line commenting in R, so every comment line must begin with the `#` cha Run all of the code below and see what gets returned in the R console (bottom left panel in RStudio). 
-```{r ex-6edAy-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Experiment with Comments'} +```{r ex-LcAdb-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Experiment with Comments'} # Full line comment 5^2 # partial line comment @@ -103,7 +124,7 @@ In the example above and the previous section, you can see the R code and its ou the R code and `## [1] 2` being the output: -```{r ex-WwgSA-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} +```{r ex-uuwEP-2, exercise = FALSE, eval = TRUE, exercise.cap = 'Code and Output Example'} 1+1 ``` @@ -111,19 +132,25 @@ the R code and `## [1] 2` being the output: However, in the R console the code and output would look like this: -```{r ex-q9e11-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} +```{r ex-cltSU-3, exercise = FALSE, eval = FALSE, echo = TRUE, exercise.cap = 'Console Code Execution'} > 1 + 1 [1] 2 ``` +The R code in code blocks does not show the `>` part of the console, called +the prompt, and the output block places two comment marks `##` before the +output. This is to make it possible to copy and paste the text into your +R console and run it without causing an error. + + ## Variables A variable is a letter or combination of alphanumeric characters that is used to store data. To create a variable in R, use the less-than character with the dash to create an arrow symbol pointing left `<-`. Below, the variables `x` and `y` are created by assigning some numbers to them. -```{r ex-mMpFn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Variables'} +```{r ex-94DBv-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Variables'} x <- 10 y <- 5 x + y @@ -138,12 +165,12 @@ In RStudio, you will see the variables we created in the top right panel. If you've already created a variable, you can replace the value with another value. 
-```{r ex-4Aher-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Exploring Variable Assignment'} +```{r ex-GuvHA-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Exploring Variable Assignment'} x ``` -```{r ex-6FMuV-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} +```{r ex-7Ltup-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Updating Variable Values'} x <- 20 x @@ -156,11 +183,15 @@ In the top right panel you can see that the number stored in the variable `x` ha There are 3 important rules to remember when creating variable names: +1. You can't start your variable name with a number. +2. You can't use spaces or special characters ($,%,#,-). Periods `.` and underscores `_` are ok. +3. Capitalization __DOES__ matter in R. That is, R will consider `y` and `Y` to be different variables. + Try running the following code and you will see that in your global environment there are two different objects listed. -```{r ex-rbQdo-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Variable Naming Rules'} +```{r ex-aAUUH-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Variable Naming Rules'} y <- 5 Y <- 10 @@ -186,19 +217,24 @@ must be unquoted numbers, and the full-caps logical values `TRUE` and There are several ways to store groups of data to make them easier to work with: +- A __vector__ stores multiple values of the same type (e.g. all numeric values). +- A __list__ stores multiple values of different types (e.g. some numbers and character values). +- A __matrix__ is a table of values with only one data type. +- A __data frame__ is a table of values that can have columns with different data types (e.g. a numeric column and a logical column). + ### Vectors A vector variable can contain only one type of data (numeric, character, or logical). We use `c()` to create vectors. 
-```{r ex-ASwYE-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Numeric Vectors'} +```{r ex-19FJx-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Numeric Vectors'} x <- c(1, 2, 3, 4, 5) x ``` -```{r ex-qXb3N-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Character Vectors'} +```{r ex-PBtWI-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Character Vectors'} fruit <- c("apples", "bananas", "oranges") fruit @@ -208,7 +244,7 @@ If you try to type in text without using quotations marks for character values ( running the code below. -```{r ex-2si1E-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} +```{r ex-7nZoG-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Error Without Quotes'} fruit <- c(apples, bananas, oranges) ``` @@ -218,7 +254,7 @@ find them and it returns an error. The members of a vector can be accessed by us `fruit` vector, you can use the single bracket with the number 3: -```{r ex-icerQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing Vector Elements'} +```{r ex-5Fhv4-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Accessing Vector Elements'} fruit[3] ``` @@ -228,7 +264,7 @@ fruit[3] Lists are like vectors but can contain any mix of data types. We use `list()` to create a list variable. -```{r ex-X8z2D-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Lists'} +```{r ex-6n4iC-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Creating Lists'} x <- list("Benzene", 1.3, TRUE) x @@ -239,7 +275,7 @@ is the second value in the list, so it is shown below the double bracket `[[2]]` list. -```{r ex-EX1dw-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Accessing List Elements'} +```{r ex-RpG4A-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Accessing List Elements'} x[[2]] ``` @@ -247,7 +283,7 @@ x[[2]] Lists can also contain vectors and other lists. 
-```{r ex-UJ7Pe-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Lists Containing Vectors and Lists'} +```{r ex-LzgX9-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Lists Containing Vectors and Lists'} my_vector <- c(1, 2, 3) my_list <- list("Benzene", 1.3, TRUE) y <- list(TRUE, my_vector, my_list) @@ -259,7 +295,7 @@ In this example, you can use two double brackets to access the value `1.3` by se `my_list`: -```{r ex-zi0BA-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nested List Access'} +```{r ex-X2V86-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Nested List Access'} y[[3]][[2]] ``` @@ -270,7 +306,7 @@ Data frames are data tables in R. We use `data.frame()` to create a data frame o vectors of the same length and use them to create a data frame. -```{r ex-hhNQ3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} +```{r ex-7l5rU-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Creating Data Frames'} pollutant <- c("Benzene", "Toluene", "Xylenes") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -280,10 +316,10 @@ my_data_frame ``` The output above shows a table with the vector variable names as column names, and the vector values below the respective column name. If you try to -create a data frame where the vectors are not all the same length, you will see the error shown below. +create a data frame where the vectors are not all the same length, you will get an error. -```{r ex-GHIDP-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} +```{r ex-5KjLa-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Data Frame Dimension Mismatch Error'} pollutant <- c("Benzene", "Toluene") concentration <- c(1.3, 5.5, 6.0) carcinogen <- c(TRUE, FALSE, FALSE) @@ -416,7 +452,7 @@ grade_this_code( ### Exercise 5 -Create a data frame `df` with two columns, `name` and `age`, each containing three entries of your choosing. 
+Create a data frame `df` with two columns, `name` and `age`. Use the names 'Alice', 'Bob', and 'Charlie' and give them each an age. ```{r exercise5, exercise = TRUE} # Your code here diff --git a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd index d29b445..e85d650 100644 --- a/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd +++ b/inst/tutorials/2-Functions-and-Importing-Data/lesson.Rmd @@ -24,7 +24,11 @@ data from CSV text files and Excel documents. ## Functions -In R, there are two main types of objects: variables and functions. We covered variables in the [introductory lesson](../1-Introduction-to-R/readme.md). A variable is used to create and reference data. The data can be a character, numeric, or logical data type. Variables can reference various "containers" for data, such as a __vector__, __list__, or __data frame__. +In R, there are two main types of objects: variables and functions. We +covered variables in the introductory lesson. A variable is used to create +and reference data. The data can be a character, numeric, or logical data +type. Variables can reference various "containers" for data, such as a +__vector__, __list__, or __data frame__. Functions are similar to variables in that they are short names that reference something saved in R. In this case, a function is not referencing data but a piece of code. A function is saved code that can be used to do some operation on data. @@ -33,7 +37,7 @@ Functions are similar to variables in that they are short names that reference s R has many built-in functions that perform common tasks. When you open RStudio you can immediately use a function called `mean( )`. Here is an example of using the `mean( )` function to find the average of a vector of integers. We first save a vector of integers in the `x` variable then put the variable inside the parentheses of the function. 
-```{r ex-KGbwx-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} +```{r ex-MgpQd-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean'} x <- c(4, 8, 1, 14, 34) mean(x) @@ -43,22 +47,22 @@ As you would expect, R has many built-in math functions. Below are a few examples. -```{r ex-N3tac-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} +```{r ex-z3aum-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Natural Logarithm'} log(27) #Natural logarithm ``` -```{r ex-ul39I-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} +```{r ex-QLbk9-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Base 10 Logarithm'} log10(100) #base 10 logarithm ``` -```{r ex-HWVfX-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} +```{r ex-TQmUm-4, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Square Root'} sqrt(225) # Square root ``` -```{r ex-8Inet-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} +```{r ex-yM1XR-5, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Absolute Value'} abs(-5) #Absolute value ``` @@ -69,7 +73,7 @@ All of the examples show that the general form is `function_name( )`. The name o Many functions also have additional options you can choose, which are called the _arguments_. To see what needs to go inside `( )`, type a question mark in front of the function and run it in the R console. 
-```{r ex-Gw7lg-6, eval = FALSE} +```{r ex-AA5Ei-6, eval = FALSE} ?mean() ``` @@ -82,27 +86,42 @@ In RStudio, you will see the help page for `mean()` in the bottom right corner p On the help page, under `Usage`, you see `mean(x, ...)`. This means that the only thing that necessarily has to go into `( )` is `x`. On the help page under `Arguments` you will find a description of what `x` needs to be: a numeric or logical vector. -Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. +Many built-in functions in R have multiple arguments. This allows you +to give the function some more information to perform the calculation you +want. The example below shows how to use the `digits` argument in the +`round( )` function. Providing different values to the `digits` argument +will return different values. -```{r ex-z51CK-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} +```{r ex-OgB3j-7, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Zero Digits'} round(12.3456) ``` -```{r ex-Lt2TR-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} +```{r ex-luBiP-8, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to Three Digits'} round(12.3456, digits=3) ``` -```{r ex-VaHN1-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} +```{r ex-aWrJz-9, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Round to One Digit'} round(12.3456, digits=1) ``` +In the first example, you can see that we did not provide a value for the +`digits` argument.
That's because there is a default value `digits = 0` (see +the `Usage` section on the help page `?round`). If there is a default value, +then that argument does not need to be specified inside `( )`. If there is no +default value for an argument, then the function will error and tell you that +you forgot to supply a value for the argument. + + ## Useful Built-in Functions -When you start an R session there are many built-in functions that are immediately available for you to use. Other functions are available in community developed packages, as explained in a later section of this lesson. Below is a list of a few commonly used built-in functions in R. +When you start an R session there are many built-in functions that are +immediately available for you to use. Other functions are available in +community developed packages, as explained in a later section of this +lesson. Below is a list of a few commonly used built-in functions in R. ### 1. `sum( )` @@ -110,7 +129,7 @@ When you start an R session there are many built-in functions that are immediate Returns the sum of a vector of numeric values. -```{r ex-pv936-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} +```{r ex-VKZXw-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Sum of Values'} sum(c(2.3, 7.5, 9, -10)) ``` @@ -120,7 +139,7 @@ sum(c(2.3, 7.5, 9, -10)) Get the minimum value from a numeric vector. -```{r ex-eaXyL-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} +```{r ex-f46Sf-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Minimum Value'} min(c(6, 9, 3, 11, -2)) ``` @@ -130,7 +149,7 @@ min(c(6, 9, 3, 11, -2)) Get the maximum value from a numeric vector. 
-```{r ex-NH2bE-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} +```{r ex-jM6jJ-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Maximum Value'} max(c(15, 2, 8.3, -10, 21)) ``` @@ -140,7 +159,7 @@ max(c(15, 2, 8.3, -10, 21)) Create a numeric vector with a certain sequence. The example below creates a vector of integers from 1 to 5. -```{r ex-RM9C8-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} +```{r ex-7IsKK-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with `seq()`'} seq(from = 1, to = 5, by = 1) ``` @@ -148,7 +167,7 @@ seq(from = 1, to = 5, by = 1) Another way to create a sequence of integers is to use the colon. -```{r ex-eLFnq-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} +```{r ex-IGHBw-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Create Sequence with Colon Operator'} 1:5 ``` @@ -158,7 +177,7 @@ Another way to create a sequence of integers is to use the colon. Concatenate two or more strings. -```{r ex-C27oO-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} +```{r ex-KghO3-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings'} x <- "Hello" y <- "world!" paste(x, y, sep = " ") @@ -168,7 +187,7 @@ paste(x, y, sep = " ") Any numbers will be converted to strings. -```{r ex-wTyKS-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} +```{r ex-czUgQ-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Concatenate Strings with Numbers'} x <- "You're number " y <- 1 z <- "!" 
@@ -184,7 +203,7 @@ The `substr()` function allows you to pull out a section from a string based on For example, in AQS data a monitor ID may be written in the following format: [State code - County code - Site number - Parameter code - POC]. If we only wanted to pull out the site number for this monitor ID we could do the following: -```{r ex-8vFtN-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} +```{r ex-v8WVk-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Extract Site Number'} wisconsin_monitor <- c('55-021-0015-44201-2') # Ozone monitor in Columbia County, WI site_id <- substr(wisconsin_monitor, start = 8, stop = 11) # start and stop position within the character string. site_id @@ -199,14 +218,14 @@ R allows you to place a function inside another function to perform multiple tas For instance, if you want to create a sequence of numbers and then take the mean of that sequence, you could either do it in a couple of steps, or all at once. -```{r ex-Pf9AU-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} +```{r ex-MlB9S-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in Two Steps'} #Two steps x <- seq(from=1, to=10, by=3) mean(x) ``` -```{r ex-1K8TV-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} +```{r ex-0UTw3-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Calculate Mean in One Step'} #One step mean(seq(from=1, to=10, by=3)) @@ -223,7 +242,7 @@ Most of the statistical summary functions in R have the argument `na.rm`. This s For example, there is a built-in data frame in R called `airquality` with daily measurements from a monitor in New York from 1973 (see `?airquality`). 
If we load the data frame using the `data()` function and take a look at the top 6 rows using the `head()` function, we can see some missing values represented as `NA`. -```{r ex-fmSM6-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} +```{r ex-JuIt0-1, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Inspect `airquality` Data'} data("airquality") head(airquality) @@ -232,7 +251,7 @@ head(airquality) The `mean()` function, for example, has the argument `na.rm` set to `FALSE`. This means that the `NA` values will not be removed from the vector for which it is calculating the mean. As a result, it will return an `NA` because it cannot properly calculate the average. Here we use the `Ozone` column from the `airquality` data frame. -```{r ex-SjMht-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} +```{r ex-DLSUa-2, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Values'} mean(airquality$Ozone) ``` @@ -240,7 +259,7 @@ mean(airquality$Ozone) To get the mean value, we set `na.rm = TRUE`. -```{r ex-LVWAV-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} +```{r ex-B6W22-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Mean with NA Removal'} mean(airquality$Ozone, na.rm = TRUE) ``` @@ -262,7 +281,7 @@ For example, if you wanted to find serial correlation in an environmental data s First, you might try to use the function. -```{r ex-xBFkR-1, error = TRUE} +```{r ex-Y9Srz-1, error = TRUE} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -284,12 +303,9 @@ A window will pop up. Start typing "EnvStats" into the "Packages" box, select th Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. 
-```{r ex-ga6IS-2, message = FALSE} +```{r ex-D6pPJ-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} library(EnvStats) -``` - -```{r ex-Ps96A-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `serialCorrelationTest` from EnvStats'} x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) @@ -303,15 +319,22 @@ Remember, when you close down RStudio, then start it up again, you don’t have ## Importing Data -R can import data from just about any format, including CSV, Excel, Databases, GIS shapefiles. This section will demonstrate how to import CSV and Excel files. +R can import data from just about any format, including + +- CSV, +- Excel, +- Databases, and +- GIS shapefiles. + +This section will demonstrate how to import CSV and Excel files. ### CSV -R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. +R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. -```{r ex-ftJJf-1, eval = FALSE} +```{r ex-ISupI-1, eval = FALSE} getwd() ``` @@ -319,7 +342,7 @@ getwd() Use `read.csv()` by providing the location and name of the file as the first argument. If the file is in your working directory, simply supply the name of the file. Below, the data from the file is read into R and saved as a data frame, which is the data type for storing tables. The function `head()` will show the first few lines. 
-```{r ex-coc7X-2, eval = FALSE} +```{r ex-Me1vB-2, eval = FALSE} chicago_daily <- read.csv("chicago_daily.csv") head(chicago_daily) @@ -330,7 +353,7 @@ head(chicago_daily) There are several packages that can be used to import data from an Excel file, such as `xlsx`, `XLConnect`, and `readxl`. In this example, we'll use the `readxl` package. If you do not have the package installed, you can use RStudio to install as described in the section above on packages. You can also use the function `install.packages( )`. -```{r ex-GltGn-1, eval = FALSE} +```{r ex-HR2Qm-1, eval = FALSE} install.packages("readxl") ``` @@ -343,10 +366,10 @@ library(readxl) ``` -Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. +Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](https://github.com/LADCO/training-r-intro/blob/main/data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. -```{r ex-Zus1W-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} +```{r ex-ZTndA-3, exercise = TRUE, exercise.eval = FALSE, exercise.lines = 5, exercise.cap = 'Read and Inspect Excel Data'} library(readxl) emissions <- read_excel("./data/emissions_IL_2022.xlsx", sheet = "UNIT_DATA", skip = 6) head(emissions) @@ -470,7 +493,7 @@ grade_this_code( ### Exercise 5 -Read in the first 10 rows of the `chicago_daily.csv` file [here](../data/chicago_daily.csv). +Read in the first 10 rows of the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv). 
```{r exercise5, exercise = TRUE} # Your code here @@ -485,7 +508,7 @@ Read in the first 10 rows of the `chicago_daily.csv` file [here](../data/chicago ``` ```{r exercise5-solution} -read.csv("../data/chicago_daily.csv", nrows = 10) +read.csv("chicago_daily.csv", nrows = 10) ``` ```{r exercise5-check} diff --git a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd index d7ca5b4..fe8fa62 100644 --- a/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd +++ b/inst/tutorials/3-Subsetting-Sorting-and-Combining/lesson.Rmd @@ -22,16 +22,27 @@ This lesson covers how to subset data using indexing, logical operators, and the ## Prerequisites -This lesson assumes you are familiar with the material in the previous lesson on [Functions and Importing Data](../2-Functions-and_Importing-Data/readme.md). +This lesson assumes you are familiar with the material in the previous +lesson on Functions and Importing Data. +The data from the R package region5air is used throughout these lessons. +To install the package from GitHub, use the `remotes` package. Run the code +below to install the `remotes` package and install `region5air` from GitHub. -The example data for exercises in this lesson is available directly from this package. It is assumed that this package is already installed and loaded into your R session. +```{r ex-kbqJN-1, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install remotes and region5air'} +# if you have not installed remotes +install.packages("remotes") + +library(remotes) +install_github("FluentData/region5air") + +``` To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. 
-```{r ex-Ff5Sd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} +```{r ex-OI6Mf-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Load `chicago_air` Data Frame'} data(chicago_air) ``` @@ -42,7 +53,7 @@ You should see the `chicago_air` variable in the top right panel of RStudio, whi We will also use some functions from the `dplyr` package. You will need to install the package if you haven't already. -```{r ex-61y8i-2, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} +```{r ex-8SV1n-3, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Install `dplyr` Package'} install.packages("dplyr") ``` @@ -55,7 +66,7 @@ We always want to make sure our data looks the way it is supposed to before we b The best way to take a quick look at the first few rows of a data frame is to use the `head()` function. -```{r ex-XBPSe-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the First Few Rows of `chicago_air`'} +```{r ex-O4plF-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'View the First Few Rows of `chicago_air`'} data(chicago_air) head(chicago_air) @@ -64,8 +75,7 @@ head(chicago_air) You can specify the number of lines to display by using the `n` parameter. -```{r ex-W8xk7-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Number of Lines with `head()`'} -data(chicago_air) +```{r ex-Yx71R-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Specify Number of Lines with `head()`'} head(chicago_air, n = 3) ``` @@ -73,8 +83,7 @@ head(chicago_air, n = 3) You can also look at the bottom of the data frame by using the `tail()` function. 
-```{r ex-k9SN6-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} -data(chicago_air) +```{r ex-4PlDa-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'View the Last Few Rows of `chicago_air`'} tail(chicago_air) ``` @@ -82,8 +91,7 @@ tail(chicago_air) In RStudio, you can either click on the name of the data frame in the top right panel or use the `View()` function to generate a web based table of the data in the top left panel. -```{r ex-4Q5uw-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} -data(chicago_air) +```{r ex-GSqk0-4, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Use `View()` to Inspect `chicago_air`'} View(chicago_air) ``` @@ -93,12 +101,13 @@ View(chicago_air) By inspecting the data frame this way, you can see that the records are daily values of ozone, temperature, and solar radiation. For more information about the data set you can type a question mark in from the name of the data frame variable in the console. -```{r ex-fcXTz-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} +```{r ex-gp3P7-5, exercise = FALSE, exercise.eval = FALSE, eval = FALSE, exercise.cap = 'Learn More About `chicago_air`'} ?chicago_air ``` From the `Description` section of the help page, the `chicago_air` data frame is: + > A dataset containing daily values of ozone, temperature, and solar radiation from a Chicago monitor between January 1, 2021 and December 31, 2021. @@ -109,7 +118,13 @@ If we want to work with a particular subset of a data frame, we need to know how ### Indexing -Values in a data frame can be selected, individually or in a group, based on their index values. These are integers that represent the locations in the data frame. If there is a 2 x 2 table, then there are 2 rows and 2 columns. Each cell can be represented by two numbers, like coordinates on a map. 
For a data frame, the format is `[row, column]`. Below is a table that shows the index values in each cell. +Values in a data frame can be selected, individually or in a group, +based on their index values. These are integers that represent the +locations in the data frame. If there is a 2 x 2 table, then there +are 2 rows and 2 columns. Each cell can be represented by two numbers, +like coordinates on a map. For a data frame, the format is `[row, column]`. +Below is a table that shows the index values in each cell. + |Column 1 | Column 2| |--- |--- | | `[1, 1]`| `[1, 2]`| @@ -119,7 +134,7 @@ Values in a data frame can be selected, individually or in a group, based on the Below is a data frame called `my_data` that has 3 rows and 2 columns. -```{r ex-wbzvC-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} +```{r ex-BpkGD-1, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Explore Data Frame Indexing with `my_data`'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -130,7 +145,7 @@ my_data To select a particular cell from the `my_data` data frame, we use the `[row, column]` construction. We place those square brackets at the end of the data frame variable `my_data[]` and use integers to select a value. If we wanted to select the "green" value, we would use `my_data[2, 1]`. -```{r ex-dIfsD-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Select a Specific Value Using Indexing'} +```{r ex-qzI9W-2, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Select a Specific Value Using Indexing'} my_data[2, 1] ``` @@ -138,7 +153,7 @@ my_data[2, 1] To select "banana", we use `my_data[3, 2]`. 
-```{r ex-dQYP7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Another Specific Value Using Indexing'} +```{r ex-t2Pvs-3, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Another Specific Value Using Indexing'} my_data[3, 2] ``` @@ -146,7 +161,7 @@ my_data[3, 2] We can also access data from a vector using the same indexing idea. In this case, you don’t need the comma to separate the rows and columns since you are accessing one dimensional data. Below is a vector of numbers. -```{r ex-crQeB-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Indexing Example'} +```{r ex-yfBVU-4, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Vector Indexing Example'} x <- c(1, 3, 2, 7, 25.3, 6) x @@ -155,7 +170,7 @@ x If we want to access the 5th element of the vector, we would use `x[5]`. -```{r ex-0RkJ8-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Vector Element by Index'} +```{r ex-SIpIJ-5, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Vector Element by Index'} x[5] ``` @@ -166,7 +181,7 @@ Now that we understand indexing we can subset the `chicago_air` data frame by us To get one row of the data frame, specify the row number you would like in the brackets, on the left side of the comma. If you leave the column value on the right side of the comma blank, it returns all the columns associated with row number 1. -```{r ex-pJuzG-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} +```{r ex-JSlpx-6, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Subset `chicago_air` Data Frame Using Indexing'} data(chicago_air) chicago_air[1, ] @@ -175,8 +190,7 @@ chicago_air[1, ] If you want more than one row, you can supply a vector of row numbers. Below, the vector access the 1st, 2nd, and 5th rows of the data frame. 
-```{r ex-rOwjG-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} -data(chicago_air) +```{r ex-fCofe-7, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Multiple Rows with a Vector of Row Numbers'} chicago_air[c(1, 2, 5), ] ``` @@ -184,8 +198,7 @@ chicago_air[c(1, 2, 5), ] To get a column from the data frame, specify the column number in the brackets, to the right of the comma. By leaving the row value blank, you are telling it to return all rows associated with column 1. Below, we wrap the output in the `head()` function to limit the number of rows printed. -```{r ex-DidXA-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Single Column'} -data(chicago_air) +```{r ex-uo0uN-8, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Single Column'} head(chicago_air[, 1]) ``` @@ -196,8 +209,7 @@ As you can see, a vector is returned. When a column of a data frame is selected You can also obtain more than one column by supplying a vector of column numbers. -```{r ex-lUdte-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access Multiple Columns'} -data(chicago_air) +```{r ex-uHOQ7-9, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access Multiple Columns'} head(chicago_air[, c(3, 4, 6)]) ``` @@ -208,8 +220,7 @@ Since more than one column is selected, then a data frame is returned. A column name can be used to select a vector. -```{r ex-vaAO3-10, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column by Name'} -data(chicago_air) +```{r ex-lRoPL-10, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Access a Column by Name'} head(chicago_air[, "solar"]) ``` @@ -217,8 +228,7 @@ head(chicago_air[, "solar"]) Or a vector of column names can subset to a slimmed down data frame. 
-```{r ex-3yZVc-11, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset to Specific Columns by Name'} -data(chicago_air) +```{r ex-zsQ7h-11, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Subset to Specific Columns by Name'} head(chicago_air[, c("ozone", "temp", "month")]) ``` @@ -226,7 +236,7 @@ head(chicago_air[, c("ozone", "temp", "month")]) Both rows and columns can be specified at the same time. The example below returns the first 5 rows of the temperature and solar columns. -```{r ex-YRGcl-12, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} +```{r ex-Iw6Um-12, exercise = FALSE, exercise.eval = TRUE, exercise.cap = 'Specify Both Rows and Columns for Subsetting'} chicago_air[1:5, c("temp", "solar")] ``` @@ -236,7 +246,7 @@ chicago_air[1:5, c("temp", "solar")] In R, the dollar sign `$` is a special character that can be used to access a data frame column by name. The dollar sign is placed immediately after the variable name. For example, if we wanted to access the temperature values in the `chicago_air` data frame, then we could use `chicago_air$temp`. -```{r ex-9PZ88-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} +```{r ex-ACUjj-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Access a Column with `$`'} data(chicago_air) head(chicago_air$temp) @@ -270,12 +280,12 @@ Below is a table of logical operators in R that can be used to create logical co The result of a logical expression is a logical data type, a boolean value `TRUE` or `FALSE`. 
-```{r ex-ZjgTb-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} +```{r ex-0I2Yj-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 1'} 1 + 1 == 2 ``` -```{r ex-rlN7A-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} +```{r ex-Ge4cz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Logical Expressions Example 2'} 10 > 20 ``` @@ -286,7 +296,7 @@ Vectors can also be used in a logical expression. A vector of values on the left Here, we check if any of the integers in the vector on the left are above 60. A logical vector is returned. -```{r ex-wZ7Fl-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} +```{r ex-Fxp6H-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Vector Logical Expression'} c(25, 80, 55) > 60 ``` @@ -297,7 +307,7 @@ This concept can be used to subset a data frame. A logical vector can be used in We can use the data frame of colors and fruit again to demonstrate. -```{r ex-9sTMH-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} +```{r ex-jlNrt-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector for Subsetting'} my_data <- data.frame(colors = c("red", "green", "yellow"), fruit = c("apple", "grape", "banana")) @@ -308,7 +318,7 @@ my_data If we only wanted records with the "yellow" color, we could use the vector `c(FALSE, FALSE, TRUE)`. Place this vector in the brackets of the data frame, where we select rows. -```{r ex-Ld3tB-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} +```{r ex-oQ31u-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Using a Logical Vector'} my_data[c(FALSE, FALSE, TRUE), ] ``` @@ -319,7 +329,7 @@ A data frame is returned. 
The only record is from the 3rd row of the logical vec But a more useful way of creating the logical vector is with a logical expression. Below we access the "color" column as a vector using the `$` operator. Then we create a logical vector using a logical expression. -```{r ex-NEemJ-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} +```{r ex-QYi36-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Create a Logical Vector with a Logical Expression'} colors <- my_data$colors colors @@ -333,7 +343,7 @@ yellow Now we can use the logical vector `yellow` to subset the data frame down to records that have the color yellow. -```{r ex-uzDCk-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} +```{r ex-DcdoY-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset Data Frame Using Logical Vector'} my_data[yellow, ] ``` @@ -341,7 +351,7 @@ my_data[yellow, ] The `chicago_air` data frame can be subset in a similar way. Below, a logical vector `hot` is created to represent hot days above 90 degrees. The data frame is subset down to records with hot days. -```{r ex-s91s1-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} +```{r ex-vDCUe-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Subset `chicago_air` for Hot Days'} data(chicago_air) hot <- chicago_air$temp > 90 @@ -357,7 +367,7 @@ A logical vector can also be used in combination with the function `filter()`. The `filter()` function is from a package called `dplyr` which provides many functions for manipulating data frames. 
-```{r ex-xdO0G-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} +```{r ex-lgj04-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load `dplyr` and Explore `filter()`'} # if you have not installed dplyr @@ -373,7 +383,7 @@ The benefit of using `filter()` is that it works the way other functions in R ty If we want to filter down to records in the `chicago_air` data frame where ozone was above 60 ppb (.060 ppm), we would use the following code. -```{r ex-xGrfr-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} +```{r ex-4t2jz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter `chicago_air` for High Ozone Levels'} data(chicago_air) high_ozone <- filter(chicago_air, ozone > 0.060) @@ -384,7 +394,7 @@ high_ozone If we wanted all of the high ozone days in the 6th month, we add another expression separated by a comma. -```{r ex-4GEoD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} +```{r ex-OfG6z-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Filter for High Ozone Levels in June'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060, month == 6) @@ -395,7 +405,7 @@ high_ozone_june Additional logical expressions can be added by separating each expression with a comma. The comma serves as a logical AND. Below is an equivalent output to the output above, using `&` instead of a comma. -```{r ex-nZCJu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} +```{r ex-g4DNV-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Alternative `filter()` Syntax Using `&`'} data(chicago_air) high_ozone_june <- filter(chicago_air, ozone > 0.060 & month == 6) @@ -411,7 +421,7 @@ The `dplyr` package also has a function named `arrange()` that will sort a data Below, the `chicago_air` data frame is ordered by the `ozone` column. 
The default is ascending order. -```{r ex-dobXr-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} +```{r ex-po4R3-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by `ozone`'} # if the dplyr library is not already loaded library(dplyr) data(chicago_air) @@ -425,7 +435,7 @@ head(ozone_ordered) To use descending order, wrap the column in the `desc()` function (also from the `dplyr` package). -```{r ex-oI5Hz-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} +```{r ex-I9DYJ-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` in Descending Order by `ozone`'} data(chicago_air) ozone_descending <- arrange(chicago_air, desc(ozone)) @@ -436,7 +446,7 @@ head(ozone_descending) Additional columns can be used to sort the data frame, separated by a comma. -```{r ex-E6e60-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} +```{r ex-e44XY-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Sort `chicago_air` by Multiple Columns'} data(chicago_air) ozone_temp <- arrange(chicago_air, desc(ozone), desc(temp)) @@ -452,7 +462,7 @@ If we are working with multiple data frames in R, it is sometimes useful to comb To illustrate, we will make two subsets of the `chicago_air` data frame, then combine them together using the `bind_rows()` function. Below, the original number of records in the `chicago_air` data frame is shown using the `nrow()` function. We will split the data frame and recombine to a data frame with the original number of records. 
-```{r ex-sS6aV-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} +```{r ex-7qfPE-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Count Rows in `chicago_air`'} # if you have not loaded the dplyr package library(dplyr) data(chicago_air) @@ -464,7 +474,7 @@ nrow(chicago_air) Now we split the data frame into warm and cool data frames using the `filter()` function. -```{r ex-kZtfG-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} +```{r ex-eBNAU-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Split `chicago_air` into Warm and Cool Subsets'} data(chicago_air) warm <- filter(chicago_air, temp > 80) @@ -480,7 +490,7 @@ nrow(cool) We can confirm that the rows from these two data frames add up to the original data frame. -```{r ex-n1L5E-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} +```{r ex-dYaIj-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Verify Row Counts Before Combining'} nrow(warm) + nrow(cool) == nrow(chicago_air) ``` @@ -488,7 +498,7 @@ nrow(warm) + nrow(cool) == nrow(chicago_air) Now we combine using the `bind_rows()` function and confirm that the new `recombined` data frame has the same number of records as the original data frame. 
-```{r ex-ghHy3-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} +```{r ex-BjJ28-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Recombine Data Frames and Verify Row Count'} data(chicago_air) recombined <- bind_rows(warm, cool) diff --git a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd index 58e7fd0..b01969a 100644 --- a/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd +++ b/inst/tutorials/4-Writing-Functions-Conditionals-and-Loops/lesson.Rmd @@ -32,7 +32,7 @@ This lesson assumes you are familiar with the material in the previous lessons: The data for these lessons is available from this package. It is assumed that this package is already installed and loaded into the R session. If you need to refer to the package, simply refer to it as "this package". -```{r ex-nrhfD-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} +```{r ex-24n3M-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Data from This Package'} # Assuming the package is already loaded data(chicago_air) @@ -49,7 +49,7 @@ the thing that's saved is not a data object but lines of R code. To save your own function, use this construction: -```{r ex-leVmm-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} +```{r ex-8LfCw-1, eval = FALSE, exercise = TRUE, exercise.cap = 'Creating Your Own Function'} my_function_name <- function() { # lines of R code @@ -62,7 +62,7 @@ We can write a simple function that prints something to the console. Here is a function named `print_hello`. 
-```{r ex-HqNOS-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} +```{r ex-fJfsU-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Function to Print Hello'} print_hello <- function() { print("Hello") @@ -71,7 +71,7 @@ print_hello <- function() { ``` -```{r ex-fwQ5L-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} +```{r ex-Vn2hP-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the print_hello Function'} print_hello() ``` @@ -87,7 +87,7 @@ Here we recreate the same function, but this time we add an argument `text` insi of the parentheses. -```{r ex-2P8Tn-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} +```{r ex-K2PH7-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with an Argument'} print_hello <- function(text) { message <- paste("Hello", text) @@ -98,7 +98,7 @@ print_hello <- function(text) { ``` -```{r ex-LRgr3-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} +```{r ex-dbS3B-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing the Modified print_hello Function'} print_hello(text = "everybody!") ``` @@ -111,7 +111,7 @@ greater than a criteria pollutant standard. We could make a simple function that takes two arguments: one for the measurement value, and one for the standard value. 
-```{r ex-0qVRQ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} +```{r ex-O1ybZ-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Two Arguments'} standard_violated <- function(measurement, standard) { measurement > standard @@ -120,7 +120,7 @@ standard_violated <- function(measurement, standard) { ``` -```{r ex-Qop4d-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} +```{r ex-UDcsY-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated Function'} standard_violated(measurement = 84, standard = 70) ``` @@ -134,7 +134,7 @@ the flexibility to use a different value. To set a default value, we use `= 70` when we create the function. -```{r ex-MZLuD-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} +```{r ex-1jUrF-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Function with Default Value'} standard_violated <- function(measurement, standard = 70) { measurement > standard @@ -143,7 +143,7 @@ standard_violated <- function(measurement, standard = 70) { ``` -```{r ex-3zbVT-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} +```{r ex-5LeDW-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing standard_violated with Default Value'} standard_violated(measurement = 50) ``` @@ -164,12 +164,12 @@ Here we show that using two numbers in a different order will return different outputs. 
-```{r ex-GWBtx-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} +```{r ex-IJniY-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments'} standard_violated(60, 70) ``` -```{r ex-8GIop-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} +```{r ex-kb4G6-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Testing Positional Arguments in Reverse Order'} standard_violated(70, 60) ``` @@ -186,7 +186,7 @@ will run if the logical expression is `TRUE` is placed inside curly braces. Belo is the outline (not actual R code). -```{r ex-QUyL3-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} +```{r ex-4luYi-1, eval = FALSE, exercise = TRUE, exercise.cap = 'if Function Outline'} if(logical expression>) { @@ -195,7 +195,7 @@ if(logical expression>) { ``` -```{r ex-UdUgn-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} +```{r ex-5JB6E-2, eval = FALSE, exercise = TRUE, exercise.cap = 'if-else Function Outline'} if() { @@ -208,7 +208,7 @@ if() { ``` -```{r ex-48zih-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} +```{r ex-eb5H2-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'if Function Example'} ozone <- 0.075 if(ozone > 0.065) { @@ -223,7 +223,7 @@ if(ozone > 0.065) { ``` -```{r ex-gDSuK-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} +```{r ex-cf4DF-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Another if Function Example'} ozone <- 0.06 if(ozone > 0.065) { @@ -238,12 +238,12 @@ if(ozone > 0.065) { ``` -```{r ex-3aWMJ-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} +```{r ex-eZIEd-5, eval = FALSE, exercise = TRUE, exercise.cap = 'ifelse Function Outline'} ifelse(, , ) ``` -```{r ex-2YM6L-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} 
+```{r ex-9JM2l-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ifelse Function Example'} ozone_value <- 0.06 message <- ifelse(ozone_value > 0.065, "Potential Health Effects", "All Good") @@ -262,7 +262,7 @@ For loops are used to repeat an operation a set number of times. Here is the basic outline: -```{r ex-ofECG-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} +```{r ex-Smpxz-1, eval = FALSE, exercise = TRUE, exercise.cap = 'For Loop Outline'} for(i in sequence){ @@ -271,7 +271,7 @@ for(i in sequence){ ``` -```{r ex-dU92z-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} +```{r ex-fBA2v-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop Example'} for(i in c(1, 2, 3)) { print(i) @@ -289,7 +289,7 @@ Here is an example data frame we will use. It represents a few values from three monitors. -```{r ex-0a2Hb-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} +```{r ex-nl8GA-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'For Loop with Data Frame'} monitors <- data.frame(monitor1 = c(50, 60, 58, 52), monitor2 = c(55, 59, 65, 61), monitor3 = c(70, 62, 68, 71)) @@ -304,7 +304,7 @@ are accessed using square brackets [ , i]. Each max value is saved to the max_values vector using square brackets [i]. -```{r ex-koVZJ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} +```{r ex-XonUy-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Finding Maximum Values with For Loop'} max_values <- c() for(i in c(1, 2, 3)) { @@ -335,7 +335,7 @@ The example below applies the `max()` function to the `monitors` data frame from the previous section. 
-```{r ex-TQVh0-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} +```{r ex-RldII-1, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example'} monitors_max <- apply(monitors, MARGIN = 2, FUN = max) monitors_max @@ -352,7 +352,7 @@ We could also find the mean of each row in the `monitors` data frame. To do this, we would set the `MARGIN` argument to `1`. -```{r ex-bGiYj-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} +```{r ex-fyIqy-2, eval = FALSE, exercise = TRUE, exercise.cap = 'apply Function Example with MARGIN = 1'} apply(monitors, MARGIN = 1, FUN = mean) ``` diff --git a/inst/tutorials/5-Plotting/lesson.Rmd b/inst/tutorials/5-Plotting/lesson.Rmd index 7a0e21e..16ff1a8 100644 --- a/inst/tutorials/5-Plotting/lesson.Rmd +++ b/inst/tutorials/5-Plotting/lesson.Rmd @@ -29,7 +29,7 @@ This lesson assumes you are familiar with the material in the lesson on It also uses functions from the `ggplot2` package which needs to be installed. -```{r ex-wZCMY-1, eval = FALSE} +```{r ex-AlMgc-1, eval = FALSE} install.packages("ggplot2") ``` @@ -37,7 +37,7 @@ install.packages("ggplot2") The example data for the exercises is available from this package. To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function. -```{r ex-WmpIN-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} +```{r ex-pFXUx-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading Data'} data(chicago_air) ``` @@ -49,7 +49,7 @@ will be displayed on the y-axis of a coordinate graph, with the index number of vector taking the x-axis values. 
-```{r ex-mprS0-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} +```{r ex-j7Lsc-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Basic Plot'} benzene <- c(1.3, 4.5, 2.6, 3.4, 6.4) plot(benzene) @@ -62,7 +62,7 @@ the `chicago_air` data frame to create a scatterplot of temperature on the x-axi and ozone on the y-axis. -```{r ex-6RNew-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} +```{r ex-0nmTj-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Scatter Plot'} plot(x = chicago_air$temp, y = chicago_air$ozone) ``` @@ -71,7 +71,7 @@ To see data plotted over time, we need to convert the `date` column to a `Date` data type. -```{r ex-bluf7-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} +```{r ex-JP8rj-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Converting Date Column'} chicago_air$date <- as.Date(chicago_air$date) ``` @@ -79,7 +79,7 @@ chicago_air$date <- as.Date(chicago_air$date) Here is ozone plotted by day as a line graph. -```{r ex-UjZBQ-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} +```{r ex-w69D7-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Graph'} plot(x = chicago_air$date, y = chicago_air$ozone, type = 'l') ``` @@ -89,7 +89,7 @@ to control the look of the graph. The plot below demonstrates a few of these options. Run `?plot` to see a list of all the arguments in the help file. -```{r ex-qXCFF-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} +```{r ex-rNSVR-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Plot'} plot(x = chicago_air$date, y = chicago_air$ozone, type='l', pch = 16, @@ -108,7 +108,7 @@ of a data set as a histogram. Below is the default output of the ozone data from the `chicago_air` data frame. 
-```{r ex-HxDBk-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} +```{r ex-4smPc-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Default Histogram'} hist(chicago_air$ozone) ``` @@ -118,7 +118,7 @@ each bar, with the `breaks` argument. For example, supplying `breaks = 20` will make a histogram with 20 bars. Other arguments allow you to control the titles and colors of the plot. Run `?hist` to see a complete list of arguments on the help page. -```{r ex-pn82v-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} +```{r ex-GwTQO-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized Histogram'} hist(chicago_air$ozone, breaks = 20, main = "Histogram of Ozone", @@ -135,7 +135,7 @@ argument. If a data frame is used, then the columns can be referenced without th `$` operator, and a formula must be used. -```{r ex-DqR5T-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} +```{r ex-6K6PA-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple Box Plot'} boxplot(chicago_air$ozone) ``` @@ -161,7 +161,7 @@ We can make a plot of ozone by month using the `chicag_air` data frame and the formula `ozone ~ month`. -```{r ex-9sWx8-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} +```{r ex-kynwl-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Box Plot by Month'} boxplot(ozone ~ month, data = chicago_air) ``` @@ -187,7 +187,7 @@ To use `ggplot2`, we typically follow this sequence of steps: Let's begin by loading the `ggplot2` package. -```{r ex-MfDZd-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} +```{r ex-zvHLa-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Loading ggplot2 Package'} library(ggplot2) ``` @@ -200,7 +200,7 @@ and `y`. These determine which columns from the data frame are displayed on the graph. 
-```{r ex-GM7Gb-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} +```{r ex-KpaPR-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Base ggplot'} ggplot(chicago_air, aes(x = temp, y = ozone)) ``` @@ -211,14 +211,14 @@ to the plot, which is done by adding a function using the `+` sign. For a point plot, we add the `geom_point()` function. -```{r ex-QA75M-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} +```{r ex-gwuQx-3, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point() ``` Additional modifications can be made. Customize it by adding color, title, and labels. -```{r ex-4ylyu-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} +```{r ex-HZpOi-4, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Customized ggplot Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone)) + geom_point(color = "forestgreen") + ggtitle('Relationship between Ozone and Temperature') + @@ -233,7 +233,7 @@ on the month. To do this, we need to make the month column a factor and use the argument `color` in the `aes( )` function. -```{r ex-yBykX-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} +```{r ex-XFCra-5, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Color Coded Scatter Plot'} ggplot(chicago_air, aes(x = temp, y = ozone, color = factor(month))) + geom_point() + ggtitle('Relationship between Ozone and Temperature') + @@ -251,7 +251,7 @@ To create a line plot of ozone over time, we use the `as.Date()` function on the column and replace the `geom_point( )` function with the `geom_line( )` function. 
-```{r ex-G0l0I-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} +```{r ex-4IAKH-6, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Line Plot Over Time'} ggplot(chicago_air, aes(x = as.Date(date), y = ozone)) + geom_line() ``` @@ -261,7 +261,7 @@ the width of each bar, the `fill` argument the color of the bars, and the `color argument the outline of the bars. -```{r ex-WzjC4-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} +```{r ex-gcloO-7, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'ggplot Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram(binwidth=0.005, fill='darkorange', color='black') @@ -270,7 +270,7 @@ ggplot(chicago_air, aes(ozone)) + The `geom_boxplot()` function will create a box plot. -```{r ex-XFlIb-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} +```{r ex-5ytTS-8, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Simple ggplot Box Plot'} ggplot(chicago_air, aes(ozone)) + geom_boxplot() ``` @@ -279,7 +279,7 @@ Using the `y` argument can split the data into groups. Here we use the `factor() function on the month column to create 12 box plots on the graph. -```{r ex-bnjIN-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} +```{r ex-OWMkH-9, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Grouped Box Plot'} ggplot(chicago_air, aes(x = ozone, y = factor(month))) + geom_boxplot() ``` @@ -295,7 +295,7 @@ multiple plots or facets. The `facet_wrap()` function allows you to use a column to choose the facets. Below is a faceted histogram of ozone values. -```{r ex-fvlLl-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} +```{r ex-bj7Iz-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Faceted Histogram'} ggplot(chicago_air, aes(ozone)) + geom_histogram() + facet_wrap("month") @@ -311,7 +311,7 @@ represented as a shaded area. 
Below, the argument `method` is given the value `lm` which stands for a linear model. -```{r ex-iIqRr-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} +```{r ex-XZEjG-1, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Linear Model Fitted Line', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=lm) @@ -323,7 +323,7 @@ argument will draw a nonlinear curve which represents localized relationships be the x and y variables. -```{r ex-XPuES-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} +```{r ex-eqFAk-2, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Nonlinear Curve Fitting', message = FALSE, warning = FALSE} ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -341,7 +341,7 @@ pane. You can also save a plot made by `ggplot2` using the `ggsave()` function. -```{r ex-jCzeg-1, eval = FALSE} +```{r ex-23jVi-1, eval = FALSE} my_plot <- ggplot(chicago_air, aes(temp, ozone) ) + geom_point() + geom_smooth(method=loess) @@ -364,7 +364,7 @@ as integers internally while maintaining labels for these integers. Here is an example of converting a character vector into a factor: -```{r ex-jUKrV-1, eval = FALSE} +```{r ex-jwlXZ-1, eval = FALSE} months <- c("January", "February", "March", "January", "February") months_factor <- factor(months) diff --git a/inst/tutorials/6-Basic-Statistics/lesson.Rmd b/inst/tutorials/6-Basic-Statistics/lesson.Rmd index 637a65c..7cde403 100644 --- a/inst/tutorials/6-Basic-Statistics/lesson.Rmd +++ b/inst/tutorials/6-Basic-Statistics/lesson.Rmd @@ -26,10 +26,10 @@ R was originally developed as a statistical programming language and its built-i This lesson assumes you are familiar with the material in the lesson on [Functions and Importing Data](../2-Functions-and-Importing-Data/readme.md). 
-Statistical functions are used in this lesson that require installation of the following packages. +Statistical functions are used in this lesson that require installation of the `EnvStats` package. -```{r ex-a2k2I-1, eval = FALSE} +```{r ex-1o1Xg-1, eval = FALSE} install.packages("envstats") ``` @@ -43,7 +43,7 @@ R has many built-in functions for descriptive statistics. We will use these functions to understand the example environmental data available in this package. -```{r ex-1qQ2j-1, exercise = TRUE, exercise.cap = 'Extract example data'} +```{r ex-1ILIM-1, exercise = TRUE, exercise.cap = 'Extract example data'} data <- example_data # Assuming example_data is available in this package ``` @@ -52,21 +52,21 @@ Most of the functions we'll be using have an argument named `na.rm` that stands for `NA` remove. If the argument is set to `TRUE` then the function will remove all missing values from the data set. Otherwise, the function will error. -These functions let us know the range of the data values, i.e., the highest and +These functions tell us the range of the data values, i.e., the highest and lowest values. -```{r ex-weaEx-2, exercise = TRUE, exercise.cap = 'Find minimum value'} +```{r ex-vb3PY-2, exercise = TRUE, exercise.cap = 'Find minimum value'} min(data, na.rm=TRUE) ``` -```{r ex-7QIRr-3, exercise = TRUE, exercise.cap = 'Find maximum value'} +```{r ex-awwNz-3, exercise = TRUE, exercise.cap = 'Find maximum value'} max(data, na.rm=TRUE) ``` -```{r ex-xgBvB-4, exercise = TRUE, exercise.cap = 'Find range of values'} +```{r ex-UcPGQ-4, exercise = TRUE, exercise.cap = 'Find range of values'} range(data, na.rm=TRUE) ``` @@ -74,27 +74,27 @@ range(data, na.rm=TRUE) We can also get the mean and the quartile values from the `summary()` function. 
-```{r ex-UT6Xm-5, exercise = TRUE, exercise.cap = 'Summary statistics'} +```{r ex-TTxJF-5, exercise = TRUE, exercise.cap = 'Summary statistics'} summary(data) ``` The `IQR()` function gives us the interquartile range, which lets us know how large -the spread is for the values in the central range of the distribution, i.e., between -the 1st quartile and the 3rd quartile. +the spread is for the values in the central range of the distribution, i.e. between +the 25th percentile and the 75th percentile. -```{r ex-eXOtH-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} +```{r ex-0Uz5N-6, exercise = TRUE, exercise.cap = 'Calculate IQR'} IQR(data, na.rm=TRUE) ``` We can use the `boxplot()` function to visualize the interquartile range. The outline -of the box itself shows the first and third quartile, while the line in the middle +of the box itself shows the middle 50% of the data, while the line in the middle of the box shows the median. -```{r ex-Iw1o0-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} +```{r ex-9ZmCe-7, exercise = TRUE, exercise.cap = 'Visualize IQR with boxplot'} boxplot(data) ``` @@ -104,12 +104,12 @@ boxplot(data) R has functions for finding the mean and median of a set of values. -```{r ex-K7PqR-1, exercise = TRUE, exercise.cap = 'Calculate mean'} +```{r ex-lewxj-1, exercise = TRUE, exercise.cap = 'Calculate mean'} mean(data, na.rm=TRUE) ``` -```{r ex-bkBnL-2, exercise = TRUE, exercise.cap = 'Calculate median'} +```{r ex-TEP4d-2, exercise = TRUE, exercise.cap = 'Calculate median'} median(data, na.rm=TRUE) ``` @@ -118,12 +118,12 @@ The functions `var()` and `sd()` calculate the variance and standard deviation, respectively. 
-```{r ex-kPlk7-3, exercise = TRUE, exercise.cap = 'Calculate variance'} +```{r ex-2LPqj-3, exercise = TRUE, exercise.cap = 'Calculate variance'} var(data, na.rm=TRUE) ``` -```{r ex-kmE21-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} +```{r ex-os2Ij-4, exercise = TRUE, exercise.cap = 'Calculate standard deviation'} sd(data, na.rm=TRUE) ``` @@ -136,7 +136,7 @@ the `t.test()` function to perform a two-sample t-test on the example data. First, let's visualize our dataset. -```{r ex-qqNT1-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} +```{r ex-dXnAG-1, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Visualize dataset'} ggplot(data, aes(factor(variable), value)) + geom_boxplot() ``` @@ -146,7 +146,7 @@ difference in concentrations. Below is a plot of those two groups side by side. -```{r ex-RSizC-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} +```{r ex-0aPfw-2, warning = FALSE, message = FALSE, exercise = TRUE, exercise.cap = 'Compare two groups'} filtered_data <- filter(data, group == "Group1" | group == "Group2") ggplot(filtered_data, aes(factor(group), value)) + geom_boxplot() @@ -157,7 +157,7 @@ We should also check for normality before doing any statistical tests. Below are histograms of the datasets. -```{r ex-5Ltn0-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} +```{r ex-cNlFU-3, exercise = TRUE, exercise.cap = 'Check for normality with histograms'} ggplot(filtered_data, aes(value)) + facet_grid(rows = vars(group)) + geom_histogram() @@ -171,14 +171,14 @@ comes from a normal distribution. If the p-value of the test is less than .05, we reject the null hypothesis and conclude the data is not normal. 
-```{r ex-rKJeL-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} +```{r ex-VcWO3-4, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group1'} group1_data <- filter(data, group == "Group1") shapiro.test(group1_data$value) ``` -```{r ex-gKeKN-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} +```{r ex-A0jwf-5, exercise = TRUE, exercise.cap = 'Shapiro-Wilk test for Group2'} group2_data <- filter(data, group == "Group2") shapiro.test(group2_data$value) @@ -189,13 +189,13 @@ The p-values for the tests are well above 0.05, so we assume the null hypothesis is true. Meaning, we can assume the distributions of values in the two groups are normal. -Now we can do some comparisons between these 2 groups of readings using the -Student's t-test. The test is meant to determine if the two means from the two +Now we can do some comparisons between these 2 months of measurements +using the Student's t-test. The test is meant to determine if the two means from the two datasets are from the same distribution or not. The assumption, or null hypothesis, is that they are, in fact, mean values from the same distribution. -```{r ex-5CJ5g-6, exercise = TRUE, exercise.cap = 'Student's t-test between two groups'} +```{r ex-8g4Rk-6, exercise = TRUE, exercise.cap = "Student's t-test between two groups"} t.test(group1_data$value, group2_data$value) ``` @@ -220,7 +220,7 @@ The `EnvStats` package has a comprehensive list of basic and more advanced stati tests for Environmental Data. -```{r ex-ja2Cp-1, eval = FALSE} +```{r ex-2z862-1, eval = FALSE} library(EnvStats) ?FcnsByCatHypothTests @@ -239,7 +239,7 @@ arguments that we only want to include complete observations and the Pearson met of finding correlations.
-```{r ex-Mt13i-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} +```{r ex-asNU8-1, exercise = TRUE, exercise.cap = 'Correlation matrix of select variables'} cor(data[, c("Variable1", "Variable2", "Variable3")], use = "complete.obs", method ="pearson") @@ -256,7 +256,7 @@ We could also perform a correlation test using the `cor.test()` function. Here we test the correlation between two variables. -```{r ex-cBslA-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} +```{r ex-dCgBz-2, exercise = TRUE, exercise.cap = 'Test correlation between two variables'} cor.test(data$Variable1, data$Variable2, method = "pearson") ``` @@ -270,7 +270,7 @@ do not reject the null hypothesis. We conclude there is no correlation between these two variables. -```{r ex-hmGTn-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} +```{r ex-g79Ns-3, exercise = TRUE, exercise.cap = 'Test correlation between another set of two variables'} cor.test(data$Variable1, data$Variable3, method = "pearson") ``` @@ -281,11 +281,16 @@ plot between each pair of columns in the data frame. Setting `lower.panel = pane will draw a smooth line through the scatter plots on the lower panels. -```{r ex-3N14X-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} +```{r ex-semLW-4, exercise = TRUE, exercise.cap = 'Pairwise plots of select variables'} pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) ``` +You can see from the lower panel plots the increasing slope of the line +for ozone and temp; a decreasing slope for temp and pressure; and a flat +line for ozone and pressure. 
+ + ## Exercises {data-progressive=TRUE} diff --git a/inst/tutorials/7-Quality-Assurance/lesson.Rmd b/inst/tutorials/7-Quality-Assurance/lesson.Rmd index 367352d..d44caad 100644 --- a/inst/tutorials/7-Quality-Assurance/lesson.Rmd +++ b/inst/tutorials/7-Quality-Assurance/lesson.Rmd @@ -27,7 +27,7 @@ This lesson assumes you are familiar with the material in the lesson on [Functio The data used throughout these lessons is provided by this package. To access the data, simply use the `data()` function with the name of the dataset provided by this package. -```{r ex-oJlVk-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} +```{r ex-NQKSW-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Load Example Data Frame'} # Assuming this package is already loaded into your R session data("example_dataset") @@ -40,7 +40,7 @@ data("example_dataset") Data types are the first thing to consider when using data in R. Many errors can happen if we assume that our data is a certain type, when in reality it is not. After reading data into R, we should look at the data types in RStudio or using the function `str()`. -```{r ex-l5aYh-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} +```{r ex-A4ceZ-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Inspect Example Dataset Data Types'} str(example_dataset) ``` @@ -48,7 +48,7 @@ str(example_dataset) Here is an example of text that is read into R, and a certain column might be `character` when we expected it to be `Date`. 
-```{r ex-9P9lB-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} +```{r ex-SquPk-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Check Data Types'} sample_data <- read.csv(text = " date,value 2022-08-01,100 @@ -62,7 +62,7 @@ str(sample_data) We can use the `as.Date()` function to transform the column after reading the data, or we can use the `colClasses` argument in the `read.csv` function to ensure it's read correctly. -```{r ex-xVxxM-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} +```{r ex-UJJBG-3, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Correcting Data Types with colClasses'} sample_data <- read.csv(colClasses = c("Date", "numeric"), text = " date,value 2022-08-01,100 @@ -78,7 +78,7 @@ str(sample_data) For both character and numeric data types, there may be values that should not be allowed. -```{r ex-XvPip-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} +```{r ex-g2J9r-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Correct Unallowed Data'} # Example of correcting unallowed values values <- c(1, 2, -1, 3, -2, 4) values[values < 0] <- NA @@ -89,13 +89,27 @@ values ### Outliers -Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. +Handling outliers is difficult because we do not necessarily want +to remove data that may be uncommon but within the realm of possibility. +The best way to detect extreme values is to look at the summary of +your data and pay attention to min and max values. You can plot the +data to see if you can detect anything weird through visual inspection. +Boxplots with outliers plotted as points are handy for this. Below +is a boxplot of the ozone column in the `chicago_air` data frame. 
-```{r ex-RwuAn-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} -# Example of identifying and handling outliers -data("example_dataset") -boxplot(example_dataset$value) +```{r ex-SVhhL-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Identify and Handle Outliers'} +boxplot(chicago_air$ozone) + +``` + +We can see that two values are printed as points on the high end of the distribution. +We can use the `boxplot.stats()` function to get the values used in the `boxplot()` +function. The `out` values are the outliers. + + +```{r ex-z4Cv1-2, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Return Outlier Values from Boxplot'} +boxplot.stats(chicago_air$ozone)$out ``` @@ -104,7 +118,7 @@ boxplot(example_dataset$value) If you run a command and get an error, then R should print an error message. Common syntax mistakes include missing commas, unmatched parentheses, and the wrong type of closing brace. -```{r ex-LCQYa-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} +```{r ex-v8p41-1, eval = TRUE, exercise = TRUE, exercise.eval = FALSE, exercise.cap = 'Examples of Common Syntax Mistakes'} # Example of a common syntax mistake: missing commas x <- c("a", "b" "c") @@ -187,7 +201,7 @@ grade_this_code( ### Exercise 3 -Use the boxplot to check for outliers in the ozone column of the built-in `airquality` data frame. +Use a boxplot to check for outliers in the ozone column of the built-in `airquality` data frame. ```{r exercise3, exercise = TRUE} # Your code here diff --git a/source/1-Introduction/lesson1.yaml b/source/1-Introduction/lesson1.yaml index aa46317..01f1c5c 100644 --- a/source/1-Introduction/lesson1.yaml +++ b/source/1-Introduction/lesson1.yaml @@ -18,9 +18,11 @@ content: content: - type: paragraph content: | - R is a free, open-source computing language. 
It was originally written by statisticians for doing statistical analysis in academia. In recent years it has become more widely used in many industries for performing a variety of data science tasks such as: - - type: list - bullets: + R is a free, open-source computing language. It was originally written + by statisticians for doing statistical analysis in academia. In recent + years it has become more widely used in many industries for performing + a variety of data science tasks such as: + - reading and writing files, - data transformation, - graphic visualization, @@ -40,13 +42,18 @@ content: Many data analysis tasks can be accomplished with spreadsheets and other business intelligence (BI) tools such as Looker and Power BI. When should you move beyond BI tools and use a high-level programming language like R? Below are a few scenarios where a language like R is more advantageous than a BI tool. - - type: list - bullets: + - If you cannot access data easily in your BI tool, R can read just about any data source. - If you need to download, save, or otherwise process a large number of files, R can automate those tasks in a way that BI tools cannot. - Custom data transformations that are not possible in BI tools can be done with R. - Custom data visualizations that are not available in BI tools can be done with R. - Predictive modeling that is not available in BI tools, or only in a rudimentary way, can be done in R. + + BI tools are more advantageous if you need enterprise wide dashboards, + or tools that are more easily accessible to a wider audience. If there + are few occasions where you need custom visualizations or transformations, + or if you do not need automation in your work, you may not need to learn + a programming language. 
- type: section title: Install R and RStudio skip: @@ -199,8 +206,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Experiment with Comments" content: | # Full line comment @@ -230,6 +237,12 @@ content: content: | > 1 + 1 [1] 2 + - type: paragraph + content: | + The R code in code blocks does not show the `>` part of the console, called + the prompt, and the output block places two comment marks `##` before the + output. This is to make it possible to copy and paste the text into your + R console and run it without causing an error. - type: section title: Variables content: @@ -240,8 +253,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Creating Variables" content: | x <- 10 @@ -259,8 +272,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Exploring Variable Assignment" content: | x @@ -282,19 +295,18 @@ content: - type: paragraph content: | There are 3 important rules to remember when creating variable names: - - type: list - numbers: - - You can't start your variable name with a number. - - You can't use spaces or special characters ($,%,#,-). Periods `.` and underscores `_` are ok. - - Capitalization __DOES__ matter in R. That is, R will consider `y` and `Y` to be different variables. + + 1. You can't start your variable name with a number. + 2. You can't use spaces or special characters ($,%,#,-). Periods `.` and underscores `_` are ok. + 3. Capitalization __DOES__ matter in R. That is, R will consider `y` and `Y` to be different variables. - type: paragraph content: | Try running the following code and you will see that in your global environment there are two different objects listed.
- type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Variable Naming Rules" content: | y <- 5 @@ -334,8 +346,7 @@ content: - type: paragraph content: | There are several ways to store groups of data to make them easier to work with: - - type: list - bullets: + - A __vector__ stores multiple values of the same type (e.g. all numeric values). - A __list__ stores multiple values of different types (e.g. some numbers and character values). - A __matrix__ is a table of values with only one data type. @@ -349,8 +360,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Creating Numeric Vectors" content: | x <- c(1, 2, 3, 4, 5) @@ -358,8 +369,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Creating Character Vectors" content: | fruit <- c("apples", "bananas", "oranges") @@ -384,8 +395,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Accessing Vector Elements" content: | fruit[3] @@ -398,8 +409,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Creating Lists" content: | x <- list("Benzene", 1.3, TRUE) @@ -412,8 +423,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Accessing List Elements" content: | x[[2]] @@ -423,8 +434,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Lists Containing Vectors and Lists" content: | my_vector <- c(1, 2, 3) @@ -438,8 +449,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + 
exercise: false + exercise.eval: true exercise.cap: "Nested List Access" content: | y[[3]][[2]] @@ -465,7 +476,7 @@ content: - type: paragraph content: | The output above shows a table with the vector variable names as column names, and the vector values below the respective column name. If you try to - create a data frame where the vectors are not all the same length, you will see the error shown below. + create a data frame where the vectors are not all the same length, you will get an error. - type: code language: r options: @@ -523,7 +534,7 @@ exercises: In R, we create a list using the `list()` function. A list can contain elements of different types, such as numbers, strings, and logical values, allowing for flexible data structures. code: | l <- list(5, 'apple', TRUE) - - instructions: "Create a data frame `df` with two columns, `name` and `age`, each containing three entries of your choosing." + - instructions: "Create a data frame `df` with two columns, `name` and `age`. Use the names 'Alice', 'Bob', and 'Charlie' and give them each an age." hints: - "# Use the `data.frame()` function to create a data frame." - "# To create a data frame with `name` and `age` columns, define each column as a vector and use them as arguments in the `data.frame()` function." diff --git a/source/2-Functions-and-Importing-Data/lesson2.yaml b/source/2-Functions-and-Importing-Data/lesson2.yaml index e4e2ff8..f168d53 100644 --- a/source/2-Functions-and-Importing-Data/lesson2.yaml +++ b/source/2-Functions-and-Importing-Data/lesson2.yaml @@ -19,7 +19,11 @@ content: content: - type: paragraph content: | - In R, there are two main types of objects: variables and functions. We covered variables in the [introductory lesson](../1-Introduction-to-R/readme.md). A variable is used to create and reference data. The data can be a character, numeric, or logical data type. Variables can reference various "containers" for data, such as a __vector__, __list__, or __data frame__. 
+ In R, there are two main types of objects: variables and functions. We + covered variables in the introductory lesson. A variable is used to create + and reference data. The data can be a character, numeric, or logical data + type. Variables can reference various "containers" for data, such as a + __vector__, __list__, or __data frame__. - type: paragraph content: | Functions are similar to variables in that they are short names that reference something saved in R. In this case, a function is not referencing data but a piece of code. A function is saved code that can be used to do some operation on data. @@ -99,7 +103,11 @@ content: On the help page, under `Usage`, you see `mean(x, ...)`. This means that the only thing that necessarily has to go into `( )` is `x`. On the help page under `Arguments` you will find a description of what `x` needs to be: a numeric or logical vector. - type: paragraph content: | - Many built-in functions in R have multiple arguments. This allows you to give the function some more information to perform calculation you want. The example below shows how to use the `digits` argument in the `round( )` function. Providing different values to the `digits` argument will return different values. + Many built-in functions in R have multiple arguments. This allows you + to give the function some more information to perform the calculation you + want. The example below shows how to use the `digits` argument in the + `round( )` function. Providing different values to the `digits` argument + will return different values. - type: code language: r options: @@ -127,12 +135,23 @@ content: exercise.cap: "Round to One Digit" content: | round(12.3456, digits=1) + - type: paragraph + content: | + In the first example, you can see that we did not provide a value for the + `digits` argument. That's because there is a default value `digits = 0` (see + the `Usage` section on the help page `?round`).
If there is a default value, + then that argument does not need to be specified inside `( )`. If there is no + default value for an argument, then the function will error and tell you that + you forgot to supply a value for the argument. - type: section title: Useful Built-in Functions content: - type: paragraph content: | - When you start an R session there are many built-in functions that are immediately available for you to use. Other functions are available in community developed packages, as explained in a later section of this lesson. Below is a list of a few commonly used built-in functions in R. + When you start an R session there are many built-in functions that are + immediately available for you to use. Other functions are available in + community developed packages, as explained in a later section of this + lesson. Below is a list of a few commonly used built-in functions in R. - type: section title: "1. `sum( )`" content: @@ -388,12 +407,6 @@ content: - type: paragraph content: | Now that we've installed the package, we still can't use the function we want. We need to load the package first (opening the app). We use the `library()` function to do this. - - type: code - language: r - options: - message: false - content: | - library(EnvStats) - type: code language: r options: @@ -401,6 +414,8 @@ content: exercise.eval: false exercise.cap: "Use `serialCorrelationTest` from EnvStats" content: | + library(EnvStats) + x <- c(1.3, 3.5, 2.6, 3.4, 6.4) serialCorrelationTest(x) - type: paragraph @@ -414,13 +429,20 @@ content: content: - type: paragraph content: | - R can import data from just about any format, including CSV, Excel, Databases, GIS shapefiles. This section will demonstrate how to import CSV and Excel files. + R can import data from just about any format, including + + - CSV, + - Excel, + - Databases, and + - GIS shapefiles. + + This section will demonstrate how to import CSV and Excel files. 
- type: section title: CSV content: - type: paragraph content: | - R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](../data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. + R has a built-in function called `read.csv()` for reading `.csv` files. Download the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv) and save it to your working directory. If you don't know what your working directory is, run this code in R and it will tell you. - type: code language: r options: @@ -458,7 +480,7 @@ content: library(readxl) - type: paragraph content: | - Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](./data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. + Use the `read_excel()` function from the `readxl` package to read emissions data from [this Excel workbook](https://github.com/LADCO/training-r-intro/blob/main/data/emissions_IL_2022.xlsx). Download the file to your working directory and read the first worksheet (named "UNIT_DATA"), skipping the first 6 rows. - type: code language: r options: @@ -507,7 +529,7 @@ exercises: By using the `:` operator, we create a sequence of integers from 1 to 10. Placing this sequence inside the `sum()` function calculates the total sum of these numbers. code: | sum(1:10) - - instructions: "Read in the first 10 rows of the `chicago_daily.csv` file [here](../data/chicago_daily.csv)." + - instructions: "Read in the first 10 rows of the `chicago_daily.csv` file [here](https://github.com/LADCO/training-r-intro/blob/main/data/chicago_daily.csv)." hints: - "# Use the `read.csv()` function to read in CSV files. 
Specify the file path and use `nrows` to limit the number of rows read." - "# To read the first 10 rows, set `nrows = 10` in the `read.csv()` function." @@ -515,4 +537,4 @@ exercises: explanation: | First, ensure the `chicago_daily.csv` file is saved in your working directory. Then, by using the `read.csv()` function with the `nrows` parameter set to 10, we can read only the first 10 rows of the file. code: | - read.csv("../data/chicago_daily.csv", nrows = 10) + read.csv("chicago_daily.csv", nrows = 10) diff --git a/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml b/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml index 99f8e7f..1b69e6f 100644 --- a/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml +++ b/source/3-Subsetting-Sorting-and-Combining/lesson3.yaml @@ -16,18 +16,34 @@ content: content: - type: paragraph content: | - This lesson assumes you are familiar with the material in the previous lesson on [Functions and Importing Data](../2-Functions-and_Importing-Data/readme.md). - - type: paragraph + This lesson assumes you are familiar with the material in the previous + lesson on Functions and Importing Data. + + The data from the R package region5air is used throughout these lessons. + To install the package from GitHub, use the `remotes` package. Run the code + below to install the `remotes` package and install `region5air` from GitHub. + - type: code + language: r + options: + exercise: false + exercise.eval: false + eval: false + exercise.cap: "Install remotes and region5air" content: | - The example data for exercises in this lesson is available directly from this package. It is assumed that this package is already installed and loaded into your R session. + # if you have not installed remotes + install.packages("remotes") + + library(remotes) + install_github("FluentData/region5air") - type: paragraph content: | To load the `chicago_air` data frame we will be using in the lesson, simply use the `data()` function to load the data frame. 
- type: code language: r options: - exercise: true + exercise: false exercise.eval: false + eval: false exercise.cap: "Load `chicago_air` Data Frame" content: | data(chicago_air) @@ -58,8 +74,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "View the First Few Rows of `chicago_air`" content: | data(chicago_air) @@ -70,11 +86,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Specify Number of Lines with `head()`" content: | - data(chicago_air) head(chicago_air, n = 3) - type: paragraph content: | @@ -82,11 +97,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "View the Last Few Rows of `chicago_air`" content: | - data(chicago_air) tail(chicago_air) - type: paragraph content: | @@ -94,11 +108,11 @@ content: - type: code language: r options: - exercise: true + exercise: false exercise.eval: false + eval: false exercise.cap: "Use `View()` to Inspect `chicago_air`" content: | - data(chicago_air) View(chicago_air) - type: image src: "images/view.png" @@ -109,14 +123,16 @@ content: - type: code language: r options: - exercise: true + exercise: false exercise.eval: false + eval: false exercise.cap: "Learn More About `chicago_air`" content: | ?chicago_air - type: paragraph content: | From the `Description` section of the help page, the `chicago_air` data frame is: + > A dataset containing daily values of ozone, temperature, and solar radiation from a Chicago monitor between January 1, 2021 and December 31, 2021. - type: section title: Subsetting @@ -129,7 +145,13 @@ content: content: - type: paragraph content: | - Values in a data frame can be selected, individually or in a group, based on their index values. These are integers that represent the locations in the data frame. 
If there is a 2 x 2 table, then there are 2 rows and 2 columns. Each cell can be represented by two numbers, like coordinates on a map. For a data frame, the format is `[row, column]`. Below is a table that shows the index values in each cell. + Values in a data frame can be selected, individually or in a group, + based on their index values. These are integers that represent the + locations in the data frame. If there is a 2 x 2 table, then there + are 2 rows and 2 columns. Each cell can be represented by two numbers, + like coordinates on a map. For a data frame, the format is `[row, column]`. + Below is a table that shows the index values in each cell. + |Column 1 | Column 2| |--- |--- | | `[1, 1]`| `[1, 2]`| @@ -140,8 +162,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Explore Data Frame Indexing with `my_data`" content: | my_data <- data.frame(colors = c("red", "green", "yellow"), @@ -154,8 +176,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Select a Specific Value Using Indexing" content: | my_data[2, 1] @@ -165,8 +187,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access Another Specific Value Using Indexing" content: | my_data[3, 2] @@ -176,8 +198,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Vector Indexing Example" content: | x <- c(1, 3, 2, 7, 25.3, 6) @@ -188,8 +210,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access a Vector Element by Index" content: | x[5] @@ -202,8 +224,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + 
exercise.eval: true exercise.cap: "Subset `chicago_air` Data Frame Using Indexing" content: | data(chicago_air) @@ -214,11 +236,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access Multiple Rows with a Vector of Row Numbers" content: | - data(chicago_air) chicago_air[c(1, 2, 5), ] - type: paragraph content: | @@ -226,11 +247,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access a Single Column" content: | - data(chicago_air) head(chicago_air[, 1]) - type: paragraph content: | @@ -241,11 +261,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access Multiple Columns" content: | - data(chicago_air) head(chicago_air[, c(3, 4, 6)]) - type: paragraph content: | @@ -256,11 +275,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Access a Column by Name" content: | - data(chicago_air) head(chicago_air[, "solar"]) - type: paragraph content: | @@ -268,11 +286,10 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Subset to Specific Columns by Name" content: | - data(chicago_air) head(chicago_air[, c("ozone", "temp", "month")]) - type: paragraph content: | @@ -280,8 +297,8 @@ content: - type: code language: r options: - exercise: true - exercise.eval: false + exercise: false + exercise.eval: true exercise.cap: "Specify Both Rows and Columns for Subsetting" content: | chicago_air[1:5, c("temp", "solar")] diff --git a/source/6-Basic-Statistics/lesson6.yaml b/source/6-Basic-Statistics/lesson6.yaml index 9aa374f..de579fa 100644 --- a/source/6-Basic-Statistics/lesson6.yaml +++ b/source/6-Basic-Statistics/lesson6.yaml @@ 
-21,7 +21,7 @@ content: content: | This lesson assumes you are familiar with the material in the lesson on [Functions and Importing Data](../2-Functions-and-Importing-Data/readme.md). - Statistical functions are used in this lesson that require installation of the following packages. + Statistical functions are used in this lesson that require installation of the `EnvStats` package. - type: code language: r options: @@ -51,7 +51,7 @@ content: for `NA` remove. If the argument is set to `TRUE` then the function will remove all missing values from the data set. Otherwise, the function will error. - These functions let us know the range of the data values, i.e., the highest and + These functions tell us the range of the data values, i.e., the highest and lowest values. - type: code language: r @@ -87,8 +87,8 @@ content: - type: paragraph content: | The `IQR()` function gives us the interquartile range, which lets us know how large - the spread is for the values in the central range of the distribution, i.e., between - the 1st quartile and the 3rd quartile. + the spread is for the values in the central range of the distribution, i.e., between + the 25th percentile and the 75th percentile. - type: code language: r options: @@ -99,7 +99,7 @@ content: - type: paragraph content: | We can use the `boxplot()` function to visualize the interquartile range. The outline - of the box itself shows the first and third quartile, while the line in the middle + of the box itself shows the middle 50% of the data, while the line in the middle of the box shows the median. - type: code language: r @@ -224,8 +224,8 @@ content: is true. Meaning, we can assume the distributions of values in the two groups are normal. - Now we can do some comparisons between these 2 groups of readings using the - Student's t-test. The test is meant to determine if the two means from the two + Now we can do some comparisons between these 2 months of measurements + using the Student's t-test.
The test is meant to determine if the two means from the two datasets are from the same distribution or not. The assumption, or null hypothesis, is that they are, in fact, mean values from the same distribution. - type: code @@ -337,6 +337,11 @@ content: exercise.cap: "Pairwise plots of select variables" content: | pairs(data[, c("Variable1", "Variable2", "Variable3")], lower.panel = panel.smooth) + - type: paragraph + content: | + In the lower-panel plots, the smoothed line slopes upward for ozone + and temp, slopes downward for temp and pressure, and is flat for + ozone and pressure. exercises: - instructions: "Find the mean and median of a specific column in the example data frame and compare the two values." hints: diff --git a/source/7-Quality-Assurance/lesson7.yaml b/source/7-Quality-Assurance/lesson7.yaml index 704bc6d..2c175e8 100644 --- a/source/7-Quality-Assurance/lesson7.yaml +++ b/source/7-Quality-Assurance/lesson7.yaml @@ -107,7 +107,13 @@ content: content: - type: paragraph content: | - Handling outliers is difficult because we do not necessarily want to remove data that may be uncommon but within the realm of possibility. + Handling outliers is difficult because we do not necessarily want + to remove data that may be uncommon but within the realm of possibility. + The best way to detect extreme values is to look at the summary of + your data and pay attention to the min and max values. You can also plot + the data to see if anything unusual stands out on visual inspection. + Boxplots with outliers plotted as points are handy for this. Below + is a boxplot of the ozone column in the `chicago_air` data frame.
- type: code language: r options: @@ -116,9 +122,21 @@ content: exercise.eval: false exercise.cap: "Identify and Handle Outliers" content: | - # Example of identifying and handling outliers - data("example_dataset") - boxplot(example_dataset$value) + boxplot(chicago_air$ozone) + - type: paragraph + content: | + We can see that two values are plotted as points on the high end of the distribution. + We can use the `boxplot.stats()` function to get the statistics that the `boxplot()` + function draws. The `out` element of the result holds the outliers. + - type: code + language: r + options: + eval: true + exercise: true + exercise.eval: false + exercise.cap: "Return Outlier Values from Boxplot" + content: | + boxplot.stats(chicago_air$ozone)$out - type: section title: Common Mistakes content: @@ -170,7 +188,7 @@ exercises: monitors[monitors == "site two"] <- "site 2" monitors - - instructions: "Use the boxplot to check for outliers in the ozone column of the built-in `airquality` data frame." + - instructions: "Use a boxplot to check for outliers in the ozone column of the built-in `airquality` data frame." hints: - "# First, load the `airquality` data frame using `data(\"airquality\")`." - "# Use the `boxplot()` function and specify `airquality$Ozone` as the argument to plot the ozone column."