this is my notebook for [chapter 3](https://www.modernstatisticswithr.com/datachapter.html) of [modern statistics with r](https://www.modernstatisticswithr.com/) by [måns thulin](https://github.com/mthulin)

[Thulin, M. (2021). Modern Statistics with R. Eos Chasma Press. ISBN 9789152701515.](https://www.modernstatisticswithr.com/)

In [2]:
# mount gdrive
from google.colab import drive
drive.mount('/gdrive')

# r magic 🧙🪄
%load_ext rpy2.ipython

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
%%R
# install / load packages
# install.packages(c("ggplot2", "openxlsx"))
library(ggplot2)
library(openxlsx)

# set working directory
setwd('/gdrive/MyDrive/GitHub/modern-stats-r')

# 1. data frames and data types

In [None]:
%%R
# class returns the class of a vector's elements when executed on a vector where each element has the same class
numbers <- c(6, 9, 12)
class(numbers)

[1] "numeric"


In [None]:
%%R
# R coerces all of the elements to the type character for a vector with multiple types where an element is a character
x <- 6
y <- "Scotland"
z <- TRUE

all_together <- c(x, y, z)
all_together
class(all_together)

[1] "character"


In [None]:
%%R
# having a look at data stored in a matrix
class(WorldPhones)

[1] "matrix" "array" 


In [None]:
%%R
WorldPhones

     N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
1951  45939  21574 2876   1815    1646     89      555
1956  60423  29990 4708   2568    2366   1411      733
1957  64721  32510 5230   2695    2526   1546      773
1958  68484  35218 6662   2845    2691   1663      836
1959  71799  37598 6856   3000    2868   1769      911
1960  76036  40341 8220   3145    3054   1905     1008
1961  79831  43173 9053   3338    3224   2005     1076


In [None]:
%%R
# and in a data frame
class(airquality)

[1] "data.frame"


In [None]:
%%R
airquality

    Ozone Solar.R Wind Temp Month Day
1      41     190  7.4   67     5   1
2      36     118  8.0   72     5   2
3      12     149 12.6   74     5   3
4      18     313 11.5   62     5   4
5      NA      NA 14.3   56     5   5
6      28      NA 14.9   66     5   6
7      23     299  8.6   65     5   7
8      19      99 13.8   59     5   8
9       8      19 20.1   61     5   9
10     NA     194  8.6   69     5  10
11      7      NA  6.9   74     5  11
12     16     256  9.7   69     5  12
13     11     290  9.2   66     5  13
14     14     274 10.9   68     5  14
15     18      65 13.2   58     5  15
16     14     334 11.5   64     5  16
17     34     307 12.0   66     5  17
18      6      78 18.4   57     5  18
19     30     322 11.5   68     5  19
20     11      44  9.7   62     5  20
21      1       8  9.7   59     5  21
22     11     320 16.6   73     5  22
23      4      25  9.7   61     5  23
24     32      92 12.0   61     5  24
25     NA      66 16.6   57     5  25
26     NA   

In [None]:
%%R
# and a tibble :>
class(msleep)

[1] "tbl_df"     "tbl"        "data.frame"


In [None]:
%%R
msleep

# A tibble: 83 × 11
   name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
   <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
 1 Cheet… Acin… carni Carn… lc                  12.1      NA        NA      11.9
 2 Owl m… Aotus omni  Prim… <NA>                17         1.8      NA       7  
 3 Mount… Aplo… herbi Rode… nt                  14.4       2.4      NA       9.6
 4 Great… Blar… omni  Sori… lc                  14.9       2.3       0.133   9.1
 5 Cow    Bos   herbi Arti… domesticated         4         0.7       0.667  20  
 6 Three… Brad… herbi Pilo… <NA>                14.4       2.2       0.767   9.6
 7 North… Call… carni Carn… vu                   8.7       1.4       0.383  15.3
 8 Vespe… Calo… <NA>  Rode… <NA>                 7        NA        NA      17  
 9 Dog    Canis carni Carn… domesticated        10.1       2.9       0.333  13.9
10 Roe d… Capr… herbi Arti… lc                   3        NA        NA      21  
# ℹ 73 m

## **exercise 3.1**: data types and structures

In [None]:
%%R
# 1 - double or single quotes make no difference
a <- 'that\'s manslaughter'
class(a)

[1] "character"


In [None]:
%%R
# 2 - numeric datatypes
x <- 1 + 2 # numeric
y <- 1L + 2 # numeric
z <- 1L + 2L # integer
class(z)

[1] "integer"


In [None]:
%%R
# 3 - character + numeric
b <- "pikmin" + 5 # throws an error for a non-numeric argument





Error in "pikmin" + 5 : non-numeric argument to binary operator


In [None]:
%%R
# 4 - logic * numeric
c <- 4 * TRUE # numeric
class(c)

[1] "numeric"


## **exercise 3.2:** `ncol`, `nrow`, `dim`, `names`, `row.names`

In [None]:
%%R
# applying functions to a dataframe
ncol(airquality)

[1] 6


In [None]:
%%R
nrow(airquality)

[1] 153


In [None]:
%%R
dim(airquality)

[1] 153   6


In [None]:
%%R
names(airquality)

[1] "Ozone"   "Solar.R" "Wind"    "Temp"    "Month"   "Day"    


In [None]:
%%R
row.names(airquality)

  [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
 [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
 [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
 [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
 [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
 [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
 [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
 [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
 [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
[109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
[121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
[133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
[145] "145" "146" "147" "148" "149" "150" "151" "152" "153"


## **exercise 3.3:** matrices

In [None]:
%%R
# construct a matrix from vectors
# ?matrix
x <- 1:6
matrix(x, nrow = 2)

     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6


In [None]:
%%R
matrix(x, ncol = 2)

     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6


# 2. vectors in data frames
## **exercise 3.4:** `[i, j]` notation

In [None]:
%%R
# 1 - indexing without an [i]
airquality[, 3] # returns the third column - [j] is the column index

  [1]  7.4  8.0 12.6 11.5 14.3 14.9  8.6 13.8 20.1  8.6  6.9  9.7  9.2 10.9 13.2
 [16] 11.5 12.0 18.4 11.5  9.7  9.7 16.6  9.7 12.0 16.6 14.9  8.0 12.0 14.9  5.7
 [31]  7.4  8.6  9.7 16.1  9.2  8.6 14.3  9.7  6.9 13.8 11.5 10.9  9.2  8.0 13.8
 [46] 11.5 14.9 20.7  9.2 11.5 10.3  6.3  1.7  4.6  6.3  8.0  8.0 10.3 11.5 14.9
 [61]  8.0  4.1  9.2  9.2 10.9  4.6 10.9  5.1  6.3  5.7  7.4  8.6 14.3 14.9 14.9
 [76] 14.3  6.9 10.3  6.3  5.1 11.5  6.9  9.7 11.5  8.6  8.0  8.6 12.0  7.4  7.4
 [91]  7.4  9.2  6.9 13.8  7.4  6.9  7.4  4.6  4.0 10.3  8.0  8.6 11.5 11.5 11.5
[106]  9.7 11.5 10.3  6.3  7.4 10.9 10.3 15.5 14.3 12.6  9.7  3.4  8.0  5.7  9.7
[121]  2.3  6.3  6.3  6.9  5.1  2.8  4.6  7.4 15.5 10.9 10.3 10.9  9.7 14.9 15.5
[136]  6.3 10.9 11.5  6.9 13.8 10.3 10.3  8.0 12.6  9.2 10.3 10.3 16.6  6.9 13.2
[151] 14.3  8.0 11.5


In [None]:
%%R
# 2 - extract the first 5 rows of airquality
airquality[1:5,]

  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5


In [None]:
%%R
# 3 - correlation between Temp and Wind using indexing
cor(airquality[,4],airquality[,3])

[1] -0.4579879


In [None]:
%%R
# 4 - extract all columns besides Temp and Wind
airquality[1:10,-c(3,4)] # had to check this one! added a row index for convenience

   Ozone Solar.R Month Day
1     41     190     5   1
2     36     118     5   2
3     12     149     5   3
4     18     313     5   4
5     NA      NA     5   5
6     28      NA     5   6
7     23     299     5   7
8     19      99     5   8
9      8      19     5   9
10    NA     194     5  10


## **exercise 3.5:** transforming data frames

In [None]:
%%R
# 1 - add a new variable
# construct df
age <- c(28, 48, 47, 71, 22, 80, 48, 30, 31)
purchase <- c(20, 59, 2, 12, 22, 160, 34, 34, 29)
bookstore <- data.frame(age, purchase)
bookstore$visit_length <- c(5, 2, 20, 22, 12, 31, 9, 10, 11)

bookstore$rev_per_minute <- bookstore$purchase / bookstore$visit_length
bookstore

  age purchase visit_length rev_per_minute
1  28       20            5      4.0000000
2  48       59            2     29.5000000
3  47        2           20      0.1000000
4  71       12           22      0.5454545
5  22       22           12      1.8333333
6  80      160           31      5.1612903
7  48       34            9      3.7777778
8  30       34           10      3.4000000
9  31       29           11      2.6363636


In [None]:
%%R
# 2 - replace a value
bookstore[6,2] <- 16
bookstore

  age purchase visit_length rev_per_minute
1  28       20            5      4.0000000
2  48       59            2     29.5000000
3  47        2           20      0.1000000
4  71       12           22      0.5454545
5  22       22           12      1.8333333
6  80       16           31      5.1612903
7  48       34            9      3.7777778
8  30       34           10      3.4000000
9  31       29           11      2.6363636


## **exercise 3.6:** checking conditions

In [None]:
%%R
# 1 - coldest day
names(airquality)
airquality[which.min(airquality$Temp), ] # 5th may

  Ozone Solar.R Wind Temp Month Day
5    NA      NA 14.3   56     5   5


In [None]:
%%R
# 2 - number of days with wind speed > 17mph
nrow(airquality[airquality$Wind > 17, ])

[1] 3


In [None]:
%%R
# 3 - number of missing Ozone values
nrow(airquality[is.na(airquality$Ozone), ])

[1] 37


In [None]:
%%R
# 4 - number of days with temperature below 70 and and wind speed above 10
nrow(airquality[airquality$Temp < 70 & airquality$Wind > 10, ])

[1] 22


## **exercise 3.7:** the `cut` function

In [None]:
%%R
# ?cut
airquality$TempCat <- cut(airquality$Temp, c(50, 70, 90, 110))
airquality$TempCat

  [1] (50,70]  (70,90]  (70,90]  (50,70]  (50,70]  (50,70]  (50,70]  (50,70] 
  [9] (50,70]  (50,70]  (70,90]  (50,70]  (50,70]  (50,70]  (50,70]  (50,70] 
 [17] (50,70]  (50,70]  (50,70]  (50,70]  (50,70]  (70,90]  (50,70]  (50,70] 
 [25] (50,70]  (50,70]  (50,70]  (50,70]  (70,90]  (70,90]  (70,90]  (70,90] 
 [33] (70,90]  (50,70]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [41] (70,90]  (90,110] (90,110] (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [49] (50,70]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [57] (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [65] (70,90]  (70,90]  (70,90]  (70,90]  (90,110] (90,110] (70,90]  (70,90] 
 [73] (70,90]  (70,90]  (90,110] (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [81] (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [89] (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (70,90] 
 [97] (70,90]  (70,90]  (70,90]  (70,90]  (70,90]  (90,110] (70,

# 3. importing data

In [5]:
%%R
# where am i?
getwd()
# oh no that's not what we want
setwd('/gdrive/MyDrive/GitHub/modern-stats-r')
getwd()
# home sweet home

[1] "/gdrive/MyDrive/GitHub/modern-stats-r"


In [None]:
%%R
# read in a csv
imported_data <- read.csv("data/philosophers.csv")
str(imported_data)

'data.frame':	6 obs. of  5 variables:
 $ Name       : chr  "Aristotle" "Basilides" "Cercops" "Dexippus" ...
 $ Description: chr  "Pretty influential, as philosophers go." "Denied the existence of incorporeal entities." "An Orphic poet" "Neoplatonic!" ...
 $ Born       : int  -384 -175 NA 235 50 80
 $ Deceased   : chr  "322 BC" "125 BC" "" "375 AD" ...
 $ Rating     : num  4.8 4 3.2 2.7 5 4.7


In [None]:
%%R
# does colab let you read in data with file.choose?
# imported_data2 <- file.choose()
# it prompts you for text input, which would be useful but my working directory isn't the same as the directory my data's in
file_path <- "data/philosophers.csv"
imported_data2 <- read.csv(file_path)
str(imported_data2)

'data.frame':	6 obs. of  5 variables:
 $ Name       : chr  "Aristotle" "Basilides" "Cercops" "Dexippus" ...
 $ Description: chr  "Pretty influential, as philosophers go." "Denied the existence of incorporeal entities." "An Orphic poet" "Neoplatonic!" ...
 $ Born       : int  -384 -175 NA 235 50 80
 $ Deceased   : chr  "322 BC" "125 BC" "" "375 AD" ...
 $ Rating     : num  4.8 4 3.2 2.7 5 4.7


In [None]:
%%R
# tuberculosis data from a URL
tb_data <- read.csv("https://tinyurl.com/whotbdata")
str(tb_data)

'data.frame':	1720 obs. of  20 variables:
 $ country                         : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
 $ iso2                            : chr  "AF" "AF" "AF" "AF" ...
 $ iso3                            : chr  "AFG" "AFG" "AFG" "AFG" ...
 $ iso_numeric                     : int  4 4 4 4 4 4 4 4 8 8 ...
 $ g_whoregion                     : chr  "EMR" "EMR" "EMR" "EMR" ...
 $ year                            : int  2015 2016 2017 2018 2019 2020 2021 2022 2015 2016 ...
 $ source_rr_new                   : chr  "Model" "Model" "Model" "Model" ...
 $ e_rr_pct_new                    : num  4.8 4.7 4.6 4.5 4.4 4.3 4.3 4.2 1.9 1.9 ...
 $ e_rr_pct_new_lo                 : num  0.42 0.45 0.48 0.51 0.53 0.53 0.53 0.55 1.2 1.2 ...
 $ e_rr_pct_new_hi                 : num  19 17 17 17 17 16 15 15 2.8 2.7 ...
 $ source_rr_ret                   : chr  "Surveillance" "Surveillance" "Surveillance" "Surveillance" ...
 $ e_rr_pct_ret                    : num  6 8.

In [None]:
%%R
# read from xlsx
file_path <- 'data/philosophers.xlsx'
imported_from_excel <- read.xlsx(file_path)
str(imported_from_excel)

'data.frame':	6 obs. of  5 variables:
 $ Name       : chr  "Aristotle" "Basilides" "Cercops" "Dexippus" ...
 $ Description: chr  "Pretty influential, as philosophers go." "Denied the existence of incorporeal entities." "An Orphic poet" "Neoplatonic!" ...
 $ Born       : num  -384 -175 NA 235 50 80
 $ Deceased   : chr  "322 BC" "125 BC" NA "375 AD" ...
 $ Rating     : chr  "4.8" "4" "3.2" "2.7" ...


## **exercise 3.8:** `read.csv` arguments

In [None]:
%%R
# import the data after opening it locally
vas <- read.csv('data/vas.csv', sep = ';', dec = ',', skip = 4)
vas[1:10, ]
# 1 - X and X.1 are columns without names or values?
# answer: column X is an empty column, X.1 has two comments in it

    ID       Date Visit  X  VAS X.1
1  201 2018-02-01     1 NA 10.0    
2  201 2018-02-02    NA NA  7.0    
3  201 2018-02-03    NA NA  5.9    
4  201 2018-02-04    NA NA  7.2    
5  201 2018-02-05    NA NA  8.6    
6  201 2018-02-06    NA NA  6.4    
7  201 2018-02-07    NA NA  9.1    
8  201 2018-02-08    NA NA  6.8    
9  201 2018-02-09    NA NA  8.4    
10 201 2018-02-10    NA NA  6.7    


In [None]:
%%R
# 2 - without sep argument
vas <- read.csv('data/vas.csv', dec = ',', skip = 4)
vas[1:10, ]
# throws error about duplicate row names - probably because it doesn't parse properly

  duplicate 'row.names' are not allowed




Error in read.table(file = file, header = header, sep = sep, quote = quote,  : 
  duplicate 'row.names' are not allowed


In [None]:
%%R
# 3 - without dec argument
vas <- read.csv('data/vas.csv', sep = ';', skip = 4)
class(vas$VAS)
# returns VAS as chr instead of num because it reads the comma as text and not as a decimal point

[1] "character"


In [None]:
%%R
# 4 - without skip argument
vas <- read.csv('data/vas.csv', sep = ';', dec = ',')
str(vas)
# returns metadata-type text as initial rows
# column names are all wrong, actual column names are treated as values, all columns (except X.2) are chr

'data.frame':	2369 obs. of  6 variables:
 $ Data.updated.2020.04.25: chr  "" "ID 213 not yet included" "" "ID" ...
 $ X                      : chr  "" "" "" "Date" ...
 $ X.1                    : chr  "" "" "" "Visit" ...
 $ X.2                    : logi  NA NA NA NA NA NA ...
 $ X.3                    : chr  "" "" "" "VAS" ...
 $ X.4                    : chr  "" "" "" "" ...


In [None]:
%%R
# 5 - skip 5 instead of 4
vas <- read.csv('data/vas.csv', sep = ';', dec = ',', skip = 5)
str(vas)
# column names are replaced with the first values

'data.frame':	2364 obs. of  6 variables:
 $ X201       : int  201 201 201 201 201 201 201 201 201 201 ...
 $ X2018.02.01: chr  "2018-02-02" "2018-02-03" "2018-02-04" "2018-02-05" ...
 $ X1         : int  NA NA NA NA NA NA NA NA NA NA ...
 $ X          : logi  NA NA NA NA NA NA ...
 $ X10        : num  7 5.9 7.2 8.6 6.4 9.1 6.8 8.4 6.7 6.1 ...
 $ X.1        : chr  "" "" "" "" ...


## **exercise 3.9:** more excel data

In [107]:
%%R
# 1 - importing data from a specific excel tab
# ?read.xlsx
email_import <- read.xlsx('data/projects-email.xlsx', sheet = 'Email') # works with names too!
email_import[1:10, ]

    ID               Name                           E-mail
1  591      Ashlyn Cortes      ashlyn.cortes@r-project.org
2  592      Darien Sutton      darien.sutton@r-project.org
3  593      Khalid Palmer      khalid.palmer@r-project.org
4  594          Tj Franks          tj.franks@r-project.org
5  595 Alexandros Merrill alexandros.merrill@r-project.org
6  596       Seren Graves       seren.graves@r-project.org
7  597      Liyana Bowers      liyana.bowers@r-project.org
8  598       Matias Melia       matias.melia@r-project.org
9  599       Rhianne Hall       rhianne.hall@r-project.org
10 600       Lukas Weston       lukas.weston@r-project.org


In [119]:
%%R
# 2 - unique emails
# ?unique
unique_emails <- unique(email_import$'E-mail')
length(unique_emails) < nrow(email_import) # works :)

[1] TRUE


## **exercise 3.10:** transposed data

In [6]:
%%R
# 1 - import it!
vas_transposed <- read.csv('data/vas-transposed.csv')
vas_transposed[, 1:10] # did the index in i at first, oops

      X         V1         V2         V3         V4         V5         V6
1    ID        201        201        201        201        201        201
2  Date 2018-02-01 2018-02-02 2018-02-03 2018-02-04 2018-02-05 2018-02-06
3 Visit          1       <NA>       <NA>       <NA>       <NA>       <NA>
4   VAS       10.0        7.0        5.9        7.2        8.6        6.4
          V7         V8         V9
1        201        201        201
2 2018-02-07 2018-02-08 2018-02-09
3       <NA>       <NA>       <NA>
4        9.1        6.8        8.4


In [9]:
%%R
# 2 - import with row names
vas_transposed <- read.csv('data/vas-transposed.csv', row.names=1)
vas_transposed[, 1:10]

              V1         V2         V3         V4         V5         V6
ID           201        201        201        201        201        201
Date  2018-02-01 2018-02-02 2018-02-03 2018-02-04 2018-02-05 2018-02-06
Visit          1       <NA>       <NA>       <NA>       <NA>       <NA>
VAS         10.0        7.0        5.9        7.2        8.6        6.4
              V7         V8         V9        V10
ID           201        201        201        201
Date  2018-02-07 2018-02-08 2018-02-09 2018-02-10
Visit       <NA>       <NA>       <NA>       <NA>
VAS          9.1        6.8        8.4        6.7


In [12]:
%%R
# 3 - transpose it back
vas <- as.data.frame(t(vas_transposed)) # adding the as.data.frame sorts out the types
vas[1:10, ]

[1] "data.frame"
