/
R_for_data_analysis.R
236 lines (176 loc) · 6.51 KB
/
R_for_data_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# R for Data Analysis
# A tidyverse approach
# Ryan Womack, rwomack@rutgers.edu
# 2024-01-29 version
# let's look at a few preliminaries before analyzing the data
# R is case sensitive
Getwd()
getwd()
getwd
# the console as calculator
2+2
# creating your own functions is easy
funkyadd<-function(x,y)
{
x+y+1
}
funkyadd(2,2)
# R has a full range of statistical methods such as sampling and probability distributions
sample(1:100,10)
rnorm(10)
rnorm(10, mean=100, sd=20)
rnorm(5, mean=1000, sd=20)
# the R help system is easy to access
?sample
?rnorm
ls()
# install packages
install.packages("tidyverse", dependencies = TRUE)
# update packages
update.packages()
# load packages
library(tidyverse)
# check help again
?tidyverse
library(help=tidyverse)
??tidy
# also see https://tidyverse.org for complete documentation
# grab data
# the is the World Bank Gender Statistics database
# see https://genderdata.worldbank.org for background
getOption("timeout")
options(timeout=6000)
download.file("https://databank.worldbank.org/data/download/Gender_Stats_CSV.zip", "gender.zip")
unzip("gender.zip")
# this is the base R read command
gender_data<-read.csv("Gender_StatsData.csv")
# from tidyverse, read_csv is a simpler alternative
gender_data <- read_csv("Gender_StatsData.csv")
# matrix notation [row,column]
gender_data[1,1]
gender_data[1:10,1:6]
gender_data[1:10,-1]
# drop columns
names(gender_data)
gender_data <- gender_data[,c(-2,-4)]
names(gender_data)
gender_data <- gender_data[,-66]
names(gender_data)
# from tidyr
# gather and spread used to be the commands to generate long or wide data
# these are no longer being developed
# pivot_longer and pivot_wider are the future
# if you are on Rstudio cloud or a system with limited memory
# I recommend you reduce the size of the dataset
# This will avoid crashes due to running out of memory
# gender_data<-gender_data[1:3000,]
# create data in long form
gender_data2 <- pivot_longer(gender_data, 3:65, names_to = "Year", values_to = "Value")
# alternative with pipes is below
# gender_data3 <-
# gender_data %>%
# pivot_longer(3:64, names_to = "Year", values_to = "Value")
#filter
gender_data2021 <-
gender_data2 %>%
filter(Year=="2021")
gender_data2021 <- gender_data2021[,-3]
gender_data2021wide <-
gender_data2021 %>%
pivot_wider(names_from = "Indicator Name", values_from = "Value")
# if we wanted to write output, we use a write function like write.csv
# write.csv(gender_data2020wide,"genderout.csv")
# now that we have created the dataset
# get information about data with summary
ls()
summary(gender_data)
summary(gender_data2021)
summary(gender_data2021wide)
# summarise is the tidyverse way, from dplyr
gender_data2021wide %>%
summarise_if(is.numeric, mean, na.rm=TRUE)
# ls lists items in the workspace, while rm removes them
# do not execute the command below unless you are sure! removes all files!
# rm(list = ls())
# studying the life expectancy variable
mean(gender_data2021wide$`Life expectancy at birth, female (years)`, na.rm=TRUE)
summary(gender_data2021wide$`Life expectancy at birth, female (years)`, na.rm=TRUE)
# attaching data
attach(gender_data2021wide)
mean(`Life expectancy at birth, female (years)`, na.rm=TRUE)
mean(`Life expectancy at birth, male (years)`, na.rm=TRUE)
summary(`Life expectancy at birth, female (years)`, na.rm=TRUE)
summary(`Life expectancy at birth, male (years)`, na.rm=TRUE)
# visualizing the data (very briefly)
plot(`Life expectancy at birth, female (years)`~`Life expectancy at birth, male (years)`)
abline(0,1)
# computing a new variable
lifespread <- `Life expectancy at birth, female (years)` - `Life expectancy at birth, male (years)`
mean(lifespread, na.rm=TRUE)
summary(lifespread, na.rm=TRUE)
plot(lifespread)
plot(lifespread~`GDP per capita (constant 2010 US$)`)
# R has a range of quick functions for descriptive statistics
sd(lifespread) # generates error
sd(lifespread, na.rm=TRUE)
var(lifespread, na.rm=TRUE)
median(lifespread, na.rm=TRUE)
quantile(lifespread, na.rm=TRUE)
summary(lifespread, na.rm=TRUE)
# even a histogram
hist(lifespread)
# creating a quick table to count observations
table(`Country Name`)
# we can create categorical variables
gender_data2021wide$hi_spread <- lifespread>5
gender_data2021wide$hi_age <- `Life expectancy at birth, female (years)`>78
# reattaching the data to make the freshly computed variables available
attach(gender_data2021wide)
# table and cross-tab on the categorical data
table(hi_spread)
table(hi_age)
table(hi_spread,hi_age)
# statistical tests in R are easy if you locate the correct command (in the help)
# t-test usage illustrated below
t.test(lifespread)
t.test(lifespread, mu=5)
t.test(lifespread, mu=4, conf.level=.99)
?t.test
# a two-sample t-test
t.test(lifespread~hi_age)
# we can also test
t.test(`Life expectancy at birth, female (years)`,`Life expectancy at birth, male (years)`)
# other tests like chisq.test are easily available
# consult the help
# linear regression
lm(lifespread~`GDP per capita (constant 2010 US$)`)
# with no intercept, add -1
lm(lifespread~`GDP per capita (constant 2010 US$)`-1)
# checking additional relationships
lm(lifespread~`Fertility rate, total (births per woman)`)
lm(`Fertility rate, total (births per woman)`~`GDP per capita (constant 2010 US$)`)
# to get full regression output, use summary
summary(lm(lifespread~`GDP per capita (constant 2010 US$)`))
summary(lm(lifespread~`Fertility rate, total (births per woman)`))
summary(lm(`Fertility rate, total (births per woman)`~`GDP per capita (constant 2010 US$)`))
# multiple explanatory variables? just use +
summary(lm(`Fertility rate, total (births per woman)`~`GDP per capita (constant 2010 US$)`+lifespread))
# we can store (and use) a regression output as an R object
regoutput<-lm(`Fertility rate, total (births per woman)`~`GDP per capita (constant 2010 US$)`+lifespread)
names(regoutput)
regoutput$residuals
# numerous quick functions are available to build off of the regression results
# predicted values
predict(regoutput)
# analysis of variance (anova)
anova(regoutput)
# correlation
cor(lifespread,`Fertility rate, total (births per woman)`)
cor(lifespread,`Fertility rate, total (births per woman)`, na.rm=TRUE)
# why won't those first two work?
# defaults and options of R commands are not very standardized
cor(lifespread,`Fertility rate, total (births per woman)`, use="complete.obs")
cor.test(lifespread,`Fertility rate, total (births per woman)`)
# we can also easily pull up regression diagnostic plots
plot(regoutput, pch=3)
# to be continued in parts 2 (data visualization) and 3 (data manipulation/wrangling)