# Data manipulation in `R` with `dplyr`


First, load `dplyr` and create an example data frame:

In [21]:
library(dplyr)

# Example data: nine stops, all dated 2012-07-15, with the precinct number,
# the recorded race, whether a weapon was found, and the person's age.
stop_data <- data.frame(
  date = rep(as.Date("2012-07-15"), times = 9),
  precinct = c(72, 10, 10, 72, 3, 72, 3, 3, 3),
  race = c(
    "black", "white", "black", "black", "black",
    "white", "black", "black", "black"
  ),
  weapon = c(FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
  age = c(26, 62, 18, 37, 54, 47, 34, 29, 17)
)

# Display the full table.
stop_data

date,precinct,race,weapon,age
2012-07-15,72,black,False,26
2012-07-15,10,white,True,62
2012-07-15,10,black,False,18
2012-07-15,72,black,False,37
2012-07-15,3,black,True,54
2012-07-15,72,white,False,47
2012-07-15,3,black,False,34
2012-07-15,3,black,False,29
2012-07-15,3,black,False,17


## Filter

In [19]:
# Keep only the rows whose precinct equals 3; all columns are retained.
stop_data %>% filter(precinct == 3)

date,precinct,race,weapon,age
2012-07-15,3,black,True,54
2012-07-15,3,black,False,34
2012-07-15,3,black,False,29
2012-07-15,3,black,False,17


## Select

In [20]:
# Keep just the date and precinct columns; every row is retained.
stop_data %>% select(date, precinct)

date,precinct
2012-07-15,72
2012-07-15,10
2012-07-15,10
2012-07-15,72
2012-07-15,3
2012-07-15,72
2012-07-15,3
2012-07-15,3
2012-07-15,3


## Transform

In [23]:
# Add a logical column `teen` that is TRUE for ages 13 through 19.
# The comparison already yields a logical vector, so wrapping it in
# ifelse(..., TRUE, FALSE) is unnecessary.
stop_data %>%
  mutate(teen = age > 12 & age < 20)

date,precinct,race,weapon,age,teen
2012-07-15,72,black,False,26,False
2012-07-15,10,white,True,62,False
2012-07-15,10,black,False,18,True
2012-07-15,72,black,False,37,False
2012-07-15,3,black,True,54,False
2012-07-15,72,white,False,47,False
2012-07-15,3,black,False,34,False
2012-07-15,3,black,False,29,False
2012-07-15,3,black,False,17,True


## Aggregate

In [24]:
# Collapse the table to a single row: the mean of the logical `weapon`
# column, i.e. the fraction of stops where weapon is TRUE.
stop_data %>% summarise(hit_rate = mean(weapon))

hit_rate
0.2222


## Sort

In [25]:
# Reorder the rows by age, smallest value first.
stop_data %>% arrange(age)

date,precinct,race,weapon,age
2012-07-15,3,black,False,17
2012-07-15,10,black,False,18
2012-07-15,72,black,False,26
2012-07-15,3,black,False,29
2012-07-15,3,black,False,34
2012-07-15,72,black,False,37
2012-07-15,72,white,False,47
2012-07-15,3,black,True,54
2012-07-15,10,white,True,62


## Group-wise operations

In [27]:
# Split the rows by race, then compute the hit rate within each group;
# the result has one row per distinct race value.
stop_data %>%
  group_by(race) %>%
  summarise(hit_rate = mean(weapon))

race,hit_rate
black,0.1429
white,0.5


In [28]:
# Same aggregation as before, but grouped by precinct instead:
# one row per distinct precinct value.
stop_data %>%
  group_by(precinct) %>%
  summarise(hit_rate = mean(weapon))

precinct,hit_rate
3,0.25
10,0.5
72,0.0


In [29]:
# Hit rate for every precinct/race combination present in the data.
# summarize() only removes the last grouping level, so without ungroup()
# the result would still be grouped by precinct and any later verb applied
# to it would silently operate per precinct.
stop_data %>%
  group_by(precinct, race) %>%
  summarize(hit_rate = mean(weapon)) %>%
  ungroup()

precinct,race,hit_rate
3,black,0.25
10,black,0.0
10,white,1.0
72,black,0.0
72,white,0.0


## Joins

In [38]:
# Two small tables that share the key column `user`.
# Users 26 and 567 have no row in `activity`, and user 8 in `activity`
# has no row in `users`, so the join types below give different results.
users <- data.frame(
  user = c(23, 789, 234, 7, 26, 567, 2),
  sex = c("male", "female", "female", "male", "male", "female", "female")
)

activity <- data.frame(
  user = c(23, 789, 234, 7, 2, 8),
  activity = c(3, 2, 1, 1, 1, 3)
)

# Display both tables.
users
activity

user,sex
23,male
789,female
234,female
7,male
26,male
567,female
2,female


user,activity
23,3
789,2
234,1
7,1
2,1
8,3


### *Left* join

In [39]:
# Keep every row of `users`; `activity` is missing where there is no match.
# No `by` is given, so dplyr joins on the shared column and reports it.
users %>% left_join(activity)

Joining, by = "user"


user,sex,activity
23,male,3.0
789,female,2.0
234,female,1.0
7,male,1.0
26,male,
567,female,
2,female,1.0


### *Inner* join

In [40]:
# Keep only the users that appear in both tables.
# No `by` is given, so dplyr joins on the shared column and reports it.
users %>% inner_join(activity)

Joining, by = "user"


user,sex,activity
23,male,3
789,female,2
234,female,1
7,male,1
2,female,1


### *Full* join

In [41]:
# Keep every user from either table; columns from the table with no
# matching row are left missing.
# No `by` is given, so dplyr joins on the shared column and reports it.
users %>% full_join(activity)

Joining, by = "user"


user,sex,activity
23,male,3.0
789,female,2.0
234,female,1.0
7,male,1.0
26,male,
567,female,
2,female,1.0
8,,3.0
