A cheatsheet for ggplot2, compiled mostly from the book *ggplot2: Elegant Graphics for Data Analysis*.
Documentation for ggplot is available here.
Table of Contents
- Loading ggplot
- Basic use with qplot
- Build a plot layer by layer
- Basic plot types
- Displaying distributions
- Deal with overplotting
- Surface plots
- Drawing maps
- [Choropleth map](http://en.wikipedia.org/wiki/Choropleth_map)
- Annotating a plot
- Faceting
# Default knitr chunk options for the whole document: suppress warnings and
# messages in the output and use fig.width = 8, fig.height = 4 for figures.
# The call is namespace-qualified so it works even when knitr has not been
# attached with library(knitr).
knitr::opts_chunk$set(
  warning = FALSE, message = FALSE, fig.width = 8, fig.height = 4
)
library(ggplot2)
Load the sample data
# Fix the RNG seed so the random subsample drawn below is reproducible.
set.seed(1410)

# First rows of the diamonds data set.
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48

# A small working sample: 100 randomly chosen rows of diamonds.
dsmall <- diamonds[sample(nrow(diamonds), 100), ]

# Scatterplot of price against carat on the full data set.
qplot(x = carat, y = price, data = diamonds)

# Same axes on the sample, with color and shape mapped to variables and the
# points drawn at a fixed alpha of 1/2.
qplot(
  x = carat, y = price, data = dsmall,
  color = color, shape = cut, alpha = I(1/2)
)

# Points with a smoother layered on top.
qplot(x = carat, y = price, data = dsmall, geom = c("point", "smooth"))
There are many different smoothers that can be selected with the `method` argument.
# Points plus a smoother, with method = "lm" passed through qplot().
qplot(
  x = carat, y = price, data = dsmall,
  geom = c("point", "smooth"), method = "lm"
)

# Price per carat by color: jittered points, then boxplots.
qplot(x = color, y = price / carat, data = diamonds, geom = "jitter")
qplot(x = color, y = price / carat, data = diamonds, geom = "boxplot")

# Distribution of carat split by color: histogram, then density curves.
qplot(x = carat, data = diamonds, geom = "histogram", fill = color)
qplot(x = carat, data = diamonds, geom = "density", color = color)
Change the amount of smoothing with the `binwidth` argument.
# Histograms of carat at three bin widths, from coarse to fine.
qplot(x = carat, data = diamonds, geom = "histogram", binwidth = 1)
qplot(x = carat, data = diamonds, geom = "histogram", binwidth = 0.1)
qplot(x = carat, data = diamonds, geom = "histogram", binwidth = 0.01)

# Bar plot of diamond color.
qplot(x = color, data = diamonds, geom = "bar")

# Bar plot of diamond color weighted by carat, with the y-axis named "carat".
qplot(x = color, data = diamonds, geom = "bar", weight = carat) +
  scale_y_continuous(name = "carat")

# First rows of the economics data set.
head(economics)
## date pce pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712 9.8 4.5 2944
## 2 1967-07-31 510.9 198911 9.8 4.7 2945
## 3 1967-08-31 516.7 199113 9.0 4.6 2958
## 4 1967-09-30 513.3 199311 9.8 4.9 3143
## 5 1967-10-31 518.5 199498 9.7 4.7 3066
## 6 1967-11-30 526.2 199657 9.4 4.8 3018

# Line plot of unemploy / pop over time.
qplot(x = date, y = unemploy / pop, data = economics, geom = "line")

# One histogram of carat per color (facet rows), with x limited to 0-3.
qplot(
  x = carat, data = diamonds, facets = color ~ .,
  geom = "histogram", binwidth = 0.1, xlim = c(0, 3)
)
- `xlim` and `ylim`: set limits for x- and y-axis (e.g. `xlim=c(0,20)`)
- `main`: main title for the plot
- `xlab` and `ylab`: labels for x- and y-axis
# Scatterplot of price against carat with custom axis labels and a title.
# x is carat (weight) and y is price, so the labels are assigned accordingly.
qplot(
  x = carat, y = price, data = dsmall,
  xlab = "Weight (carats)",
  ylab = "Price ($)",
  main = "Price-weight relationship"
)
More complicated, multi-layer plots can be generated using `ggplot()`.
# Three labelled points used to show what each basic geom draws.
df <- data.frame(
  x = c(3, 1, 5),
  y = c(2, 4, 6),
  label = c("a", "b", "c")
)

# Shared base plot: x, y and label mapped, axis titles removed.
p <- ggplot(data = df, mapping = aes(x = x, y = y, label = label)) +
  xlab(NULL) +
  ylab(NULL)

# The same data drawn with each basic geom; the title names the geom used.
p + geom_point() + ggtitle("geom_point")
p + geom_bar(stat = "identity") + ggtitle("geom_bar(stat=\"identity\")")
p + geom_line() + ggtitle("geom_line")
p + geom_area() + ggtitle("geom_area")
p + geom_path() + ggtitle("geom_path")
p + geom_text() + ggtitle("geom_text")
p + geom_tile() + ggtitle("geom_tile")
p + geom_polygon() + ggtitle("geom_polygon")
For 1d data, the geom is the histogram.
# Base plot for the depth distribution: depth on x, limited to 58-68.
depth_dist <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  xlim(58, 68)

# Histogram with the default bin settings.
depth_dist + geom_histogram()
To compare the distribution between groups, there are a couple of options:
# Density-scaled histogram of depth, one facet row per cut.
depth_dist +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.1) +
  facet_grid(cut ~ .)

# Histogram filled by cut, using position = "fill".
depth_dist +
  geom_histogram(mapping = aes(fill = cut), binwidth = 0.1, position = "fill")

# Density-scaled frequency polygons, one colored line per cut.
depth_dist +
  geom_freqpoly(mapping = aes(y = ..density.., color = cut), binwidth = 0.1)

# Boxplot of depth for each cut.
qplot(x = cut, y = depth, data = diamonds, geom = "boxplot")

# Boxplots of depth along carat, grouping carat into 0.1-wide bins with
# plyr's round_any().
library(plyr)
qplot(
  x = carat, y = depth, data = diamonds, geom = "boxplot",
  group = round_any(carat, 0.1, floor), xlim = c(0, 3)
)

# Jittered points of cty, then drv, for each class in mpg.
qplot(x = class, y = cty, data = mpg, geom = "jitter")
qplot(x = class, y = drv, data = mpg, geom = "jitter")

# Density of depth overall, then filled by cut with alpha fixed at 0.2.
qplot(x = depth, data = diamonds, geom = "density", xlim = c(54, 70))
qplot(
  x = depth, data = diamonds, geom = "density", xlim = c(54, 70),
  fill = cut, alpha = I(0.2)
)
- Make the points smaller
# 2000 points with independent standard-normal x and y.
df <- data.frame(
  x = rnorm(2000),
  y = rnorm(2000)
)
norm <- ggplot(data = df, mapping = aes(x = x, y = y))

# Default points, then shape 1, then "." as the point shape.
norm + geom_point()
norm + geom_point(shape = 1)
norm + geom_point(shape = ".") # pixel-sized
- Use alpha blending
# alpha() comes from the scales package.
library(scales)

# The same scatterplot with black points at alpha 1/3, 1/5 and 1/10.
norm + geom_point(color = alpha("black", 1 / 3))
norm + geom_point(color = alpha("black", 1 / 5))
norm + geom_point(color = alpha("black", 1 / 10))
- Randomly jitter if there is some discreteness
# Base plot of depth against table, both axes limited to 50-70.
td <- ggplot(data = diamonds, mapping = aes(x = table, y = depth)) +
  xlim(50, 70) +
  ylim(50, 70)

# Plain points, then points with default jitter.
td + geom_point()
td + geom_jitter()

# Jitter with an explicit width of 0.5, reused for the plots below.
jit <- position_jitter(width = 0.5)
td + geom_jitter(position = jit)

# Same jitter combined with black points at alpha 1/10, 1/50 and 1/200.
td + geom_jitter(position = jit, color = alpha("black", 1 / 10))
td + geom_jitter(position = jit, color = alpha("black", 1 / 50))
td + geom_jitter(position = jit, color = alpha("black", 1 / 200))
# Load the maps package and its us.cities data set.
library(maps)
data(us.cities)

# Keep only the cities whose pop exceeds 500000.
big_cities <- subset(us.cities, pop > 500000)

# Plot those cities by longitude/latitude and add the "state" borders.
qplot(x = long, y = lat, data = big_cities) +
  borders("state", size = 0.5)
# Polygon data for the "state" map.
states <- map_data("state")

# USArrests with lower-cased column names, plus a lower-cased region column
# taken from the row names, used as the merge key below.
arrests <- setNames(USArrests, tolower(names(USArrests)))
arrests$region <- tolower(rownames(arrests))

# Attach the arrest columns to every polygon vertex of the matching region.
choro <- merge(states, arrests, by = "region")

# merge() does not keep the original row order, and the order of the rows
# matters when polygons are drawn, so sort by the order column again.
choro <- choro[order(choro$order), ]

# Choropleths: fill each polygon by assault, then by assault / murder.
qplot(
  x = long, y = lat, data = choro,
  group = group, fill = assault, geom = "polygon"
)
qplot(
  x = long, y = lat, data = choro,
  group = group, fill = assault / murder, geom = "polygon"
)
Annotations are just extra data, added to a plot either:
- one at a time
- many at once
# Base line plot of unemploy over time; every annotation below is added to it.
unemp <- qplot(
  x = date, y = unemploy, data = economics, geom = "line",
  xlab = "", ylab = "No. unemployed (1000s)"
)

# Drop the first three rows of presidential.
# NOTE(review): presumably these terms fall before the economics series
# starts - confirm against the installed data set.
presidential <- presidential[-(1:3), ]

# Value ranges of the plotted series, used to position annotations.
yrng <- range(economics$unemploy)
xrng <- range(economics$date)

# A vertical line at the start of each row of presidential.
unemp +
  geom_vline(
    mapping = aes(xintercept = as.numeric(start)),
    data = presidential
  )

# A rectangle per row of presidential, spanning the full y range and filled
# by party with translucent blue/red.
unemp +
  geom_rect(
    mapping = aes(NULL, NULL, xmin = start, xmax = end, fill = party),
    ymin = yrng[1], ymax = yrng[2], data = presidential
  ) +
  scale_fill_manual(values = alpha(c("blue", "red"), 0.2))

# Add each name as text at the bottom-left of its rectangle. last_plot() picks
# up the plot built just above, so this statement must stay directly after it.
last_plot() +
  geom_text(
    mapping = aes(x = start, y = yrng[1], label = name),
    data = presidential, size = 3, hjust = 0, vjust = 0
  )
# Caption text wrapped at 40 characters and joined with newlines.
# The original text misspelled "a lot" as "alot".
caption <- paste(
  strwrap(
    "Unemployment rates in the US have varied a lot over the years",
    width = 40
  ),
  collapse = "\n"
)

# Place the caption at the top-right corner of the data range; hjust = 1 and
# vjust = 1 anchor the text by its top-right corner.
unemp +
  geom_text(
    mapping = aes(x, y, label = caption),
    data = data.frame(x = xrng[2], y = yrng[2]),
    hjust = 1, vjust = 1, size = 4
  )

# Mark the row(s) of economics where unemploy is at its maximum with a
# translucent red point.
highest <- subset(economics, unemploy == max(unemploy))
unemp + geom_point(data = highest, size = 3, color = alpha("red", 0.3))
# Facet columns by cyl.
qplot(x = cty, y = hwy, data = mpg) + facet_grid(. ~ cyl)

# Facet rows by cyl.
qplot(x = cty, data = mpg, geom = "histogram", binwidth = 2) +
  facet_grid(cyl ~ .)

# Facet rows by drv and columns by cyl.
qplot(x = cty, y = hwy, data = mpg) + facet_grid(drv ~ cyl)

# Scatterplot with a linear fit and no confidence band. TRUE/FALSE are
# spelled out because T and F are ordinary variables that can be reassigned.
p <- qplot(x = displ, y = hwy, data = mpg) +
  geom_smooth(method = "lm", se = FALSE)

# The same plot faceted by cyl x drv, without and then with margin panels.
p + facet_grid(cyl ~ drv)
p + facet_grid(cyl ~ drv, margins = TRUE)
# round_any() comes from plyr.
library(plyr)

# NOTE(review): `movies` is not created anywhere in the visible code. It is
# presumably the data set shipped with older ggplot2 releases (later moved to
# the ggplot2movies package) - confirm it is available before running.
# Round each year down to the start of its decade.
movies$decade <- round_any(movies$year, 10, floor)

# Density-scaled histogram of rating for decades after 1890, one panel per
# decade laid out in 6 columns.
qplot(
  x = rating, y = ..density.., data = subset(movies, decade > 1890),
  geom = "histogram", binwidth = 0.5
) +
  facet_wrap(~ decade, ncol = 6)