## Grouping and summarizing


In [None]:
library(gapminder)
library(dplyr)
library(ggplot2)

In [None]:
# Overall median of lifeExp across every row of gapminder
gapminder %>%
  summarise(
    medianLifeExp = median(lifeExp)
  )

In [None]:
# Median life expectancy restricted to the 1957 observations
gapminder %>%
  filter(year == 1957) %>%
  summarise(
    medianLifeExp = median(lifeExp)
  )

In [None]:
# For 1957 only: median life expectancy alongside the largest GDP per capita
gapminder %>%
  filter(year == 1957) %>%
  summarise(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

In [None]:
# Per-year median life expectancy and maximum GDP per capita
gapminder %>%
  group_by(year) %>%
  summarise(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

In [None]:
# Per-continent median life expectancy and maximum GDP per capita,
# using only the 1957 rows
gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarise(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

In [None]:
# Median life expectancy and maximum GDP per capita for every
# continent/year pair
gapminder %>%
  group_by(continent, year) %>%
  summarise(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

In [None]:
# One row per year: median life expectancy and maximum GDP per capita
by_year <- gapminder %>%
  group_by(year) %>%
  summarise(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# Scatter plot of medianLifeExp over time, with the y axis stretched to include 0
ggplot(data = by_year, mapping = aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y = 0)

In [None]:
# Median GDP per capita for each continent/year pair: by_year_continent
by_year_continent <- gapminder %>%
  group_by(continent, year) %>%
  summarise(
    medianGdpPercap = median(gdpPercap)
  )

# Scatter plot of medianGdpPercap over time, one colour per continent,
# with the y axis stretched to include 0
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)

In [None]:
# Per-continent median GDP per capita and median life expectancy for 2007
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    medianGdpPercap = median(gdpPercap),
    medianLifeExp = median(lifeExp)
  )

# Scatter plot: median GDP per capita (x) against median life expectancy (y),
# coloured by continent
ggplot(
  data = by_continent_2007,
  mapping = aes(x = medianGdpPercap, y = medianLifeExp, color = continent)
) +
  geom_point()


## Types of visualization


### Line plot

In [None]:
# Median gdpPercap for each year, stored as by_year
by_year <- gapminder %>%
  group_by(year) %>%
  summarise(
    medianGdpPercap = median(gdpPercap)
  )

# Line plot of medianGdpPercap over time, with the y axis stretched to include 0
ggplot(data = by_year, mapping = aes(x = year, y = medianGdpPercap)) +
  geom_line() +
  expand_limits(y = 0)

In [None]:
# Median gdpPercap for each year/continent pair, stored as by_year_continent
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarise(
    medianGdpPercap = median(gdpPercap)
  )

# Line plot of medianGdpPercap over time, one coloured line per continent,
# with the y axis stretched to include 0
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_line() +
  expand_limits(y = 0)

### Bar plot

In [None]:
# Median gdpPercap per continent, using only the 1952 rows
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarise(
    medianGdpPercap = median(gdpPercap)
  )

# Bar plot with one bar per continent, bar height = medianGdpPercap
ggplot(data = by_continent, mapping = aes(x = continent, y = medianGdpPercap)) +
  geom_col()

In [None]:
# Keep only the rows for the Oceania continent in 1952
oceania_1952 <- gapminder %>%
  filter(
    continent == "Oceania",
    year == 1952
  )

# Bar plot with one bar per country, bar height = gdpPercap
ggplot(data = oceania_1952, mapping = aes(x = country, y = gdpPercap)) +
  geom_col()

### Histogram

In [None]:
# 1952 rows with population re-expressed in millions (pop_by_mil)
gapminder_1952 <- gapminder %>%
  filter(year == 1952) %>%
  mutate(pop_by_mil = pop / 1e6)

# Histogram of pop_by_mil using 50 bins
ggplot(data = gapminder_1952, mapping = aes(x = pop_by_mil)) +
  geom_histogram(bins = 50)


In [None]:
library(gapminder)
library(dplyr)
library(ggplot2)

# Restrict gapminder to the 1952 observations
gapminder_1952 <- filter(gapminder, year == 1952)

# Histogram of raw population (pop) drawn on a log10 x axis
ggplot(data = gapminder_1952, mapping = aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()

### Boxplot

In [None]:
# Restrict gapminder to the 1952 observations
gapminder_1952 <- filter(gapminder, year == 1952)

# Boxplot of gdpPercap per continent, with gdpPercap on a log10 y axis
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10()

In [None]:
# Restrict gapminder to the 1952 observations
gapminder_1952 <- filter(gapminder, year == 1952)

# Same per-continent boxplot (log10 y axis), now carrying the title
# "Comparing GDP per capita across continents"
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")

# Counties dataset

In [None]:
# Load the serialized counties object from the working directory
counties <- readRDS(file = "counties.rds")

# Compact overview of its columns
glimpse(counties)

### select and count verbs

In [None]:
# Keep just four columns: state, county, population, poverty
select(counties, state, county, population, poverty)

In [None]:
# Counties with at least 10,000 people, ranked by the share of men
counties %>%
  # Narrow to the five columns needed
  select(
    state, county, population,
    men, women
  ) %>%
  # proportion_men: men as a fraction of the county's population
  mutate(proportion_men = men / population) %>%
  # Drop counties below 10,000 people
  filter(population >= 10000) %>%
  # Largest proportion of men first
  arrange(desc(proportion_men))

In [None]:
# Working subset: region, state, population and citizens
counties_selected <- select(counties, region, state, population, citizens)

In [None]:
# Number of rows (counties) per region, most frequent region first
count(counties_selected, region, sort = TRUE)

In [None]:
# Per-state count weighted by the citizens column, largest first
count(counties_selected, state, wt = citizens, sort = TRUE)

In [None]:
# walk column indicates percentage of people in each county that walk to work
counties %>%
  # Select straight from counties: the counties_selected subset built earlier
  # (region, state, population, citizens) has no walk column, so piping from
  # it would fail inside mutate() with "object 'walk' not found"
  select(state, population, walk) %>%
  # Add population_walk containing the total number of people who walk to work
  mutate(population_walk = population * walk / 100) %>%
  # Count weighted by the new column, largest first
  count(state, wt = population_walk, sort = TRUE)

### aggregation

In [None]:
# Working subset: county, population, income and unemployment
counties_selected <- select(counties, county, population, income, unemployment)

In [None]:
# Single-row summary: smallest population, highest unemployment, mean income
counties_selected %>%
  summarise(
    min_population = min(population),
    max_unemployment = max(unemployment),
    average_income = mean(income)
  )



In [None]:
# Single-row summary: smallest population, highest unemployment, mean income
counties_selected %>%
  summarise(
    min_population = min(population),
    max_unemployment = max(unemployment),
    average_income = mean(income)
  )

In [None]:
# Working subset: state, county, population and land_area
counties_selected <- select(counties, state, county, population, land_area)

In [None]:
# Per-state totals of land area and population
counties_selected %>%
  group_by(state) %>%
  summarise(
    total_area = sum(land_area),
    total_population = sum(population)
  )
  
  


In [None]:
# Per-state totals, then density = population / area, densest state first
counties_selected %>%
  group_by(state) %>%
  summarise(
    total_area = sum(land_area),
    total_population = sum(population)
  ) %>%
  mutate(
    density = total_population / total_area
  ) %>%
  arrange(desc(density))

In [None]:
# Working subset: region, state, county and population
counties_selected <- select(counties, region, state, county, population)

In [None]:
# Two-stage aggregation producing average_pop and median_pop
counties_selected %>%
  group_by(region, state) %>%
  # Stage 1: total population of each region/state pair
  summarise(
    total_pop = sum(population)
  ) %>%
  # Stage 2: mean and median of those totals. No second group_by() is used;
  # this step relies on the grouping that summarise() leaves in place.
  summarise(
    average_pop = mean(total_pop),
    median_pop = median(total_pop)
  )

In [None]:
# Working subset: region, state, county, metro, population and walk
counties_selected <- select(
  counties,
  region, state, county, metro, population, walk
)

In [None]:
# Within each region, keep the row(s) with the largest walk value
counties_selected %>%
  group_by(region) %>%
  top_n(n = 1, wt = walk)
  

In [None]:
# Working subset: region, state, county, population and income
counties_selected <- select(counties, region, state, county, population, income)

In [None]:
# Mean county income per region/state pair, then the top state of each region
counties_selected %>%
  group_by(region, state) %>%
  # average_income: mean of income over the counties in each region/state pair
  summarise(
    average_income = mean(income)
  ) %>%
  # Keep the row(s) with the largest average_income in each remaining group
  top_n(n = 1, wt = average_income)

In [None]:
# Working subset: state, metro and population
counties_selected <- select(counties, state, metro, population)

### advanced selecting

In [None]:
# Compact overview of the counties table
glimpse(counties)

# state, county, population plus the contiguous run of columns from
# professional through production, ordered by service (largest first)
counties %>%
  select(
    state, county, population,
    professional:production
  ) %>%
  arrange(desc(service))

In [None]:
# Counties where public_work is 50 or higher
counties %>%
  # state, county, population and every column whose name ends in "work"
  select(
    state, county, population,
    ends_with("work")
  ) %>%
  # Keep rows with public_work of at least 50
  filter(public_work >= 50)

### rename

In [None]:
# Count rows per state, then rename the resulting n column to num_counties
state_counts <- count(counties, state)
rename(state_counts, num_counties = n)
  

In [None]:
## Renaming inside select()
# Keep state and county, and keep poverty under the new name poverty_rate
select(counties, state, county, poverty_rate = poverty)

### transmute = select + mutate

In [None]:
# Density of counties with more than one million people, least dense first
counties %>%
  # Keep state, county and population; add density = population / land_area
  transmute(
    state,
    county,
    population,
    density = population / land_area
  ) %>%
  # Only counties with a population above one million
  filter(population > 1e6) %>%
  # Ascending density
  arrange(density)
  

In [None]:
# rename(): unemployment becomes unemployment_rate, all other columns kept
rename(counties, unemployment_rate = unemployment)

# select(): state and county plus every column whose name contains "poverty"
select(counties, state, county, contains("poverty"))

# mutate(): add fraction_women while retaining all existing columns
mutate(counties, fraction_women = women / population)

# transmute(): only state, county and the computed employment_rate remain
transmute(counties, state, county, employment_rate = employed / population)

# Importing data

In [None]:
# List the files in the current working directory
list.files()

In [None]:
# Import swimming_pools.csv with read.csv() defaults: pools
pools <- read.csv("swimming_pools.csv")

# Show the structure of pools. str() prints its report itself and returns
# NULL invisibly, so wrapping it in print() would append a stray "NULL".
str(pools)

In [None]:
# Import swimming_pools.csv keeping text columns as character vectors: pools
pools <- read.csv("swimming_pools.csv", stringsAsFactors = FALSE)

# Check the structure of pools. str() prints its report itself and returns
# NULL invisibly, so wrapping it in print() would append a stray "NULL".
str(pools)

In [None]:
# Read hotdogs.txt with read.delim(), treating the first line as data
# rather than as column names: hotdogs
hotdogs <- read.delim(file = "hotdogs.txt", header = FALSE)

# Per-column summary of hotdogs
summary(hotdogs)

In [None]:
# Path to the hotdogs.txt file: path
path <- file.path("hotdogs.txt")
# path <- file.path("data", "hotdogs.txt")

# Import the tab-separated hotdogs.txt file, supplying column names
# explicitly because the first line is read as data: hotdogs
hotdogs <- read.table(
  path,
  header = FALSE,  # spelled out in full; `head` only worked via partial matching
  sep = "\t",
  col.names = c("type", "calories", "sodium")
)

# Show the first rows of hotdogs
head(hotdogs)

### Importing excel data

In [None]:
# Load the readxl package
library(readxl)

# List the worksheet names contained in urbanpop.xlsx
excel_sheets(path = "urbanpop.xlsx")

In [None]:
# readxl is expected to be attached already

# Load the first three worksheets individually, selecting each by position
pop_1 <- read_excel(path = "urbanpop.xlsx", sheet = 1)
pop_2 <- read_excel(path = "urbanpop.xlsx", sheet = 2)
pop_3 <- read_excel(path = "urbanpop.xlsx", sheet = 3)

# Collect the three imported sheets in a list: pop_list
pop_list <- list(pop_1, pop_2, pop_3)

# Inspect the structure of pop_list
str(pop_list)

In [None]:
# Import every worksheet of urbanpop.xlsx in one pass: pop_list
# Each sheet name returned by excel_sheets() is passed to read_excel()
# as its sheet argument.
pop_list <- lapply(
  excel_sheets("urbanpop.xlsx"),
  function(sheet_name) read_excel(path = "urbanpop.xlsx", sheet = sheet_name)
)

# Inspect the structure of pop_list
str(pop_list)