<a href="https://colab.research.google.com/github/Ferrariagustinpablo/Data-Analytics-in-R/blob/main/Data_Manipulation_with_dplyr_(1)_using_US_census_dataset_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [16]:
# Load
library(dplyr)
library(ggplot2)

In [22]:
counties = readRDS("/content/counties.rds")
str(counties)

spec_tbl_df [3,138 × 40] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ census_id         : chr [1:3138] "1001" "1003" "1005" "1007" ...
 $ state             : chr [1:3138] "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ county            : chr [1:3138] "Autauga" "Baldwin" "Barbour" "Bibb" ...
 $ region            : chr [1:3138] "South" "South" "South" "South" ...
 $ metro             : chr [1:3138] "Metro" "Metro" "Nonmetro" "Metro" ...
 $ population        : num [1:3138] 55221 195121 26932 22604 57710 ...
 $ men               : num [1:3138] 26745 95314 14497 12073 28512 ...
 $ women             : num [1:3138] 28476 99807 12435 10531 29198 ...
 $ hispanic          : num [1:3138] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
 $ white             : num [1:3138] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
 $ black             : num [1:3138] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
 $ native            : num [1:3138] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
 $ asian         

# Select()

In [106]:
# Only select some rows
counties_selected <- counties %>%
  select(state, region, county, population, private_work, public_work, self_employed, unemployment, walk)

head(counties_selected)

state,region,county,population,private_work,public_work,self_employed,unemployment,walk
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Alabama,South,Autauga,55221,73.6,20.9,5.5,7.6,0.5
Alabama,South,Baldwin,195121,81.5,12.3,5.8,7.5,1.0
Alabama,South,Barbour,26932,71.8,20.8,7.3,17.6,1.8
Alabama,South,Bibb,22604,76.8,16.1,6.7,8.3,0.6
Alabama,South,Blount,57710,82.0,13.5,4.2,7.7,0.9
Alabama,South,Bullock,10678,79.5,15.1,5.4,18.0,5.0


# arrange: sorts data

In [34]:
# Sort data by population descending.
counties_selected %>%
  arrange(desc(population)) %>%
  head(3)

state,county,population,private_work,public_work,self_employed,unemployment
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
California,Los Angeles,10038388,79.0,11.5,9.4,10.0
Illinois,Cook,5236393,83.9,11.5,4.5,10.7
Texas,Harris,4356362,83.4,10.1,6.3,7.5


# Filter

In [36]:
# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
  filter(state == "California", population > 1000000)
  

state,county,population,private_work,public_work,self_employed,unemployment
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
California,Alameda,1584983,78.7,13.8,7.4,8.3
California,Contra Costa,1096068,77.5,13.6,8.7,8.8
California,Los Angeles,10038388,79.0,11.5,9.4,10.0
California,Orange,3116069,81.8,10.2,7.8,7.6
California,Riverside,2298032,77.1,14.9,7.8,12.9
California,Sacramento,1465832,70.8,21.8,7.3,11.8
California,San Bernardino,2094769,76.4,16.7,6.7,12.6
California,San Diego,3223096,77.3,14.8,7.7,8.7
California,Santa Clara,1868149,84.3,9.3,6.4,7.7


In [49]:
# Filter top 5 counties with unemployment that have more than 100000 in pop
unemp <- counties_selected %>%
  filter(population > 100000) %>%
  arrange(desc(unemployment))

# Display head  
head(unemp, 5)

state,county,population,private_work,public_work,self_employed,unemployment
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Arizona,Navajo,107656,61.1,31.3,7.4,19.8
California,Imperial,178206,66.6,27.8,5.2,17.4
California,Merced,263885,76.7,16.2,7.0,16.2
California,Stanislaus,527367,78.9,14.2,6.8,15.2
Michigan,Wayne,1778969,86.1,10.0,3.8,14.9


# Mutate

In [57]:
# Add new column of estimated unemployed people.
unemp %>%
  mutate(unemployed_pop = round(unemployment*population/100)) %>%
  arrange(desc(unemployed_pop)) %>%
  head(3)

state,county,population,private_work,public_work,self_employed,unemployment,unemployed_pop
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
California,Los Angeles,10038388,79.0,11.5,9.4,10.0,1003839
Illinois,Cook,5236393,83.9,11.5,4.5,10.7,560294
Texas,Harris,4356362,83.4,10.1,6.3,7.5,326727


# select, mutate, filter and arrange

In [59]:
counties %>%
  # Select the five columns 
  select(state, county, population, men, women) %>%
  # Add the proportion_men variable
  mutate(proportion_men = men / population) %>%
  # Filter for population of at least 10,000
  filter(population >= 10000) %>% 
  # Arrange proportion of men in descending order 
  arrange(desc(proportion_men)) %>%
  head()

state,county,population,men,women,proportion_men
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Virginia,Sussex,11864,8130,3734,0.6852664
California,Lassen,32645,21818,10827,0.6683412
Georgia,Chattahoochee,11914,7940,3974,0.6664428
Louisiana,West Feliciana,15415,10228,5187,0.6635096
Florida,Union,15191,9830,5361,0.6470937
Texas,Jones,19978,12652,7326,0.6332966


# Count how many counties has each state

In [62]:
counties %>%
  count(state, sort=T) %>%
  head()

state,n
<chr>,<int>
Texas,253
Georgia,159
Virginia,133
Kentucky,120
Missouri,115
Kansas,105


# Count Weighted by another variable

In [68]:
counties %>%
  count(state, wt = citizens, sort = TRUE) %>%
  head(3)

state,n
<chr>,<dbl>
California,24280349
Texas,16864864
Florida,13933052


# Exercises

## "What are the US states where the most people walk to work?"

In [77]:
counties %>%
  select(region, state, population, walk) %>%
  mutate(population_walk = population * walk / 100) %>%
  # Count weighted by the new column
  count(state, wt = population_walk, sort = TRUE) %>%
  head(3)

state,n
<chr>,<dbl>
New York,1237938.2
California,1017963.7
Pennsylvania,505397.2


In [81]:
# or using group_by and summarize
counties %>%
  select(region, state, population, walk) %>%
  mutate(population_walk = population * walk / 100) %>%
  group_by(state) %>%
  summarize(pop_walk = sum(population_walk)) %>% 
  arrange(desc(state)) %>%
  head(3)

state,pop_walk
<chr>,<dbl>
Wyoming,21981.33
Wisconsin,190405.65
West Virginia,51493.53


## Extract top 2 states by density

In [91]:
# groupby and add a density column
counties %>%
  group_by(state) %>%
  summarize(total_area = sum(land_area), total_population = sum(population)) %>%
  mutate(density_pop = total_population / total_area ) %>%
  arrange(desc(density_pop)) %>%
  head()

state,total_area,total_population,density_pop
<chr>,<dbl>,<dbl>,<dbl>
New Jersey,7354.22,8904413,1210.79
Rhode Island,1033.82,1053661,1019.192


Looks like New Jersey and Rhode Island are the “most crowded” of the US states, with more than a thousand people per square mile

## Extract top 2 counties by unemploymet in each state 

In [99]:
counties %>%
  group_by(state) %>%
  top_n(2, unemployment)

census_id,state,county,region,metro,population,men,women,hispanic,white,⋯,other_transp,work_at_home,mean_commute,employed,private_work,public_work,self_employed,family_work,unemployment,land_area
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1035,Alabama,Conecuh,South,Nonmetro,12865,6176,6689,1.6,51.0,⋯,0.3,1.3,29.7,3718,77.8,13.0,8.3,0.8,22.6,850.16
1131,Alabama,Wilcox,South,Nonmetro,11235,5376,5859,0.6,27.4,⋯,0.9,4.2,22.6,2838,68.9,26.0,5.1,0.0,20.8,888.50
2188,Alaska,Northwest Arctic Borough,West,Nonmetro,7732,4165,3567,1.8,11.8,⋯,21.2,4.6,7.4,2593,54.0,43.7,2.1,0.2,21.9,35572.58
2290,Alaska,Yukon-Koyukuk Census Area,West,Nonmetro,5644,3038,2606,1.9,21.8,⋯,20.1,4.9,8.6,2077,33.3,61.7,5.1,0.0,18.2,145504.79
4001,Arizona,Apache,West,Nonmetro,72124,35663,36461,6.3,19.3,⋯,1.4,6.3,28.8,18334,50.2,45.0,4.7,0.1,18.2,11197.52
4017,Arizona,Navajo,West,Nonmetro,107656,53984,53672,11.1,42.6,⋯,2.3,7.3,22.3,31955,61.1,31.3,7.4,0.3,19.8,9950.42
5041,Arkansas,Desha,South,Nonmetro,12379,5741,6638,4.6,46.4,⋯,3.5,1.3,19.2,4355,69.9,25.7,4.5,0.0,17.7,768.15
5107,Arkansas,Phillips,South,Nonmetro,20391,9529,10862,1.6,35.2,⋯,1.3,2.8,19.5,6932,69.4,23.6,7.0,0.0,18.1,695.66
6025,California,Imperial,West,Metro,178206,91167,87039,81.8,12.6,⋯,2.4,4.0,21.3,58391,66.6,27.8,5.2,0.4,17.4,4176.60
6047,California,Merced,West,Metro,263885,133152,130733,56.9,29.9,⋯,3.4,3.9,26.1,96170,76.7,16.2,7.0,0.2,16.2,1934.97


## Group by region and find the greatest number of citizens who walk to work

In [121]:
# Group by region and find the greatest number of citizens who walk to work
counties_selected %>%
  group_by(region) %>%
  top_n(1, walk)

state,region,county,population,private_work,public_work,self_employed,unemployment,walk
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Alaska,West,Aleutians East Borough,3304,78.6,15.4,5.9,2.9,71.2
New York,Northeast,New York,1629507,83.6,8.5,7.8,7.5,20.7
North Dakota,North Central,McIntosh,2759,62.7,16.6,19.5,1.2,17.5
Virginia,South,Lexington city,7071,80.3,14.9,4.8,2.7,31.7


## Finding the highest-income state in each region

In [122]:
counties %>%
  select(region, state, county, population, income) %>%
  group_by(region, state) %>% 
  summarize(mean_income = mean(income)) %>%
  top_n(1,mean_income)

`summarise()` has grouped output by 'region'. You can override using the `.groups` argument.



region,state,mean_income
<chr>,<chr>,<dbl>
North Central,North Dakota,55574.87
Northeast,New Jersey,73014.1
South,Maryland,69200.38
West,Alaska,65124.54


## In how many states do more people live in metro areas than non-metro areas?

In [128]:
# a) Find the total population for each combination of state and metro
counties %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  head(2)

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.



state,metro,total_pop
<chr>,<chr>,<dbl>
Alabama,Metro,3671377
Alabama,Nonmetro,1159243


In [130]:
# Extract the most populated row from each state, which will be either Metro or Nonmetro.
counties %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  top_n(1, total_pop) %>% 
  head(2)


`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.



state,metro,total_pop
<chr>,<chr>,<dbl>
Alabama,Metro,3671377
Alaska,Metro,494990


In [132]:
# Ungroup, then count how often Metro or Nonmetro appears to see how many states have more people living in those areas.

counties %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  top_n(1, total_pop) %>% 
  ungroup() %>%
  count(metro)

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.



metro,n
<chr>,<int>
Metro,44
Nonmetro,6
