# Section 03: Selecting and Transforming Data
### `01-Selecting columns`
- Use `glimpse()` to examine all the variables in the `counties` table.
- There are five consecutive variables in the table related to the industry of people's work: `professional`, `service`, `office`, `construction`, and `production`. Select the columns `state`, `county`, `population`, and (using a colon) all five of those industry-related variables.
- Arrange the table in descending order of `service` to find which counties have the highest rates of working in the service industry.

In [1]:
library(dplyr)
# Load the counties dataset into a data frame.
# NOTE(review): this absolute Windows path is specific to one machine; point
# it at your local copy of counties.csv before running.
# NOTE(review): the outputs recorded in this notebook show county values
# wrapped in literal quotes and state names cut to 10 characters (e.g.
# "Mississipp"), which suggests the file is not being parsed as intended --
# confirm the CSV's quoting/format.
counties <- read.csv(
  file = "C:\\Users\\mosman\\Desktop\\Github\\Data_Scientist_with_R\\00_Datasets\\counties.csv",
  header = TRUE
)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [4]:
# Show a transposed overview of counties: row/column counts, then one line
# per column with its type and first few values
counties %>%
  glimpse()

Rows: 3,138
Columns: 40
$ census_id          <int> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 101…
$ state              <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam…
$ county             <chr> "\"Autauga\"", "\"Baldwin\"", "\"Barbour\"", "\"Bib…
$ region             <chr> "South", "South", "South", "South", "South", "South…
$ metro              <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No…
$ population         <int> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1…
$ men                <int> 26700, 95300, 14500, 12100, 28500, 5660, 9500, 5630…
$ women              <int> 28500, 99800, 12400, 10500, 29200, 5020, 10900, 604…
$ hispanic           <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7…
$ white              <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.

In [3]:
# Find the counties with the highest rates of service-industry work
counties %>%
  select(
    state, county, population,
    professional:production  # colon picks up the five consecutive industry columns
  ) %>%
  # Largest service values first
  arrange(desc(service))

state,county,population,professional,service,office,construction,production
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mississipp,"i ""Tunica""",10477,23.9,36.6,21.5,3.5,14.5
Texas,"""Kinney""",3577,30.0,36.5,11.6,20.5,1.3
Texas,"""Kenedy""",565,24.9,34.1,20.5,20.5,0.0
New York,"""Bronx""",1428357,24.3,33.3,24.2,7.1,11.0
Texas,"""Brooks""",7221,19.6,32.4,25.3,11.1,11.5
Colorado,"""Fremont""",46809,26.6,32.2,22.8,10.7,7.6
Texas,"""Culberson""",2296,20.1,32.2,24.2,15.7,7.8
California,"""Del Norte""",27788,33.9,31.5,18.8,8.9,6.8
Minnesota,"""Mahnomen""",5496,26.8,31.5,18.7,13.1,9.9
Virginia,"""Lancaster""",11129,30.3,31.2,22.8,8.1,7.6


### `02-Select helpers`
- Select the columns `state`, `county`, `population`, and all those that end with `work`.
- Filter just for the counties where at least 50% of the population is engaged in public work.

In [5]:
# Counties where at least 50% of people are engaged in public work
counties %>%
  select(
    state, county, population,
    ends_with("work")  # private_work, public_work, family_work
  ) %>%
  # Keep only rows at or above the 50% threshold
  filter(public_work >= 50)

state,county,population,private_work,public_work,family_work
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>
Alaska,"""Lake and Peninsula Borough""",1474,42.2,51.6,0.2
Alaska,"""Yukon-Koyukuk Census Area""",5644,33.3,61.7,0.0
California,"""Lassen""",32645,42.6,50.5,0.1
Hawaii,"""Kalawao""",85,25.0,64.1,0.0
North Dako,"ta ""Sioux""",4380,32.9,56.8,0.1
South Dako,"ta ""Todd""",9942,34.4,55.0,0.8
Wisconsin,"""Menominee""",4451,36.8,59.1,0.4


### `03-Renaming a column after count`
- Use `count()` to determine how many counties are in each state.

In [8]:
# Tally how many counties each state has; the tally is returned in column n
count(counties, state)

state,n
<chr>,<int>
Alabama,67
Alaska,28
Arizona,15
Arkansas,75
California,58
Colorado,64
Connecticu,8
Delaware,3
Florida,67
Georgia,159


- Notice the `n` column in the output; use `rename()` to rename it to `num_counties`.

In [6]:
# Tally the counties per state, then give the default n column a
# descriptive name (new_name = old_name)
count(counties, state) %>%
  rename(num_counties = n)

state,num_counties
<chr>,<int>
Alabama,67
Alaska,28
Arizona,15
Arkansas,75
California,58
Colorado,64
Connecticu,8
Delaware,3
Florida,67
Georgia,159


### `04-Renaming a column as part of a select`
Select the columns `state`, `county`, and `poverty` from the `counties` dataset; in the same step, rename the `poverty` column to `poverty_rate`.


In [9]:
# Keep state, county, and poverty, renaming poverty to poverty_rate within
# the same select() call (new_name = old_name)
select(counties, state, county, poverty_rate = poverty)

state,county,poverty_rate
<chr>,<chr>,<dbl>
Alabama,"""Autauga""",12.9
Alabama,"""Baldwin""",13.4
Alabama,"""Barbour""",26.7
Alabama,"""Bibb""",16.8
Alabama,"""Blount""",16.7
Alabama,"""Bullock""",24.6
Alabama,"""Butler""",25.4
Alabama,"""Calhoun""",20.5
Alabama,"""Chambers""",21.6
Alabama,"""Cherokee""",19.2


### `05-Using transmute`
- Keep only the `state`, `county`, and `population` columns, and add a new column, `density`, that contains the `population` per `land_area`.
- Filter for only counties with a population greater than one million.
- Sort the table in ascending order of density.

In [10]:
# Population density of counties with more than one million people,
# least dense first
counties %>%
  # transmute() keeps only the columns listed here, including the new density
  transmute(
    state,
    county,
    population,
    density = population / land_area
  ) %>%
  # Population strictly greater than one million
  filter(population > 1e6) %>%
  # arrange() sorts ascending by default
  arrange(density)

state,county,population,density
<chr>,<chr>,<int>,<dbl>
California,"""San Bernardino""",2094769,104.4408
Nevada,"""Clark""",2035572,257.9612
California,"""Riverside""",2298032,318.9054
Arizona,"""Maricopa""",4018143,436.7547
Florida,"""Palm Beach""",1378806,699.9015
California,"""San Diego""",3223096,766.1269
Washington,"""King""",2045756,966.8034
Texas,"""Travis""",1121645,1132.9747
Florida,"""Hillsborough""",1302884,1277.3373
Florida,"""Orange""",1229039,1361.062


### `06-Choosing among the four verbs`

- Choose the right verb for changing the name of the `unemployment` column to `unemployment_rate`.
- Choose the right verb for keeping only the columns `state`, `county`, and the ones containing `poverty`.
- Calculate a new column called `fraction_women` with the fraction of the population made up of women, without dropping any columns.
- Keep only three columns: the `state`, `county`, and `employed / population`, which you'll call `employment_rate`.

In [11]:
# rename(): change unemployment to unemployment_rate, leaving every other
# column in place
rename(counties, unemployment_rate = unemployment)

# select(): keep state, county, and every column whose name contains "poverty"
select(counties, state, county, contains("poverty"))

# mutate(): append fraction_women (women / population) while keeping all
# existing columns
mutate(counties, fraction_women = women / population)

# transmute(): keep only state, county, and the newly computed employment_rate
transmute(counties, state, county, employment_rate = employed / population)

census_id,state,county,region,metro,population,men,women,hispanic,white,⋯,other_transp,work_at_home,mean_commute,employed,private_work,public_work,self_employed,family_work,unemployment_rate,land_area
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1001,Alabama,"""Autauga""",South,Metro,55221,26700,28500,2.6,75.8,⋯,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6,594
1003,Alabama,"""Baldwin""",South,Metro,195121,95300,99800,4.5,83.1,⋯,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5,1590
1005,Alabama,"""Barbour""",South,Nonmetro,26932,14500,12400,4.6,46.2,⋯,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6,885
1007,Alabama,"""Bibb""",South,Metro,22604,12100,10500,2.2,74.5,⋯,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3,623
1009,Alabama,"""Blount""",South,Metro,57710,28500,29200,8.6,87.9,⋯,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7,645
1011,Alabama,"""Bullock""",South,Nonmetro,10678,5660,5020,4.4,22.2,⋯,1.7,2.8,27.5,3865,79.5,15.1,5.4,0.0,18.0,623
1013,Alabama,"""Butler""",South,Nonmetro,20354,9500,10900,1.2,53.3,⋯,0.6,1.7,24.6,7813,77.4,16.2,6.2,0.2,10.9,777
1015,Alabama,"""Calhoun""",South,Metro,116648,56300,60400,3.5,73.0,⋯,1.2,2.7,24.1,47401,74.1,20.8,5.0,0.1,12.3,606
1017,Alabama,"""Chambers""",South,Nonmetro,34079,16300,17800,0.4,57.3,⋯,0.4,2.1,25.1,13689,85.1,12.1,2.8,0.0,8.9,597
1019,Alabama,"""Cherokee""",South,Nonmetro,26008,13000,13000,1.5,91.7,⋯,0.7,2.5,27.4,10155,73.1,18.5,7.9,0.5,7.9,554


state,county,poverty,child_poverty
<chr>,<chr>,<dbl>,<dbl>
Alabama,"""Autauga""",12.9,18.6
Alabama,"""Baldwin""",13.4,19.2
Alabama,"""Barbour""",26.7,45.3
Alabama,"""Bibb""",16.8,27.9
Alabama,"""Blount""",16.7,27.2
Alabama,"""Bullock""",24.6,38.4
Alabama,"""Butler""",25.4,39.2
Alabama,"""Calhoun""",20.5,31.6
Alabama,"""Chambers""",21.6,37.2
Alabama,"""Cherokee""",19.2,30.1


census_id,state,county,region,metro,population,men,women,hispanic,white,⋯,work_at_home,mean_commute,employed,private_work,public_work,self_employed,family_work,unemployment,land_area,fraction_women
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1001,Alabama,"""Autauga""",South,Metro,55221,26700,28500,2.6,75.8,⋯,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6,594,0.5161080
1003,Alabama,"""Baldwin""",South,Metro,195121,95300,99800,4.5,83.1,⋯,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5,1590,0.5114775
1005,Alabama,"""Barbour""",South,Nonmetro,26932,14500,12400,4.6,46.2,⋯,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6,885,0.4604188
1007,Alabama,"""Bibb""",South,Metro,22604,12100,10500,2.2,74.5,⋯,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3,623,0.4645196
1009,Alabama,"""Blount""",South,Metro,57710,28500,29200,8.6,87.9,⋯,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7,645,0.5059782
1011,Alabama,"""Bullock""",South,Nonmetro,10678,5660,5020,4.4,22.2,⋯,2.8,27.5,3865,79.5,15.1,5.4,0.0,18.0,623,0.4701255
1013,Alabama,"""Butler""",South,Nonmetro,20354,9500,10900,1.2,53.3,⋯,1.7,24.6,7813,77.4,16.2,6.2,0.2,10.9,777,0.5355213
1015,Alabama,"""Calhoun""",South,Metro,116648,56300,60400,3.5,73.0,⋯,2.7,24.1,47401,74.1,20.8,5.0,0.1,12.3,606,0.5177971
1017,Alabama,"""Chambers""",South,Nonmetro,34079,16300,17800,0.4,57.3,⋯,2.1,25.1,13689,85.1,12.1,2.8,0.0,8.9,597,0.5223158
1019,Alabama,"""Cherokee""",South,Nonmetro,26008,13000,13000,1.5,91.7,⋯,2.5,27.4,10155,73.1,18.5,7.9,0.5,7.9,554,0.4998462


state,county,employment_rate
<chr>,<chr>,<dbl>
Alabama,"""Autauga""",0.4343637
Alabama,"""Baldwin""",0.4405113
Alabama,"""Barbour""",0.3192113
Alabama,"""Bibb""",0.3669262
Alabama,"""Blount""",0.3844914
Alabama,"""Bullock""",0.3619592
Alabama,"""Butler""",0.3838558
Alabama,"""Calhoun""",0.4063593
Alabama,"""Chambers""",0.4016843
Alabama,"""Cherokee""",0.3904568


### `The End`