# Section 01: Transforming Data with dplyr
### `01-Selecting columns`
Select the following four columns from the counties variable:

- state
- county
- population
- poverty

You don't need to save the result to a variable.
- Select the columns listed from the counties variable.



In [3]:
library(dplyr)
# Load the county-level dataset used by every exercise in this section.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
# NOTE(review): the printed outputs show county values wrapped in literal
# quotes and "North Dakota" split across the state/county columns, so the file
# may not be cleanly comma-separated -- confirm the sep/quote settings.
counties <- read.csv(
  file = "C:\\Users\\mosman\\Desktop\\Github\\Data_Scientist_with_R\\00_Datasets\\counties.csv",
  header = TRUE
)

In [4]:
# Keep only the state, county, population and poverty columns; the result is
# displayed rather than stored.
select(counties, state, county, population, poverty)

state,county,population,poverty
<chr>,<chr>,<int>,<dbl>
Alabama,"""Autauga""",55221,12.9
Alabama,"""Baldwin""",195121,13.4
Alabama,"""Barbour""",26932,26.7
Alabama,"""Bibb""",22604,16.8
Alabama,"""Blount""",57710,16.7
Alabama,"""Bullock""",10678,24.6
Alabama,"""Butler""",20354,25.4
Alabama,"""Calhoun""",116648,20.5
Alabama,"""Chambers""",34079,21.6
Alabama,"""Cherokee""",26008,19.2


### `02-Arranging observations`
- Add a verb to sort the observations by the `public_work` variable in descending order.

In [5]:
# Narrow the data to the location, population and employment columns.
counties_selected <- select(
  counties,
  state, county, population, private_work, public_work, self_employed
)

# Order the rows from the highest to the lowest public_work value.
arrange(counties_selected, desc(public_work))

state,county,population,private_work,public_work,self_employed
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>
Hawaii,"""Kalawao""",85,25.0,64.1,10.9
Alaska,"""Yukon-Koyukuk Census Area""",5644,33.3,61.7,5.1
Wisconsin,"""Menominee""",4451,36.8,59.1,3.7
North Dako,"ta ""Sioux""",4380,32.9,56.8,10.2
South Dako,"ta ""Todd""",9942,34.4,55.0,9.8
Alaska,"""Lake and Peninsula Borough""",1474,42.2,51.6,6.1
California,"""Lassen""",32645,42.6,50.5,6.8
South Dako,"ta ""Buffalo""",2038,48.4,49.5,1.8
South Dako,"ta ""Dewey""",5579,34.9,49.2,14.7
Texas,"""Kenedy""",565,51.9,48.1,0.0


### `03-Filtering for conditions`
- Find only the counties that have a population above one million (1000000).





In [6]:
# Work with just the location and population columns.
counties_selected <- select(counties, state, county, population)

# Keep only the rows whose population exceeds one million.
filter(counties_selected, population > 1e6)

state,county,population
<chr>,<chr>,<int>
Arizona,"""Maricopa""",4018143
California,"""Alameda""",1584983
California,"""Contra Costa""",1096068
California,"""Los Angeles""",10038388
California,"""Orange""",3116069
California,"""Riverside""",2298032
California,"""Sacramento""",1465832
California,"""San Bernardino""",2094769
California,"""San Diego""",3223096
California,"""Santa Clara""",1868149


- Find only the counties in the state of California that also have a population above one million (`1000000`).




In [7]:
# Work with just the location and population columns.
counties_selected <- select(counties, state, county, population)

# Keep California counties whose population exceeds one million; both
# conditions must hold for a row to be retained.
filter(counties_selected, state == "California" & population > 1e6)

state,county,population
<chr>,<chr>,<int>
California,"""Alameda""",1584983
California,"""Contra Costa""",1096068
California,"""Los Angeles""",10038388
California,"""Orange""",3116069
California,"""Riverside""",2298032
California,"""Sacramento""",1465832
California,"""San Bernardino""",2094769
California,"""San Diego""",3223096
California,"""Santa Clara""",1868149


### `04-Filtering and arranging`

- Filter for counties in the state of Texas that have more than ten thousand people (`10000`), and sort them in descending order of the percentage of people employed in private work.



In [8]:
# Narrow the data to the location, population and employment columns.
counties_selected <- select(
  counties,
  state, county, population, private_work, public_work, self_employed
)

# Texas counties with more than 10,000 residents, ordered from the highest to
# the lowest private_work value.
counties_selected %>%
  filter(state == "Texas" & population > 1e4) %>%
  arrange(desc(private_work))

state,county,population,private_work,public_work,self_employed
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>
Texas,"""Gregg""",123178,84.7,9.8,5.4
Texas,"""Collin""",862215,84.1,10.0,5.8
Texas,"""Dallas""",2485003,83.9,9.5,6.4
Texas,"""Harris""",4356362,83.4,10.1,6.3
Texas,"""Andrews""",16775,83.1,9.6,6.8
Texas,"""Tarrant""",1914526,83.1,11.4,5.4
Texas,"""Titus""",32553,82.5,10.0,7.4
Texas,"""Denton""",731851,82.2,11.9,5.7
Texas,"""Ector""",149557,82.0,11.2,6.7
Texas,"""Moore""",22281,82.0,11.7,5.9


### `05-Calculating the number of government employees`
- Use `mutate()` to add a column called `public_workers` to the dataset, with the number of people employed in public (government) work.

- Sort the observations in descending order of the new column.


In [9]:
# Keep the columns needed to estimate the number of public-sector workers.
counties_selected <- select(counties, state, county, population, public_work)

# public_work is divided by 100 (treated as a percentage) and scaled by
# population to give a head count; rows are then ordered from the largest to
# the smallest public_workers value.
counties_selected %>%
  mutate(public_workers = population * public_work / 100) %>%
  arrange(desc(public_workers))

state,county,population,public_work,public_workers
<chr>,<chr>,<int>,<dbl>,<dbl>
California,"""Los Angeles""",10038388,11.5,1154414.6
Illinois,"""Cook""",5236393,11.5,602185.2
California,"""San Diego""",3223096,14.8,477018.2
Arizona,"""Maricopa""",4018143,11.7,470122.7
Texas,"""Harris""",4356362,10.1,439992.6
New York,"""Kings""",2595259,14.4,373717.3
California,"""San Bernardino""",2094769,16.7,349826.4
California,"""Riverside""",2298032,14.9,342406.8
California,"""Sacramento""",1465832,21.8,319551.4
California,"""Orange""",3116069,10.2,317839.0


### `06-Calculating the percentage of women in a county`
- Select the columns `state`, `county`, `population`, `men`, and `women`.
- Add a new variable called `proportion_women` with the fraction of the county's population made up of women.


In [10]:
# Keep the location, total population, and the men/women counts.
counties_selected <- select(counties, state, county, population, men, women)

# Display the selected columns.
counties_selected

state,county,population,men,women
<chr>,<chr>,<int>,<int>,<int>
Alabama,"""Autauga""",55221,26700,28500
Alabama,"""Baldwin""",195121,95300,99800
Alabama,"""Barbour""",26932,14500,12400
Alabama,"""Bibb""",22604,12100,10500
Alabama,"""Blount""",57710,28500,29200
Alabama,"""Bullock""",10678,5660,5020
Alabama,"""Butler""",20354,9500,10900
Alabama,"""Calhoun""",116648,56300,60400
Alabama,"""Chambers""",34079,16300,17800
Alabama,"""Cherokee""",26008,13000,13000


In [11]:
# Keep the location, total population, and the men/women counts.
counties_selected <- select(counties, state, county, population, men, women)

# Add proportion_women: the women count divided by the county's population.
mutate(counties_selected, proportion_women = women / population)

state,county,population,men,women,proportion_women
<chr>,<chr>,<int>,<int>,<int>,<dbl>
Alabama,"""Autauga""",55221,26700,28500,0.5161080
Alabama,"""Baldwin""",195121,95300,99800,0.5114775
Alabama,"""Barbour""",26932,14500,12400,0.4604188
Alabama,"""Bibb""",22604,12100,10500,0.4645196
Alabama,"""Blount""",57710,28500,29200,0.5059782
Alabama,"""Bullock""",10678,5660,5020,0.4701255
Alabama,"""Butler""",20354,9500,10900,0.5355213
Alabama,"""Calhoun""",116648,56300,60400,0.5177971
Alabama,"""Chambers""",34079,16300,17800,0.5223158
Alabama,"""Cherokee""",26008,13000,13000,0.4998462


### `07-Select, mutate, filter, and arrange`

- Select only the columns `state`, `county`, `population`, `men`, and `women`.
- Add a variable `proportion_men` with the fraction of the county's population made up of men.
- Filter for counties with a population of at least ten thousand (`10000`).
- Arrange counties in descending order of their proportion of men.

In [12]:
# Rank counties with at least 10,000 residents by their share of men:
#   1. keep the location, population and men/women columns
#   2. compute proportion_men as men divided by population
#   3. drop counties with fewer than 10,000 residents
#   4. order from the highest to the lowest proportion_men
counties %>%
  select(state, county, population, men, women) %>%
  mutate(proportion_men = men / population) %>%
  filter(population >= 1e4) %>%
  arrange(desc(proportion_men))

state,county,population,men,women,proportion_men
<chr>,<chr>,<int>,<int>,<int>,<dbl>
Virginia,"""Sussex""",11864,8130,3730,0.6852664
California,"""Lassen""",32645,21800,10800,0.6677899
Georgia,"""Chattahoochee""",11914,7940,3970,0.6664428
Louisiana,"""West Feliciana""",15415,10200,5190,0.6616932
Florida,"""Union""",15191,9830,5360,0.6470937
Texas,"""Jones""",19978,12700,7330,0.6356993
Missouri,"""DeKalb""",12782,8080,4700,0.6321389
Texas,"""Madison""",13838,8650,5190,0.6250903
Virginia,"""Greensville""",11760,7300,4460,0.6207483
Texas,"""Anderson""",57915,35500,22400,0.6129673


### `The End`