# Project: List of United States cities by population

## Load Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Exploring HTML

In [6]:
# Download the Wikipedia page and parse it into a BeautifulSoup tree.
url = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'}
# `headers` has to be passed by keyword: the second positional argument of
# requests.get() is `params`, so passing the dict positionally would append the
# User-Agent to the query string instead of sending it as a request header.
# The timeout keeps the cell from hanging forever on a stalled connection.
Data = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx response rather than parsing an error page.
Data.raise_for_status()
soup = BeautifulSoup(Data.content, 'html.parser')

In [38]:
# Take the first <table> whose class attribute matches "wikitable sortable".
# Author's note: the table will not appear until sorted; background in
# https://stackoverflow.com/questions/48976977/beautifulsoup-cant-extract-a-table-from-wiki
tables = soup.find("table", attrs={"class": "wikitable sortable"})

In [75]:
# Exploration step: list every <th> in the table. The column headers live in
# <th> tags (the output shows the per-row rank cells are <th> as well).
tables.find_all('th')

[<th scope="col">2020<br/>rank
 </th>,
 <th scope="col">City
 </th>,
 <th scope="col">State<sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[c]</a></sup>
 </th>,
 <th scope="col">2020<br/>census
 </th>,
 <th scope="col">2010<br/>census
 </th>,
 <th scope="col">Change
 </th>,
 <th colspan="2" scope="col">2020 land area
 </th>,
 <th colspan="2" scope="col">2020 population density
 </th>,
 <th scope="col">Location
 </th>,
 <th scope="row">1
 </th>,
 <th scope="row">2
 </th>,
 <th scope="row">3
 </th>,
 <th scope="row">4
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">6
 </th>,
 <th scope="row">7
 </th>,
 <th scope="row">8
 </th>,
 <th scope="row">9
 </th>,
 <th scope="row">10
 </th>,
 <th scope="row">11
 </th>,
 <th scope="row">12
 </th>,
 <th scope="row">13
 </th>,
 <th scope="row">14
 </th>,
 <th scope="row">15
 </th>,
 <th scope="row">16
 </th>,
 <th scope="row">17
 </th>,
 <th scope="row">18
 </th>,
 <th scope="row">19
 </th>,
 <th scope="row">20
 </th>,
 <th scope="r

In [80]:
# Every individual data cell of the table: all <td> elements found under a
# <tr> inside the <tbody>.
CELL_SELECTOR = 'tbody tr td'
items = tables.select(CELL_SELECTOR)

In [81]:
# Reduce each <td> to its text content; the trailing '\n' is still present
# here and is removed later, when the columns are split out.
infos = [
    cell.get_text()
    for cell in items
]
infos

['New York[d]\n',
 'New York\n',
 '8,804,190\n',
 '8,175,133\n',
 '+7.69%\n',
 '300.5\xa0sq\xa0mi\n',
 '778.3\xa0km2\n',
 '29,298/sq\xa0mi\n',
 '11,312/km2\n',
 '40°40′N 73°56′W\ufeff / \ufeff40.66°N 73.93°W\ufeff / 40.66; -73.93\ufeff (1 New York City)\n',
 'Los Angeles\n',
 'California\n',
 '3,898,747\n',
 '3,792,621\n',
 '+2.80%\n',
 '469.5\xa0sq\xa0mi\n',
 '1,216.0\xa0km2\n',
 '8,304/sq\xa0mi\n',
 '3,206/km2\n',
 '34°01′N 118°25′W\ufeff / \ufeff34.01°N 118.41°W\ufeff / 34.01; -118.41\ufeff (2 Los Angeles)\n',
 'Chicago\n',
 'Illinois\n',
 '2,746,388\n',
 '2,695,598\n',
 '+1.88%\n',
 '227.7\xa0sq\xa0mi\n',
 '589.7\xa0km2\n',
 '12,061/sq\xa0mi\n',
 '4,657/km2\n',
 '41°50′N 87°41′W\ufeff / \ufeff41.83°N 87.68°W\ufeff / 41.83; -87.68\ufeff (3 Chicago)\n',
 'Houston\n',
 'Texas\n',
 '2,304,580\n',
 '2,099,451\n',
 '+9.77%\n',
 '640.4\xa0sq\xa0mi\n',
 '1,658.6\xa0km2\n',
 '3,599/sq\xa0mi\n',
 '1,390/km2\n',
 '29°47′N 95°23′W\ufeff / \ufeff29.78°N 95.39°W\ufeff / 29.78; -95.39\ufeff (4 Ho

In [144]:
# The <td> texts in `infos` were scraped in order, row by row, so every table
# row contributes CELLS_PER_ROW consecutive entries. Column k of the table is
# therefore every CELLS_PER_ROW-th entry starting at offset k (e.g. the first
# city name is 10 entries away from the next city name).
CELLS_PER_ROW = 10


def _column(values, offset, stride=CELLS_PER_ROW):
    """Return every `stride`-th entry of `values` from `offset`, newlines removed.

    Parameters
    ----------
    values : list of str
        Flat list of cell texts.
    offset : int
        Index of the first entry to take.
    stride : int, optional
        Distance between consecutive entries of the same column.

    Returns
    -------
    list of str
        The selected entries with every '\\n' character removed.
    """
    return [value.replace('\n', '') for value in values[offset::stride]]


cities = _column(infos, 0)
States = _column(infos, 1)
twentycensus = _column(infos, 2)
tencensus = _column(infos, 3)
change = _column(infos, 4)
twentyLandAsq = _column(infos, 5)
twentyLandAkm = _column(infos, 6)
twentypopdensitysq = _column(infos, 7)
twentypopdensitykm = _column(infos, 8)
location = _column(infos, 9)

# Check the lengths: a DataFrame cannot be built from lists of unequal length,
# so differing numbers here mean the stride/offset assumption is wrong.
look = [cities, States, twentycensus, tencensus, change, twentyLandAsq,
        twentyLandAkm, twentypopdensitysq, twentypopdensitykm, location]

for column in look:
    print(len(column))

# Fail in this cell, with a readable message, instead of later when the
# DataFrame is constructed.
assert len({len(column) for column in look}) == 1, (
    f"columns have unequal lengths {[len(column) for column in look]}; "
    f"expected every table row to contribute {CELLS_PER_ROW} <td> cells"
)

In [154]:
# Map each output column name to its list of scraped values. Insertion order
# is kept, so it is also the column order of the DataFrame and of the CSV.
CitiesPop = dict([
    ('City', cities),
    ('State', States),
    ('2020_Census', twentycensus),
    ('2010_Census', tencensus),
    ('Change', change),
    ('2020_l_area_sq', twentyLandAsq),
    ('2020_l_area_km', twentyLandAkm),
    ('2020_Pdensity_sq', twentypopdensitysq),
    ('2020_Pdensity_km', twentypopdensitykm),
    ('Location', location),
])

# Build the table from the column lists.
Cities_df = pd.DataFrame(CitiesPop)

# Write the result to disk, leaving out the pandas row index.
Cities_df.to_csv('United States cities by population.csv', index=False)


In [152]:
# Preview the first five rows of the assembled table.
# NOTE(review): the saved output below has an 'Unnamed: 0' column and census
# values without thousands separators, which the DataFrame built above would
# not have; it presumably comes from an earlier run. Re-run to refresh.
Cities_df.head(5)

Unnamed: 0,City,State,2020_Census,2010_Census,Change,2020_l_area_sq,2020_l_area_km,2020_Pdensity_sq,2020_Pdensity_km,Location
0,New York[d],New York,8804190,8175133,+7.69%,300.5 sq mi,778.3 km2,"29,298/sq mi","11,312/km2",40°40′N 73°56′W﻿ / ﻿40.66°N 73.93°W﻿ / 40.66; ...
1,Los Angeles,California,3898747,3792621,+2.80%,469.5 sq mi,"1,216.0 km2","8,304/sq mi","3,206/km2",34°01′N 118°25′W﻿ / ﻿34.01°N 118.41°W﻿ / 34.01...
2,Chicago,Illinois,2746388,2695598,+1.88%,227.7 sq mi,589.7 km2,"12,061/sq mi","4,657/km2",41°50′N 87°41′W﻿ / ﻿41.83°N 87.68°W﻿ / 41.83; ...
3,Houston,Texas,2304580,2099451,+9.77%,640.4 sq mi,"1,658.6 km2","3,599/sq mi","1,390/km2",29°47′N 95°23′W﻿ / ﻿29.78°N 95.39°W﻿ / 29.78; ...
4,Phoenix,Arizona,1608139,1445632,+11.24%,518.0 sq mi,"1,341.6 km2","3,105/sq mi","1,199/km2",33°34′N 112°05′W﻿ / ﻿33.57°N 112.09°W﻿ / 33.57...
