In [1]:
# import the necessary libraries
import re
import csv
import requests
import pandas as pd
import numpy as np
from time import sleep
from bs4 import BeautifulSoup

### Getting the density data

There are 2 wikipedia pages offering the density data:
- [List of United States cities by population](https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population)
- [List of United States cities by population density](https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population_density)

I chose the first one because the second is sourced from the 2010 census dataset, while the first one is based on more recent data from 2016.


In [2]:
# NOTE: files will be saved to this directory, so you need to ensure it exists
# before running the cells below (they open files inside it for writing).
data_dir = "../Data/"
# File name (inside data_dir) of the html table scraped from wikipedia
wiki_file_name = "wiki-pop-table.html"
# File name (inside data_dir) of the csv generated from that html table
wiki_csv_output = "wiki-table-population.csv"

In [3]:
def download_wiki_table():
    """Download the Wikipedia city-population table and save it to disk.

    Requests the "List of United States cities by population" page, takes the
    first ``<table>`` matching the class ``wikitable sortable`` and writes its
    prettified html to ``{data_dir}{wiki_file_name}``.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if no matching table is found in the page.
    """
    # Url to scrape
    base_url = 'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
    # Bound the wait so a stalled connection cannot hang the notebook
    wiki_page = requests.get(base_url, timeout=30)
    # Fail loudly rather than parsing (and caching) an error page
    wiki_page.raise_for_status()

    soup = BeautifulSoup(wiki_page.text, "html.parser")

    # Grab the table from this page
    wiki_table = soup.find("table", {"class": "wikitable sortable"})
    if wiki_table is None:
        raise ValueError(f"No 'wikitable sortable' table found at {base_url}")

    # Open a (new) file in the data_dir to save the table's html.
    # The table contains non-ASCII characters, so the encoding is explicit
    # instead of relying on the platform default.
    with open(f"{data_dir}{wiki_file_name}", "w", encoding="utf-8") as file:
        print(f"Writing html output for file {wiki_file_name}")
        file.write(wiki_table.prettify())

# Write to disk so we don't have to repeat the download each time
download_wiki_table()

Writing html output for file wiki-pop-table.html


In [4]:
def int_repr(string_num):
    """Convert a numeric string, with or without thousands commas, to an int.

    Args:
        string_num: text such as ``"1,234"`` or ``"42"``.

    Returns:
        The integer value of the text with every comma removed.

    Raises:
        ValueError: if the text is not an integer once the commas are removed.
    """
    # Removing commas is a no-op for strings that contain none
    digits_only = string_num.replace(',', '')
    return int(digits_only)
    
def remove_end_brackets(string_brackets):
    """Strip trailing bracketed markers such as ``[d]`` from a string.

    One or more consecutive ``[...]`` groups at the very end of the string
    are removed, whatever their length (``[d]``, ``[12]``, ``[e][f]``).
    Brackets that are not at the end, and strings without brackets, are
    returned unchanged.

    Args:
        string_brackets: text that may end in bracketed markers.

    Returns:
        The text without its trailing bracketed markers.
    """
    # Anchored at the end so only trailing markers are removed; a fixed
    # three-character slice would corrupt markers of any other length.
    return re.sub(r"(?:\[[^\[\]]*\])+$", "", string_brackets)
    

In [5]:
# Run this when you want to get the items from the downloaded wiki table again
def get_city_pop_dens_data(): 
    """Parse the saved Wikipedia table and write the city data to a csv file.

    Reads the html table from ``{data_dir}{wiki_file_name}`` and writes one
    row per city to ``{data_dir}{wiki_csv_output}`` with the columns
    ``city, state, pop_est_2019, pop_est_2010, pop_dens_2016_mi,
    pop_dens_2016_km``. The two density values are converted to integers;
    the population estimates are written as the text found in the table.

    Rows with too few cells, or whose density cells hold unexpectedly long
    text, are reported on stdout and skipped.
    """
    # Load the saved html file as a bs object; the context manager makes sure
    # the file handle is closed once the document is parsed
    with open(f"{data_dir}{wiki_file_name}", encoding="utf-8") as html_file:
        wiki_table = BeautifulSoup(html_file, "html.parser")

    with open(f"{data_dir}{wiki_csv_output}", 'w', newline='', encoding="utf-8") as file:
        # Create the csv writer and add the column names
        writer = csv.writer(file)
        writer.writerow(["city", "state", "pop_est_2019", "pop_est_2010",
                         "pop_dens_2016_mi", "pop_dens_2016_km"])

        # For the stats: number of city rows actually written to the csv
        row_count = 0

        # Rename cities needed to match later on
        rename_cities = {
            "Fort Lauderdale":"Ft. Lauderdale",
            "Saint Paul": "St. Paul",
            "New York City":"New York",
        }

        # Grab each row from the table, first row is the header of the table
        for row in wiki_table.find_all("tr")[1:]:

            # Grab all the html columns for this row
            row_items = row.find_all('td')

            # Cells 0..9 are read below, so at least 10 cells are required
            if len(row_items) <= 9:
                print("Error not matching this row")
                continue

            # Grabs each row's information we're after
            city_name = row_items[1].get_text(strip=True)
            state_name = row_items[2].get_text(strip=True)
            pop_est_2019 = row_items[3].get_text(strip=True)
            pop_est_2010 = row_items[4].get_text(strip=True)
            pop_dens_2016_mi_raw = row_items[8].get_text(strip=True)
            pop_dens_2016_km_raw = row_items[9].get_text(strip=True)

            # Overly long text in a density cell means the columns did not
            # line up as expected for this row: report it and skip it.
            # Skipped rows were never counted, so row_count is left untouched.
            if len(pop_dens_2016_mi_raw) > 15 or len(pop_dens_2016_km_raw) > 15:
                print(f"ERROR Wrongly formatted city -> '{city_name}'")
                print(f"Pop density miles -> '{pop_dens_2016_mi_raw}'")
                print(f"Pop density km -> '{pop_dens_2016_km_raw}' \n")
                continue

            # Remove brackets from the city name if present
            city_name = remove_end_brackets(city_name)

            # Exceptions
            if city_name in rename_cities:
                print(F"Renaming -> {city_name}")
                city_name = rename_cities[city_name]

            # Remove unicode space characters, see:
            # https://stackoverflow.com/a/26068871/8970591
            pop_dens_2016_mi_raw = pop_dens_2016_mi_raw.replace(u'\xa0', ' ')
            pop_dens_2016_km_raw = pop_dens_2016_km_raw.replace(u'\xa0', ' ')

            # Get the density per square mile as an integer
            split_pop_dens_2016_mi = pop_dens_2016_mi_raw.split('/sq')
            pop_dens_2016_mi = int_repr(split_pop_dens_2016_mi[0])

            # Get the density per square km as an integer
            split_pop_dens_2016_km = pop_dens_2016_km_raw.split('/km2')
            pop_dens_2016_km = int_repr(split_pop_dens_2016_km[0])

            # Append the row with the data
            writer.writerow([city_name, state_name, pop_est_2019,
                             pop_est_2010, pop_dens_2016_mi, pop_dens_2016_km])

            row_count += 1

        print(f"Done creaing city information csv with {row_count} city rows of total 317 US cities")

# Build the csv from the html table saved on disk
get_city_pop_dens_data()


Renaming -> New York City
Renaming -> Saint Paul
Renaming -> Fort Lauderdale
ERROR Wrongly formatted city -> 'South Carolina'
Pop density miles -> '573/km2'
Pop density km -> '32°55′04″N80°03′54″W﻿ / ﻿32.9178°N 80.0650°W﻿ /32.9178; -80.0650﻿ (North Charleston)' 

Done creaing city information csv with 316 city rows of total 317 US cities
