In [1]:
import json

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests, re, time, urllib3, nltk
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

## Source 

All data in this notebook is scraped from the CityTownInfo places directory: https://www.citytowninfo.com/places

## State by State Info

In [2]:
# Download and parse the landing page that the next cell reads the state table from.
url  = 'https://www.citytowninfo.com/places'
# Without a timeout requests can block forever on a stalled connection, and
# raise_for_status() keeps an HTTP error page from being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Build column-wise lists from the first table on the landing page: the state
# name and link (1st cell), the population text (2nd cell) and the capital
# name and link (3rd cell) of every row after the first two.
# NOTE(review): the saved outputs further down start at Alaska and the city
# crawl counts 50 entries including District of Columbia, so the `[2:]` slice
# may be skipping Alabama along with the header row -- confirm against the page.
state_dict = {'state':[], 'state_link':[], 'population':[], 'capital':[], 'capital_link':[]}
data_rows = soup.find('table').find_all('tr')[2:]
for tr in data_rows:
    state_cell = tr.select_one("td:nth-of-type(1)")
    population_cell = tr.select_one("td:nth-of-type(2)")
    capital_cell = tr.select_one("td:nth-of-type(3)")

    state_dict['state'].append(state_cell.text)
    state_dict['state_link'].append(state_cell.find('a')['href'])
    state_dict['population'].append(population_cell.text)
    state_dict['capital'].append(capital_cell.text)
    state_dict['capital_link'].append(capital_cell.find('a')['href'])

In [4]:
# Scrape every table inside the "population_data" section of each state page.
# state_info_dict maps state name -> list of single-entry {label: value} dicts,
# one per table row (first <td> is the label, second <td> is the value).
state_info_dict = {}
for link, state in zip(state_dict['state_link'], state_dict['state']):
    # Single-line progress indicator; the padding wipes leftovers of longer lines.
    print('{} - {} {}\r'.format(state, link, " "*40), end="")

    # Dedicated names so this loop does not overwrite the landing-page
    # `response`/`soup` that the state-table cell reads when it is re-run.
    state_response = requests.get(link, timeout=30)
    state_response.raise_for_status()
    state_soup = BeautifulSoup(state_response.text, 'html.parser')

    population_div = state_soup.find('div', attrs = {'id':'population_data'})
    if population_div is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError('No population_data section found for {} ({})'.format(state, link))

    state_info_dict[state] = []
    for table in population_div.find_all('table'):
        for row in table.find_all('tr'):
            label_cell = row.select_one("td:nth-of-type(1)")
            value_cell = row.select_one("td:nth-of-type(2)")
            if label_cell is None or value_cell is None:
                # Rows without two <td> cells carry no label/value pair.
                continue
            state_info_dict[state].append({label_cell.text : value_cell.text})

Wyoming - https://www.citytowninfo.com/places/wyoming                                                                   

In [5]:
# Collapse each state's list of single-entry {label: value} dicts into one flat
# {label: value} dict (a label seen more than once keeps its last value).
# The isinstance guard makes the cell safe to re-run: a state that has already
# been merged is a dict, and mapping dict.popitem over it would raise TypeError.
for state, entries in state_info_dict.items():
    if isinstance(entries, dict):
        continue
    state_info_dict[state] = {
        label: value
        for entry in entries
        for label, value in entry.items()
    }

In [6]:
# One row per state, one column per scraped label.
state_frame_raw = pd.DataFrame(state_info_dict).T

# Keep only columns that are missing for at most one state, then remove the
# '+ Show More' column.
min_non_null = len(state_frame_raw) - 1
state_data = (
    state_frame_raw
    .dropna(axis="columns", thresh=min_non_null)
    .drop(columns=['+ Show More'])
)

In [7]:
# Display the cleaned per-state table (one row per state, one column per kept label).
state_data

Unnamed: 0,% Above Poverty Level,% Below Poverty Level,% Using Public Transportation,% Walking and Biking to Work,% Working from Home,% of people married,African,Age of the Population,Arab,Austrian,...,Population % with Bachelor Degree or Higher,Portuguese,Russian,Scottish,Senior Citizens,Swedish,Swiss,Total Population,Welsh,West Indian
Alaska,89.8%,10.2%,1.5%,8.9%,4.4%,48.9%,0.5%,33.9,0.2%,0.3%,...,29%,0.3%,1.3%,2.4%,10.1%,2.3%,0.4%,738565,0.8%,0.2%
Arizona,83%,17%,2%,2.9%,5.9%,47.3%,0.6%,37.2,0.6%,0.2%,...,28.4%,0.2%,0.8%,1.8%,16.2%,1.4%,0.3%,6809946,0.6%,0.2%
Arkansas,81.9%,18.1%,0.4%,1.9%,3.2%,50.1%,0.4%,37.9,0.2%,0.1%,...,22%,0.1%,0.2%,1.7%,16%,0.5%,0.2%,2977944,0.5%,0.1%
California,84.9%,15.1%,5.2%,3.8%,5.6%,46.7%,0.7%,36.1,0.8%,0.2%,...,32.6%,0.9%,1%,1.3%,13.2%,1%,0.3%,38982848,0.4%,0.2%
Colorado,88.5%,11.5%,3.2%,4.1%,7.4%,50.8%,0.8%,36.5,0.4%,0.4%,...,39.4%,0.2%,1.1%,2.6%,13%,2.3%,0.4%,5436519,0.9%,0.2%
Connecticut,89.9%,10.1%,4.9%,3.2%,4.8%,47.9%,1%,40.8,0.6%,0.4%,...,38.4%,1.4%,1.7%,1.7%,16%,1.6%,0.2%,3594478,0.4%,2.6%
Delaware,87.9%,12.1%,2.8%,2.4%,4.5%,47.7%,1.3%,39.8,0.4%,0.2%,...,31%,0.2%,0.8%,1.5%,17%,0.8%,0.2%,943732,0.8%,1.1%
District of Columbia,82.6%,17.4%,35.4%,17.8%,5.7%,28.5%,3.1%,33.9,0.7%,0.4%,...,56.6%,0.3%,1.5%,1.3%,11.9%,0.7%,0.3%,672391,0.4%,1.4%
Florida,84.5%,15.5%,2%,2.2%,5.6%,46.4%,0.6%,41.8,0.6%,0.2%,...,28.5%,0.4%,1%,1.5%,19.4%,0.7%,0.2%,20278448,0.5%,4.5%
Georgia,83.1%,16.9%,2.1%,1.7%,5.2%,47%,1.8%,36.4,0.3%,0.1%,...,29.9%,0.1%,0.4%,1.8%,12.7%,0.4%,0.1%,10201635,0.4%,1.2%


## City Links by State

In [8]:
# Collect the name and page link of every city listed on each state page.
# The city list is switched by clicking each "city_alphabet_range" element,
# so a real browser (Selenium + chromedriver) is used instead of requests.
#
# Result: city_dict[state] = {'city': [...], 'city_link': [...]}, with both
# lists kept index-aligned.
#
# NOTE(review): `executable_path` is, as far as I know, no longer accepted by
# newer Selenium 4 releases -- switch to a Service object if upgrading; confirm
# against the installed version.
driver = webdriver.Chrome(executable_path ="chromedriver.exe")
wait = WebDriverWait(driver, 100)  # not used in this cell; kept in case other cells rely on it

city_dict = {}

try:
    # Both settings are per-session, so they only need to be applied once.
    driver.maximize_window()
    driver.implicitly_wait(4)

    for state_num, (link, state) in enumerate(zip(state_dict['state_link'], state_dict['state']), start=1):

        print('{} - {}           \r'.format(state_num, state), end="")

        driver.get(link)

        city_dict[state] = {
            'city':[],
            'city_link':[]
        }

        cities_click = (
            driver.find_element(By.CLASS_NAME, 'quidget_article_content')
            .find_element(By.XPATH, '//*[@id="top-box"]')
            .find_elements(By.CLASS_NAME, 'city_alphabet_range')
        )
        for range_button in cities_click:
            # Click through JavaScript rather than through a native click.
            driver.execute_script("arguments[0].click();", range_button)
            table = driver.find_element(By.XPATH, '//*[@id="city_list"]')
            for body in table.find_elements(By.TAG_NAME, "tbody"):
                for row in body.find_elements(By.TAG_NAME, "tr"):
                    # Read both values before appending either one: appending the
                    # name first and then failing on the link would leave 'city'
                    # one element longer than 'city_link' and mis-pair every
                    # later city of the state.
                    try:
                        city_name = row.find_elements(By.TAG_NAME, "td")[0].text
                        city_link = row.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    except (IndexError, NoSuchElementException, StaleElementReferenceException):
                        # Row without a cell or link (or replaced mid-read): skip it.
                        continue
                    city_dict[state]['city'].append(city_name)
                    city_dict[state]['city_link'].append(city_link)
finally:
    # quit() ends the whole browser session, even when the crawl fails
    # part-way, instead of only closing the current window.
    driver.quit()

50 - Wyoming                       

## Save City Links as JSON

In [9]:
# Persist the scraped city names/links to dict_of_city_links.json.
# Serialise first so a failure cannot leave a half-written file behind, then
# write inside a `with` block so the handle is closed even if the write fails.
json_data = json.dumps(city_dict)
with open("dict_of_city_links.json","w") as f:
    f.write(json_data)

In [10]:
# Read the saved city links back from disk into city_dict.
with open('dict_of_city_links.json') as links_file:
    city_dict = json.loads(links_file.read())

## Get City By City Data

In [11]:
def _collect_rows(rows, info):
    """Add each row's first two <td> cells to ``info`` as ``label -> value``.

    Rows that do not have both cells are skipped, so one malformed row no
    longer discards the remaining rows of its section. A label seen more than
    once keeps the value seen last.
    """
    for row in rows:
        label_cell = row.select_one("td:nth-of-type(1)")
        value_cell = row.select_one("td:nth-of-type(2)")
        if label_cell is None or value_cell is None:
            continue
        info[label_cell.text] = value_cell.text


# Scrape every city page into city_info_by_state[state][city] = {label: value}.
# Three sections are read when present: all tables in "population_data", all
# tables in "almanac", and the first table that follows the "climate" div.
# NOTE(review): cities are keyed by name, so two cities with the same name in
# one state overwrite each other -- confirm whether that can happen.
city_info_by_state = {}
failed_city_links = []  # pages that could not be downloaded

# One Session reuses connections across the many requests to the same host.
with requests.Session() as session:
    for state_num, state in enumerate(city_dict.keys()):
        city_info_by_state[state] = {}
        city_pairs = zip(city_dict[state]['city_link'], city_dict[state]['city'])
        for i, (link, city) in enumerate(city_pairs, start=1):
            print('{}) {}: City {} of {} -- {} ({}){}\r'.format(state_num+1, state, i, len(city_dict[state]['city']), city, link, " "*80), end="")

            info = {}
            city_info_by_state[state][city] = info

            # A single unreachable page must not abort the whole crawl; record
            # it and keep the city with an empty record.
            try:
                response = session.get(link, timeout=30)
                response.raise_for_status()
            except requests.RequestException:
                failed_city_links.append(link)
                continue

            # Local name so the landing-page `soup` is not overwritten.
            page = BeautifulSoup(response.text, 'html.parser')

            population_div = page.find('div', attrs = {'id':'population_data'})
            if population_div is not None:
                for table in population_div.find_all('table'):
                    _collect_rows(table.find_all('tr'), info)

            # find_all (not find): iterating the result of find('table') walks
            # the table's child nodes instead of a list of tables.
            almanac_div = page.find('div', attrs = {'id':'almanac'})
            if almanac_div is not None:
                for table in almanac_div.find_all('table'):
                    _collect_rows(table.find_all('tr'), info)

            climate_div = page.find('div', attrs = {'id':'climate'})
            climate_table = climate_div.find_next('table') if climate_div is not None else None
            if climate_table is not None:
                _collect_rows(climate_table.find_all('tr'), info)

if failed_city_links:
    print('\n{} city pages could not be downloaded; see failed_city_links'.format(len(failed_city_links)))

50) Wyoming: City 95 of 95 -- Yoder (https://www.citytowninfo.com/places/wyoming/yoder)                                                                                                                                                                                         

## Save City Data as JSON

In [12]:
# Persist the scraped per-city records to dict_of_city_data.json.
# Serialise first so a failure cannot leave a half-written file behind, then
# write inside a `with` block so the handle is closed even if the write fails.
json_data = json.dumps(city_info_by_state)
with open("dict_of_city_data.json","w") as f:
    f.write(json_data)

In [None]:
# Read the saved per-city records back from disk into city_info_by_state.
with open('dict_of_city_data.json') as data_file:
    city_info_by_state = json.loads(data_file.read())

## Convert Dict to DataFrame

In [29]:
# Flatten the nested {state: {city: record}} mapping into one record per
# (state, city) key; the tuple keys become the two index levels of the frame.
city_records = {
    (state_name, city_name): record
    for state_name, cities in city_info_by_state.items()
    for city_name, record in cities.items()
}
city_frame_raw = pd.DataFrame.from_dict(city_records, orient='index')

# Keep columns that are populated for at least this share of the rows.
percent_to_keep = .8
min_non_null = int(percent_to_keep * len(city_frame_raw))

city_data = (
    city_frame_raw
    .dropna(axis="columns", thresh=min_non_null)
    .drop(columns=['+ Show More', 'Station', 'Distance', ' Category'])
    # Rows without a total population are discarded.
    .dropna(subset=['Total Population'])
)

## Save as CSV

In [30]:
# Write the filtered city table to CSV (the index is written too, since to_csv keeps it by default).
city_data.to_csv('all_city_data.csv')

In [34]:
# Reload the CSV, name the two header-less leading columns 'State' and 'City',
# and use them as the row index.
city_data = (
    pd.read_csv('all_city_data.csv')
    .rename(columns={'Unnamed: 0': 'State', 'Unnamed: 1': 'City'})
    .set_index(['State', 'City'])
)
city_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Population,Male Share of the Population,Female Share of the Population,Senior Citizens,Employed Population 16+,Age of the Population,% of people married,Average household size,Population % with Bachelor Degree or Higher,Median Earnings25+,...,Phone Area Codes,Time Zone,Approximate Latitude,Approximate Longitude,Jan Temp,Apr Temp,Jul Temp,Oct Temp,Annual Precip,Standard Zip Codes
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alaska,Akhiok,53,56.6%,43.4%,%,17,36.0,42.5%,3,%,"$21,750",...,907,America/Anchorage,57.79,-152.40,25 to 35 F,32 to 43 F,49 to 60 F,34 to 46 F,75.3 inches,99615
Alaska,Akiak,397,55.7%,44.3%,3.8%,111,25.0,42.7%,5,3%,"$18,214",...,907,,60.90,-161.23,1 to 12 F,18 to 33 F,49 to 63 F,25 to 35 F,16.1 inches,
Alaska,Akutan,801,68.8%,31.2%,5.5%,737,45.0,46.5%,2,11.7%,"$26,710",...,907,,54.12,-165.83,26 to 37 F,30 to 41 F,45 to 57 F,36 to 48 F,57.3 inches,
Alaska,Alakanuk,735,51.3%,48.7%,7.6%,150,21.0,39.9%,4,2.9%,"$13,250",...,907,America/Nome,62.71,-164.64,-4 to 11 F,14 to 32 F,49 to 62 F,19 to 34 F,12.4 inches,
Alaska,Aleknagik,190,51.6%,48.4%,8.4%,61,34.0,35.4%,3,9.5%,"$32,813",...,907,,59.24,-158.62,9 to 20 F,25 to 37 F,49 to 61 F,28 to 38 F,26 inches,
Alaska,Allakaket,186,54.8%,45.2%,12.4%,66,29.0,28.8%,3,17.1%,"$15,000",...,907,,66.57,-152.95,-19 to -3 F,11 to 34 F,50 to 71 F,12 to 25 F,13.9 inches,
Alaska,Ambler,299,53.2%,46.8%,9.4%,92,27.0,32.1%,4,1.3%,"$22,321",...,907,America/Nome,67.08,-157.90,-9 to 4 F,3 to 20 F,49 to 60 F,19 to 28 F,10 inches,
Alaska,Anaktuvuk Pass,290,59%,41%,9%,69,25.0,26.6%,3,4.8%,"$20,313",...,907,,68.15,-151.71,-19 to -3 F,11 to 34 F,50 to 71 F,12 to 25 F,13.9 inches,
Alaska,Anderson,143,58%,42%,14%,90,42.0,51.1%,2,34%,"$72,250",...,907,America/Anchorage,64.31,-149.16,-18 to 2 F,19 to 42 F,51 to 72 F,13 to 30 F,12.8 inches,
Alaska,Angoon,535,59.1%,40.9%,11.6%,286,42.0,40.1%,3,12%,"$40,395",...,907,,57.44,-134.48,18 to 28 F,33 to 47 F,47 to 61 F,37 to 46 F,116.2 inches,


In [17]:
# Summarise the city table: index type, column dtypes and non-null counts.
city_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17843 entries, 0 to 17842
Data columns (total 41 columns):
Total Population                                       17843 non-null object
Male Share of the Population                           17843 non-null object
Female Share of the Population                         17843 non-null object
Senior Citizens                                        17843 non-null object
Employed Population 16+                                17843 non-null object
Age of the Population                                  17843 non-null float64
% of people married                                    17843 non-null object
Average household size                                 17843 non-null int64
Population % with Bachelor Degree or Higher            17843 non-null object
Median Earnings25+                                     17843 non-null object
Median Earnings Without High School Qualification      17843 non-null object
Median Earnings With High School Degree     