<h1>Import Dependencies</h1>

In [1]:
import requests
import lxml.html as lh
import pandas as pd

<h1>Get Website Data</h1>

In [2]:
# Address of the page to scrape: California counties listed by population
url = "https://www.california-demographics.com/counties_by_population"

In [3]:
# Download the page.
# - timeout: requests waits indefinitely by default, so an unresponsive
#   server would hang the notebook; give up after 30 seconds instead.
# - raise_for_status(): stop here on an HTTP error (4xx/5xx) rather than
#   silently parsing an error page in the cells below.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [4]:
# Parse the raw response body with lxml.html (imported as lh) and keep the
# resulting HTML element tree in doc
doc = lh.fromstring(req.content)

<h1>Get Table Data</h1>

In [5]:
# Select every table-row element (<tr>..</tr>) found anywhere in the document
tr_elements = doc.xpath('//tr')

In [6]:
# Sanity check that the rows are tabular: count the cells in each of the
# first ten rows. Every count should be identical (one per column).
list(map(len, tr_elements[:10]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

<h1>Get Table Header Data</h1>

In [7]:
# Build table_data as a list of (header text, empty column list) pairs,
# one pair per cell of the first table row (the header row)
table_data = []
for i, t in enumerate(tr_elements[0], start=1):
    # lxml's text_content() method returns the text held inside the cell
    name = t.text_content()

    # Optional check: show the position and text of each header cell
    print(i, name)

    # Pair the header with the list that will collect that column's values
    table_data.append((name, []))

1 Rank
2 County
3 Population


<h1>Get Table Row Data</h1>

In [8]:
# The header was read from the first row (tr_elements[0]); the data starts
# on the second row. Walk the remaining <tr> elements and append each cell
# to the matching column list in table_data. The first row whose cell count
# differs from the header is treated as the end of the table and the loop
# stops there.

# Expected number of cells per row: one per header column gathered above
# (3 in this example: Rank, County, Population)
col_nbr = len(table_data)

# Empty the column lists first so that re-running this cell does not
# append every row a second time
for _, column in table_data:
    column.clear()

for T in tr_elements[1:]:

    # A row with a different number of cells is not from our table
    if len(T) != col_nbr:
        break

    # i is the column index, t the cell in that column
    # (lxml's .iterchildren() yields the row's child elements in order)
    for i, t in enumerate(T.iterchildren()):
        # lxml's .text_content() returns the text held inside the cell
        data = t.text_content()

        # The first column is always kept as text; for the other columns,
        # try to convert the value to an integer
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a plain integer (for example a name, or a number
                # written with commas): keep the original text
                pass

        # Append the value to the list of the i'th column
        table_data[i][1].append(data)

In [9]:
# Quality control: every column list should hold the same number of values
[len(values) for _, values in table_data]

[58, 58, 58]

<h1>Add Data to Dataframe</h1>

In [10]:
# Turn the (header, values) pairs into a {header: values} mapping, build the
# dataframe from it, and discard the Rank column
columns_by_title = dict(table_data)
df = pd.DataFrame(columns_by_title).drop(columns='Rank')
df.head()

Unnamed: 0,County,Population
0,\n Los Angeles County\n,"\n 10,081,570\n\n"
1,\n San Diego County\n,"\n 3,316,073\n\n"
2,\n Orange County\n,"\n 3,168,044\n\n"
3,\n Riverside County\n,"\n 2,411,439\n\n"
4,\n San Bernardino County\n,"\n 2,149,031\n\n"


<h1>Drop Unnecessary Columns and Remove Substrings and Commas</h1>

In [11]:
# Remove every comma from the text cells (e.g. the thousands separators in
# the population figures)
df = df.replace(to_replace=',', value='', regex=True)
df.head()

Unnamed: 0,County,Population
0,\n Los Angeles County\n,\n 10081570\n\n
1,\n San Diego County\n,\n 3316073\n\n
2,\n Orange County\n,\n 3168044\n\n
3,\n Riverside County\n,\n 2411439\n\n
4,\n San Bernardino County\n,\n 2149031\n\n


In [12]:
# Trim leading and trailing whitespace from every cell. strip() with no
# argument already removes newlines, so one pass per column is enough.
# astype(str) makes sure each cell is text first: the scraping loop may have
# stored some values as int, and those have no strip() of their own.
for column in df:
    df[column] = df[column].astype(str).str.strip()

<h1>Convert Columns to Integers and Replace NaN with Zero</h1>

In [13]:
# Cast the cleaned population strings to integers
df['Population'] = df['Population'].astype(int)

In [14]:
# Remove the word "County" from each name. Deleting it from a value such as
# "Los Angeles County" leaves the space that preceded it ("Los Angeles "),
# so strip the result to avoid trailing whitespace in the saved names.
df.County = df.County.replace('County', '', regex=True).str.strip()
df.head()

Unnamed: 0,County,Population
0,Los Angeles,10081570
1,San Diego,3316073
2,Orange,3168044
3,Riverside,2411439
4,San Bernardino,2149031


In [15]:
# Replace any NaN with zero. Reassign the result rather than using
# inplace=True, which offers no benefit and prevents method chaining.
df = df.fillna(0)

<h1>Save Dataframe to CSV</h1>

In [16]:
# Write the cleaned table to CSV (UTF-8, without the index column)
output_path = "data/ca_county_pop.csv"
df.to_csv(output_path, index=False, encoding='utf-8')