# Data Preparation
#### By Xinqian Zhai, Cliff Gong, Gen Ho

1. Web-scrape property tax data
2. Prepare population growth rate data (before and since the pandemic)

## 1. Webscraping Property Tax Dataset
To see the raw data, you can click the web link below:

https://smartasset.com/taxes/texas-property-tax-calculator#texas

In [1]:
# import useful libraries
import warnings
from io import StringIO

import pandas as pd
import requests
warnings.filterwarnings('ignore')

In [2]:
def get_tax_table(tax_state):
    """Scrape the county-level property tax table for one state from SmartAsset.

    Parameters
    ----------
    tax_state : str
        Two-letter state code; one of 'CA', 'TX' or 'NY'.

    Returns
    -------
    pandas.DataFrame
        Two columns: the first column of the scraped table and
        'Average Effective Property Tax Rate', converted from a percent
        string to a fraction (e.g. '0.78%' -> 0.0078) rounded to 4 decimals.

    Raises
    ------
    KeyError
        If `tax_state` is not one of the supported codes.
    requests.HTTPError
        If the page request comes back with an error status.
    """
    state_prefix = {'CA': 'california',
                    'TX': 'texas',
                    'NY': 'new-york'}
    url_link = 'https://smartasset.com/taxes/{}-property-tax-calculator'.format(state_prefix[tax_state])

    # fail fast on a hung connection or an error page instead of parsing it
    r = requests.get(url_link, timeout=30)
    r.raise_for_status()

    # wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    # NOTE(review): the table is picked by position (index 6); re-check this
    # index if the page layout changes.
    html_to_dataframe = pd.read_html(StringIO(r.text))[6]

    # only keep the first column and the property tax rate (last column);
    # copy so the assignment below does not write into a slice of the scraped frame
    rate_col = 'Average Effective Property Tax Rate'
    tax_rate_df = html_to_dataframe.iloc[:, [0, -1]].copy()

    # strip the trailing '%' instead of slicing a fixed width, so values such as
    # '1.5%' or '10.25%' are parsed in full
    tax_rate_df[rate_col] = (
        tax_rate_df[rate_col].str.rstrip('%').astype('float') / 100
    ).round(4)

    return tax_rate_df

# Scrape the tax table of each state (CA, NY, TX, in that order)
ca_tax, ny_tax, tx_tax = (get_tax_table(code) for code in ('CA', 'NY', 'TX'))


# write property tax rate data to csv files
for tax_table, csv_path in (
    (ca_tax, './data/PropertyTaxRateCA.csv'),
    (ny_tax, './data/PropertyTaxRateNY.csv'),
    (tx_tax, './data/PropertyTaxRateTX.csv'),
):
    tax_table.to_csv(csv_path, index=False)


In [3]:
# reload the saved California table and preview its first rows
ca = pd.read_csv(filepath_or_buffer='./data/PropertyTaxRateCA.csv')
ca.head(n=5)

Unnamed: 0,County,Average Effective Property Tax Rate
0,Alameda County,0.0078
1,Alpine County,0.0082
2,Amador County,0.0074
3,Butte County,0.0074
4,Calaveras County,0.0081


## 2. Preparing Population Growth Rate Before and Since Pandemic Dataset
- 2018 population at county level. Data from: https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html 
- 2021 population at county level. Data from: https://worldpopulationreview.com/us-counties/states/ca 

In [4]:
# get 2018 population at county level for the three states
def get_population18_data():
    """Load the 2018 county-level population figures for CA, TX and NY.

    Reads './demographic_data/population<ST>18.csv' for each state, keeps the
    area-name column and the '2018' column, and stacks the three states in
    the order CA, TX, NY.

    Returns
    -------
    pandas.DataFrame
        Columns 'county' (formatted like 'Kern County,CA') and '2018'
        (population as int), with a fresh 0..n-1 index.
    """
    state_prefix = [('CA', ' California'),
                    ('TX', ' Texas'),
                    ('NY', ' New York')]
    df_list = []
    for abbr, state_name in state_prefix:
        # skipfooter is only supported by the python parser engine; request it
        # explicitly instead of relying on pandas' warn-and-fall-back behaviour
        df = pd.read_csv('./demographic_data/population' + abbr + '18.csv', header=3,
                         usecols=['Unnamed: 0', '2018'], skipfooter=5,
                         engine='python')
        # drop the first data row (presumably the statewide total -- verify
        # against the raw file) and copy so the column edits below act on an
        # independent frame rather than a slice
        df = df.iloc[1:, :].copy()
        df.columns = ['county', '2018']
        # remove the leading '.' marker, then swap the trailing state name for
        # its abbreviation, e.g. '.Kern County, California' -> 'Kern County,CA';
        # anchoring at the end leaves a state name inside a county name intact
        df['county'] = (df['county'].str.lstrip('.')
                        .str.replace(state_name + '$', abbr, regex=True))
        df_list.append(df)
    df = pd.concat(df_list).reset_index(drop=True)
    # counts are read as strings with thousands separators, e.g. '1,666,756'
    df['2018'] = df['2018'].str.replace(',', '', regex=False).astype(int)

    return df

In [5]:
# scrape 2021 county-level population data from the web for the three states
def get_population21_table(tax_state):
    """Scrape the 2021 county-level population table for one state.

    Parameters
    ----------
    tax_state : str
        Two-letter state code; one of 'CA', 'NY' or 'TX'.

    Returns
    -------
    pandas.DataFrame
        Columns 'county' (scraped name with ',<tax_state>' appended) and
        '2021', sorted by 'county' with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If `tax_state` is not one of the supported codes.
    requests.HTTPError
        If the page request comes back with an error status.
    """
    state_prefix = {'CA': 'ca',
                    'NY': 'ny',
                    'TX': 'tx'}
    url_link = 'https://worldpopulationreview.com/us-counties/states/{}'.format(state_prefix[tax_state])

    # fail fast on a hung connection or an error page instead of parsing it
    r = requests.get(url_link, timeout=30)
    r.raise_for_status()

    # wrap the markup in StringIO: passing literal HTML to read_html is deprecated
    html_to_dataframe = pd.read_html(StringIO(r.text))[0]

    # only keep the first two columns (county name and 2021 population); copy so
    # the edits below do not write into a slice of the scraped frame
    df = html_to_dataframe.iloc[:, [0, 1]].copy()
    df.columns = ['county', '2021']
    # suffix the state code so names line up with the 'county' key used for merging
    df['county'] = df['county'] + ',' + tax_state
    df = df.sort_values('county', ascending=True).reset_index(drop=True)

    return df

# get population for the states
pop21_ca = get_population21_table('CA')
pop21_ny = get_population21_table('NY')
pop21_tx = get_population21_table('TX')

# combine all to form one dataframe
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent)
pop21_all = pd.concat([pop21_ca, pop21_ny, pop21_tx], ignore_index=True)

In [6]:
# combine population 2018 and 2021 to calculate population growth rate
def get_pop_growth_rate():
    """Compute the 2018 -> 2021 population growth rate per county.

    Joins the 2018 table from get_population18_data() with the module-level
    pop21_all table on 'county', adds 'growth_rate' = (2021 - 2018) / 2018
    rounded to 3 decimals, and keeps only the counties that appear as column
    labels in ./output/returns.csv.

    Returns
    -------
    pandas.DataFrame
        The merged table with the added 'growth_rate' column, restricted to
        the counties found in the returns file.
    """
    merged = get_population18_data().merge(pop21_all, on='county')

    change = merged['2021'] - merged['2018']
    merged['growth_rate'] = (change / merged['2018']).round(3)

    # align populations of counties with the counties in the rentals
    # (the column labels of the returns table), which only keeps 49 counties
    returns = pd.read_csv("./output/returns.csv", index_col=0)
    in_rentals = merged['county'].isin(returns.columns.values)

    return merged[in_rentals]

# build the growth-rate table once; later cells write it to csv
pop_growth_rate = get_pop_growth_rate()

In [7]:
# persist the population growth rate table in the demographic_data folder
pop_growth_rate.to_csv(path_or_buf='./demographic_data/pop_growth_rate.csv', index=False)

In [8]:
# read the saved growth-rate file back and preview its first rows
df = pd.read_csv(filepath_or_buffer='./demographic_data/pop_growth_rate.csv')
df.head(n=5)


Unnamed: 0,county,2018,2021,growth_rate
0,"Alameda County,CA",1666756,1680480,0.008
1,"Contra Costa County,CA",1150519,1159540,0.008
2,"Kern County,CA",893758,913090,0.022
3,"Los Angeles County,CA",10073906,9969510,-0.01
4,"Marin County,CA",259662,257154,-0.01


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=718fdf0e-933b-4ec5-90a8-5b2fe887b720' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>