In [1]:
import os
from io import StringIO

import pandas as pd
import psycopg2

def fetch_query_records(query, columns=None):
    """
    Opens a connection to the PostgreSQL database, runs a query and returns
    every row of the result.

    Connection settings come from environment variables so that no secret is
    stored in the notebook:
        DB_PASSWORD  (required)
        DB_USER, DB_NAME, DB_HOST  (optional; fall back to the project defaults)

    Input: query: a SQL query (string)
           columns: unused here; accepted so existing two-argument calls
                    keep working

    Returns: response: list of the rows returned by cursor.fetchall()

    Raises: RuntimeError if DB_PASSWORD is not set
    """
    # NOTE(review): the password used to be hard-coded in this cell; it should
    # be rotated, since it is still present in the notebook's history.
    db_password = os.environ.get("DB_PASSWORD")
    if not db_password:
        raise RuntimeError(
            "DB_PASSWORD environment variable is not set; "
            "export it before querying the database."
        )

    # Creating Connection Object
    conn = psycopg2.connect(
        dbname=os.environ.get("DB_NAME", "postgres"),
        user=os.environ.get("DB_USER", "citrics"),
        password=db_password,
        host=os.environ.get(
            "DB_HOST", "citricsads.cav8gkdxva9e.us-east-1.rds.amazonaws.com"
        ),
    )
    # try/finally rather than `with conn:` because psycopg2's connection
    # context manager only ends the transaction; it does not close the
    # connection.
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        return list(cursor.fetchall())
    finally:
        # Always release the connection, even when the query fails
        conn.close()

def fetch_query(query, columns):
    """
    Runs a query through fetch_query_records and serialises the rows as a
    JSON string of records (one object per row, keyed by column name).

    Input: query: a SQL query (string)
           columns: labels to assign, in order, to the columns of the result

    Returns: pairs: JSON string produced by DataFrame.to_json(orient='records')
    """
    rows = fetch_query_records(query, columns)

    # Label the raw rows, then emit one JSON object per row
    return pd.DataFrame(rows, columns=columns).to_json(orient='records')

# Goal: Compile `census`, `bls_jobs`, `walkability`, and `rental` into a single, merged dataset
- Should align with the front-end form, where the end user can enter preferences for:
    - Population
    - Average Rent
    - WalkScore
    - Primary Job Industry

### Start by creating df instances from each table in DB via SQL queries

In [8]:
### Rental Data
# Loads the rental rows for the month 2020-08-01 into `rental`.

query = """
SELECT *
FROM rental
WHERE "month" = '2020-08-01'
"""

# Labels are applied positionally to the SELECT * result, so their order must
# match the table's column order.
columns = ["month", "city", "state", "Studio", "onebr", "twobr", "threebr", "fourbr"]

# fetch_query returns JSON text; wrap it in StringIO because passing a literal
# JSON string straight to pd.read_json is deprecated in recent pandas versions.
rental = (
    pd.read_json(StringIO(fetch_query(query, columns)))
    .drop(columns="month")        # every row is the same month after the WHERE filter
    .rename(columns=str.lower)    # normalise labels such as "Studio" -> "studio"
)

print(rental.shape)
rental.head()

(444, 7)


Unnamed: 0,city,state,studio,onebr,twobr,threebr,fourbr
0,King of Prussia,PA,1038,1232,1488,1862,2072
1,Las Cruces,NM,514,599,726,1045,1280
2,North East,MD,909,1079,1303,1630,1814
3,St. Cloud,MN,578,712,883,1218,1543
4,Pasadena,CA,1350,1606,2064,2806,3093


In [9]:
### BLS Data
# Loads one occupation row per city from bls_jobs into `bls`.

# DISTINCT ON (j.city) combined with ORDER BY j.city, j.loc_quotient DESC
# keeps, for each city, the row with the highest loc_quotient.
query = """
SELECT DISTINCT ON (j.city) j.*
FROM bls_jobs j
ORDER BY j.city, j.loc_quotient DESC
"""

# Labels are applied positionally to the j.* result
columns = [
    "city",
    "state",
    "occ_title",
    "jobs_1000",
    "loc_quotient",
    "hourly_wage",
    "annual_wage"]

# fetch_query returns JSON text; wrap it in StringIO because passing a literal
# JSON string straight to pd.read_json is deprecated in recent pandas versions.
bls = (
    pd.read_json(StringIO(fetch_query(query, columns)))
    .drop(columns=["jobs_1000", "loc_quotient"])  # only needed to pick the row in SQL
)

print(bls.shape)
bls.head()

(379, 5)


Unnamed: 0,city,state,occ_title,hourly_wage,annual_wage
0,Abilene,TX,Wellhead Pumpers,30.14,62680
1,Aguadilla-Isabela,PR,Telemarketers,8.16,16980
2,Akron,OH,"Patternmakers, Metal and Plastic",17.27,35930
3,Albany,OR,Fallers,35.98,74850
4,Albany-Schenectady-Troy,NY,"Physical Scientists, All Other",46.86,97470


In [21]:
### WalkScores
# Loads the walkability table into `walkscores` and splits its combined
# location string into separate city and state columns.

query = """
SELECT *
FROM WALKABILITY
"""

columns = ["city", "walkscore"]

# fetch_query returns JSON text; wrap it in StringIO because passing a literal
# JSON string straight to pd.read_json is deprecated in recent pandas versions.
walkscores = pd.read_json(StringIO(fetch_query(query, columns)))

# The raw "city" value presumably looks like "Houston, TX" (TODO confirm):
# the state is taken as the last two characters and the city as the text
# before the first comma. Vectorised .str accessors leave missing values as
# NaN instead of raising.
location = walkscores["city"]
walkscores = walkscores.assign(
    state=location.str[-2:],
    city=location.str.split(",").str[0],
)[["city", "state", "walkscore"]]

print(walkscores.shape)
walkscores.head()

(444, 3)


Unnamed: 0,city,state,walkscore
0,Houston,TX,86.28
1,Philadelphia,PA,97.7
2,Phoenix,AZ,71.76
3,San Antonio,TX,65.46
4,San Diego,CA,80.94


In [23]:
### Census Data
# Loads city, state and the 2019 population estimate from census into `census`.

query = """
SELECT 
    city,
    state,
    popestimate2019
FROM census
"""

columns = [
    "city",
    "state",
    "popestimate2019"]

# fetch_query returns JSON text; wrap it in StringIO because passing a literal
# JSON string straight to pd.read_json is deprecated in recent pandas versions.
census = pd.read_json(StringIO(fetch_query(query, columns)))

# NOTE(review): the first row of the output has city == state ("AL", "AL"),
# which looks like a state-level total - confirm, and filter such rows out
# before merging on city.
print(census.shape)
census.head()

(81434, 3)


Unnamed: 0,city,state,popestimate2019
0,AL,AL,4903185
1,Abbeville city,AL,2560
2,Adamsville city,AL,4281
3,Addison town,AL,718
4,Akron town,AL,328
