In [1]:
#Import required libraries
import requests
import polars as pl
import pandas as pd
import time

from urllib.parse import urlencode
from sqlalchemy import create_engine
from sqlalchemy import text

In [2]:
#Define API details and parameters
CENSUS_API_URL = "https://api.census.gov/data/2022/acs/acs5"

CENSUS_API_KEY = "00200b6fdc213ea1ae3272478057c94cb3815637"

PARAMS = {
    "get": "NAME,B01003_001E",
    "for": "state:*",
    "key": CENSUS_API_KEY
}

full_url = CENSUS_API_URL + "?" + urlencode(PARAMS)
print("API URL:", full_url)


API URL: https://api.census.gov/data/2022/acs/acs5?get=NAME%2CB01003_001E&for=state%3A%2A&key=00200b6fdc213ea1ae3272478057c94cb3815637


In [3]:
#Fetch raw JSON data from API
response = requests.get(CENSUS_API_URL, params=PARAMS)
response.raise_for_status()

payload = response.json()

# Preview raw data
payload[:5]

[['NAME', 'B01003_001E', 'state'],
 ['Alabama', '5028092', '01'],
 ['Alaska', '734821', '02'],
 ['Arizona', '7172282', '04'],
 ['Arkansas', '3018669', '05']]

In [4]:
#Clean & structure the API response
headers = payload[0]
rows = payload[1:]

df = pl.DataFrame(
    rows,
    schema=headers,
    orient="row"
)

df = df.rename({
    "NAME": "state_name",
    "B01003_001E": "total_population",
    "state": "state_code"
})

df = df.with_columns(
    pl.col("total_population").cast(pl.Int64, strict=False)
)

df.head()

state_name,total_population,state_code
str,i64,str
"""Alabama""",5028092,"""01"""
"""Alaska""",734821,"""02"""
"""Arizona""",7172282,"""04"""
"""Arkansas""",3018669,"""05"""
"""California""",39356104,"""06"""


In [5]:
#Connect Python to PostgreSQL
DB_USER = "postgres"
DB_PASSWORD = "Barik1010"
DB_HOST = "localhost"
DB_PORT = "5432"
DB_NAME = "census_db"

engine = create_engine(
    f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

engine


Engine(postgresql://postgres:***@localhost:5432/census_db)

In [6]:
#Store cleaned data into database

df_pandas = df.to_pandas()

df_pandas.to_sql(
    name="census_population",
    con=engine,
    if_exists="replace",
    index=False   
)

print("Data loaded into PostgreSQL table: census_population")

Data loaded into PostgreSQL table: census_population


In [7]:
#Measure performance before indexing
query = """
SELECT *
FROM census_population
WHERE total_population > 10000000
"""

start = time.time()

with engine.connect() as conn:
    result = conn.execute(text(query)).fetchall()

end = time.time()

print(f"Rows returned: {len(result)}")
print(f"Query time without index: {end - start:.6f} seconds")



Rows returned: 10
Query time without index: 0.002839 seconds


In [8]:
from sqlalchemy import text

create_index_sql = """
CREATE INDEX idx_population
ON census_population (total_population);
"""

with engine.connect() as conn:
    conn.execute(text(create_index_sql))
    conn.commit()

print("Index created on total_population")


Index created on total_population


In [9]:
start = time.time()

with engine.connect() as conn:
    result = conn.execute(text(query)).fetchall()

end = time.time()

print(f"Rows returned: {len(result)}")
print(f"Query time with index: {end - start:.6f} seconds")


Rows returned: 10
Query time with index: 0.002300 seconds
