In [1]:
import json, os
# Copy every entry under "Values" in local.settings.json into the process
# environment so that later cells can read them through os.environ.
# The file is read inside a `with` block so the handle is closed right away
# instead of being left open until garbage collection.
with open("local.settings.json") as settings_file:
    local_settings = json.load(settings_file)

for k, v in local_settings["Values"].items():
    os.environ[k] = v

In [None]:
import pandas as pd
import numpy as np
from azure.storage.filedatalake import FileSystemClient

# Connect to the "general" file system of the data lake.
adfs = FileSystemClient.from_connection_string(
    conn_str=os.environ["DATALAKE_CONN_STR"],
    file_system_name="general",
)

# Destination of the merged table; create it first if it is not there yet.
# The same client is reused for the upload at the end of the cell.
output_file = adfs.get_file_client("datalake/census/data.parquet")
if not output_file.exists():
    output_file.create_file()

# Maps the column labels found in the downloaded tables to readable names.
CENSUS_COLUMN_NAMES = {
    "zip code tabulation area": "ZCTA",
    "B01003_001E": "Population",
    "B19013_001E": "Median Income",
    "B25077_001E": "Median Home Value",
    "B19001_001E": "Households",
    "B19001_002E": "<$10K",
    "B19001_003E": "$10K-$15K",
    "B19001_004E": "$15K-$25K",
    "B19001_005E": "$25K-$35K",
    "B19001_006E": "$35K-$50K",
    "B19001_007E": "$50K-$75K",
    "B19001_008E": "$75K-$100K",
    "B19001_009E": "$100K-$125K",
    "B19001_010E": "$125K-$150K",
    "B19001_011E": "$150K-$200K",
    "B19001_012E": "$200K-$250K",
    "B19001_013E": "$250K-$300K",
    "B19001_014E": "$300K-$400K",
    "B19001_015E": "$400K-$500K",
    "B19001_016E": "$500K-$1M",
    "B19001_017E": "$>1M",
    "B02001_001E": "Race Population",
    "B02001_002E": "White",
    "B02001_003E": "Black",
    "B02001_004E": "Native American",
    "B02001_005E": "Asian",
    "B02001_006E": "Pacific Islander",
    "B03003_001E": "Hispanic Population",
    "B03003_002E": "Non Hispanic",
    "B03003_003E": "Hispanic",
    "B01001_003E": "Male <5",
    "B01001_004E": "Male 5-9",
    "B01001_005E": "Male 10-14",
    "B01001_006E": "Male 15-17",
    "B01001_007E": "Male 18-19",
    "B01001_008E": "Male 20",
    "B01001_009E": "Male 21",
    "B01001_010E": "Male 22-24",
    "B01001_011E": "Male 25-29",
    "B01001_012E": "Male 30-34",
    "B01001_013E": "Male 35-39",
    "B01001_014E": "Male 40-44",
    "B01001_015E": "Male 45-49",
    "B01001_016E": "Male 50-54",
    "B01001_017E": "Male 55-59",
    "B01001_018E": "Male 60-61",
    "B01001_019E": "Male 62-64",
    "B01001_020E": "Male 65-66",
    "B01001_021E": "Male 67-69",
    "B01001_022E": "Male 70-74",
    "B01001_023E": "Male 75-79",
    "B01001_024E": "Male 80-84",
    "B01001_025E": "Male 85+",
    "B01001_027E": "Female <5",
    "B01001_028E": "Female 5-9",
    "B01001_029E": "Female 10-14",
    "B01001_030E": "Female 15-17",
    "B01001_031E": "Female 18-19",
    "B01001_032E": "Female 20",
    "B01001_033E": "Female 21",
    "B01001_034E": "Female 22-24",
    "B01001_035E": "Female 25-29",
    "B01001_036E": "Female 30-34",
    "B01001_037E": "Female 35-39",
    "B01001_038E": "Female 40-44",
    "B01001_039E": "Female 45-49",
    "B01001_040E": "Female 50-54",
    "B01001_041E": "Female 55-59",
    "B01001_042E": "Female 60-61",
    "B01001_043E": "Female 62-64",
    "B01001_044E": "Female 65-66",
    "B01001_045E": "Female 67-69",
    "B01001_046E": "Female 70-74",
    "B01001_047E": "Female 75-79",
    "B01001_048E": "Female 80-84",
    "B01001_049E": "Female 85+",
}

# Value replaced with NaN after the integer cast.
# NOTE(review): presumably the Census placeholder for an unavailable
# estimate -- confirm against the API documentation.
MISSING_ESTIMATE = -666666666


def _read_census_table(file_path):
    """Download a gzipped JSON table from the data lake as a DataFrame.

    The first row of the parsed table is used as the column labels and is
    then removed from the data; the remaining rows are renumbered from 0.

    Args:
        file_path: Path of the ``.json.gz`` file inside the file system.

    Returns:
        A DataFrame whose columns are named after the file's first row.
    """
    return pd.read_json(
        adfs.get_file_client(file_path=file_path).download_file(),
        compression="gzip",
    ).pipe(
        lambda df: df.rename(columns=df.iloc[0])
        .drop(df.index[0])
        .reset_index(drop=True)
    )


# Join the age table onto the basic table, one row per ZCTA of the basic table.
census_by_zcta = (
    pd.merge(
        _read_census_table("datalake/census/basic.json.gz"),
        _read_census_table("datalake/census/age.json.gz"),
        on="zip code tabulation area",
        how="left",
    )
    .rename(columns=CENSUS_COLUMN_NAMES)
    # Every column except the ZCTA key is cast to integer. This has to happen
    # before the placeholder is swapped for NaN, because NaN cannot be cast
    # to an integer dtype.
    .pipe(lambda df: df.astype({col: "int" for col in df.columns if col != "ZCTA"}))
    .pipe(lambda df: df.astype({"ZCTA": "str"}))
    # Move 'ZCTA' to the front
    .pipe(lambda df: df[["ZCTA"] + [c for c in df.columns if c != "ZCTA"]])
    .replace(MISSING_ESTIMATE, np.nan)
)

# Serialise to gzip-compressed parquet in memory and replace the file contents.
output_file.upload_data(
    census_by_zcta.to_parquet(index=None, compression="gzip"),
    overwrite=True,
)

In [None]:
import pandas as pd

def _zero_padded_code(codes, width):
    """Render numeric codes as strings left-padded with zeros to `width`.

    Missing values are replaced with 0 first, so they come out as all zeros.
    """
    return codes.fillna(0).astype(int).astype(str).str.zfill(width)


# Load the data
df = pd.read_csv("combined_movers_2024-09-03.csv")

# Drop the first unnamed column and rename keycode2 to keycode
df = df.drop(df.columns[0], axis=1).rename(columns={"keycode2": "keycode"})

# Convert zip to a 5-character string. When the column is numeric it goes
# through the same integer conversion as oldzip: a plain astype(str) on a
# float column (which is what pandas produces when values are missing) would
# yield text such as "12345.0" and "nan", which zfill cannot repair.
# A non-numeric column is stringified and padded as-is.
zip_codes = df["zip"]
df["zip"] = (
    _zero_padded_code(zip_codes, 5)
    if pd.api.types.is_numeric_dtype(zip_codes)
    else zip_codes.astype(str).str.zfill(5)
)

# Convert zip4 to a 4-digit string; missing values become "0000"
df["zip4"] = _zero_padded_code(df["zip4"], 4)

# Convert keycode to string.
# NOTE(review): astype(str) renders missing values as the text "nan" --
# confirm that this is the intended output for keycode.
df["keycode"] = df["keycode"].astype(str)

# Convert oldzip to a 5-digit string; missing values become "00000"
df["oldzip"] = _zero_padded_code(df["oldzip"], 5)

# Save the modified DataFrame to a compressed CSV file, without the index
df.to_csv("backfill.csv.gz", index=False, compression="gzip")
