# 01 DATA INGESTION

## American Community Survey Data

In [1]:
import os
import pandas as pd

In [2]:
# Lift pandas' display limits so wide/long frames are shown in full.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

Move to the project root so that relative paths (e.g. `config/config.yaml`, `artifacts/`) resolve correctly

In [3]:
# Show the working directory before it is changed.
%pwd

'/Users/chrissunderland/Desktop/starbucks_store_predictions/research'

In [4]:
# Step up from research/ to the project root. The guard keeps this cell
# idempotent: re-running it will not climb a further directory level and
# break the relative paths used by the cells below.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
# Confirm the working directory after the change.
%pwd

'/Users/chrissunderland/Desktop/starbucks_store_predictions'

### Ingest the Census Bureau's ACS datasets

In [6]:
from src.StarbucksProject.constants import *
from src.StarbucksProject.utils.common import read_yaml, create_directories
from src.StarbucksProject.utils.common import get_size
from src.StarbucksProject import logger

In [7]:
import urllib.request as request
import zipfile
from pathlib import Path

In [8]:
# Load the project settings used by every later cell.
# CONFIG_FILE_PATH is presumably supplied by the constants star-import above — TODO confirm.
config = read_yaml(CONFIG_FILE_PATH)

[2024-07-09 18:53:13,427: INFO: common: yaml file: config/config.yaml loaded successfully]


In [9]:
# Folders that must exist before anything is downloaded or extracted.
ingestion_dirs = [
    config.artifacts_root,
    config.data_ingestion.root_dir,
]
create_directories(ingestion_dirs)

[2024-07-09 18:53:13,432: INFO: common: created directory at: artifacts]
[2024-07-09 18:53:13,433: INFO: common: created directory at: artifacts/data_ingestion]


Download the file

In [10]:
# Download the ACS archive only when it is not already on disk.
zip_path = config.data_ingestion.local_zip_file

if os.path.exists(zip_path):
    # Archive already present: just report its size.
    logger.info(f"File already exists of size: {get_size(Path(zip_path))}")
else:
    filename, headers = request.urlretrieve(
        url=config.data_ingestion.acs_source_url,
        filename=zip_path,
    )
    logger.info(f"{filename} downloaded with following info: \n{headers}")

[2024-07-09 18:53:13,441: INFO: 3030093056: File already exists of size: ~ 1857 KB]


Extract the zip file

In [11]:
# Unpack the whole archive into the ingestion root directory.
unzip_path = config.data_ingestion.root_dir
archive_path = config.data_ingestion.local_zip_file

with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=unzip_path)

Define a function to clean the ACS CSV files

In [12]:
def clean_acs_csv(df):
    """Reshape a raw ACS table into one row per ZCTA.

    The steps, in order:

    1. Rename the ``"Label (Grouping)"`` column to ``"ZCTA"`` and drop every
       column that is entirely empty.
    2. Shorten each column after the first by discarding its leading
       ``"!!"``-delimited segment and joining the remaining segments with
       spaces. When several columns end up with the same short name, only
       the first one is kept.
    3. Shift every column after the first up by one row, so values stored on
       the row below a label land on the label's own row.
    4. Keep only rows whose stripped label starts with ``"Z"`` (rows with a
       missing label are dropped) and reduce the label to its second
       whitespace-separated token.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as returned by ``pd.read_csv``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with a fresh ``0..n-1`` index.

    Raises
    ------
    IndexError
        If a retained label has fewer than two whitespace-separated tokens.
    """
    # rename/dropna return new frames, so the caller's raw frame is untouched.
    clean_df = (
        df.rename(columns={"Label (Grouping)": "ZCTA"})
        .dropna(axis=1, how="all")
    )

    # NOTE(review): assumes the label column is the first column — confirm
    # against the ACS export layout.
    value_cols = clean_df.columns[1:]

    # "GROUP!!Measure!!Detail" -> "Measure Detail".
    # NOTE(review): a header without "!!" is shortened to an empty string.
    short_names = {col: " ".join(col.split("!!")[1:]) for col in value_cols}
    clean_df = clean_df.rename(columns=short_names)

    # Keep the first occurrence of each column name; copy so the assignments
    # below operate on an independent frame.
    clean_df = clean_df.loc[:, ~clean_df.columns.duplicated()].copy()

    # Pull each row's values up by one so they sit next to their label.
    # NOTE(review): presumes exactly one data row follows each ZCTA label row.
    clean_df.iloc[:, 1:] = clean_df.iloc[:, 1:].shift(periods=-1)

    clean_df["ZCTA"] = clean_df["ZCTA"].str.strip()

    # na=False: a missing label yields NaN from startswith, and a NaN in a
    # boolean mask makes the row filter raise ValueError.
    is_zcta_row = clean_df["ZCTA"].str.startswith("Z", na=False)
    clean_df = clean_df.loc[is_zcta_row].copy()

    # Keep only the second token of the label, e.g. "ZCTA5 80002" -> "80002".
    clean_df["ZCTA"] = clean_df["ZCTA"].apply(lambda label: label.split()[1])

    return clean_df.reset_index(drop=True)

Clean the CSV files

In [13]:
# CSV 1 - ACS 'selected social characteristics'
# Read the raw export, then run it through the shared cleaner.
social_csv = 'artifacts/data_ingestion/acs_data/co_social.csv'
co_social = pd.read_csv(social_csv)
co_social_clean = clean_acs_csv(co_social)

In [14]:
# (rows, columns) of the cleaned social-characteristics table.
co_social_clean.shape

(530, 153)

In [15]:
# CSV 2 - ACS 'selected economic characteristics'
# Read the raw export, then run it through the shared cleaner.
econ_csv = 'artifacts/data_ingestion/acs_data/co_econ.csv'
co_econ = pd.read_csv(econ_csv)
co_econ_clean = clean_acs_csv(co_econ)

In [16]:
# (rows, columns) of the cleaned economic-characteristics table.
co_econ_clean.shape

(530, 136)

In [17]:
# CSV 3 - ACS 'selected housing characteristics'
# Read the raw export, then run it through the shared cleaner.
housing_csv = 'artifacts/data_ingestion/acs_data/co_housing.csv'
co_housing = pd.read_csv(housing_csv)
co_housing_clean = clean_acs_csv(co_housing)

In [18]:
# (rows, columns) of the cleaned housing-characteristics table.
co_housing_clean.shape

(530, 134)

In [19]:
# CSV 4 - ACS 'demographic & housing estimates'
# The 'Total housing units' column is removed before the shared cleaner runs.
co_demo_housing = (
    pd.read_csv('artifacts/data_ingestion/acs_data/co_demo_housing.csv')
    .drop(columns='Total housing units')
)
co_demo_housing_clean = clean_acs_csv(co_demo_housing)

In [20]:
# (rows, columns) of the cleaned demographic & housing table.
co_demo_housing_clean.shape

(530, 88)

Add the cleaned CSVs to the project's 'artifacts' folder

In [21]:
# Destination folder for the cleaned CSVs.
clean_dir = config.data_ingestion.acs_data_clean
create_directories([clean_dir])

[2024-07-09 18:53:13,988: INFO: common: created directory at: artifacts/data_ingestion/acs_data_clean]


In [22]:
# Write each cleaned table to the destination named in the config.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra unnamed first column — confirm downstream readers
# expect that before switching to index=False.
clean_outputs = (
    (co_social_clean, config.data_ingestion.acs_social_clean),
    (co_econ_clean, config.data_ingestion.acs_econ_clean),
    (co_housing_clean, config.data_ingestion.acs_housing_clean),
    (co_demo_housing_clean, config.data_ingestion.acs_demo_housing_clean),
)
for frame, destination in clean_outputs:
    frame.to_csv(destination)