#### retrieve_food_product
Retrieves the `usda_2022_branded_food_product` table from the SQL server and saves it locally as a CSV.
Only the columns required for downstream processing (`fdc_id` and `ingredients`) are saved, and rows where either column is null are excluded.

In [18]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load variables from the .env file into the environment so that the
# os.getenv() lookups for the POSTGRES_* settings below can find them.
load_dotenv()  # Take environment variables from .env

True

In [19]:
# Data path: where the retrieved table is written as CSV
output_path = "../data/raw/usda_2022_branded_food_product_INGREDIENTS_ONLY.csv"
# Credentials, read from the environment (never hardcoded in the notebook)
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DATABASE")
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
# Build the connection URL from its components instead of formatting a string:
# URL.create escapes special characters (e.g. "@", ":", "/") in the user name
# and password, which would otherwise corrupt the URL.
connection_url = URL.create(
    drivername="postgresql+psycopg2",
    username=user,
    password=password,
    host=host,
    # An unset port falls back to the driver default rather than becoming the text "None"
    port=int(port) if port else None,
    database=database,
)
# Create the connection engine
engine = create_engine(connection_url)
# Test the connection
try:
    with engine.connect():
        print("✅ Connected successfully!")
except Exception as e:
    print("❌ Connection failed:", e)
    # Stop here: every later cell needs a working connection, so continuing
    # would only surface the same problem with a less useful message.
    raise

✅ Connected successfully!


In [20]:
# The table is wide, so select only the two columns needed and drop rows with a
# null in either one on the database side (pre-filter) before loading.
# Note: the backslash inside the literal is a line continuation, so the SQL text
# ends with spaces after the semicolon rather than a newline.
query = """
        SELECT fdc_id,
               ingredients
        FROM usda_2022_branded_food_product
        WHERE fdc_id IS NOT NULL
          AND ingredients IS NOT NULL; \
        """
df = pd.read_sql(sql=query, con=engine)

In [21]:
# Sanity check: confirm the frame contains exactly the columns selected by the query
df.columns

Index(['fdc_id', 'ingredients'], dtype='object')

In [22]:
# Sanity check: how many rows came back from the query
df.shape[0]

1702125

In [23]:
# Order rows by fdc_id, breaking ties on ingredients, so that later
# processing can work through the data in id order.
sorted_df = df.sort_values(
    ["fdc_id", "ingredients"],
)

In [25]:
# Show the last five rows; note that fdc_id is in sorted order
sorted_df.iloc[-5:]

Unnamed: 0,fdc_id,ingredients
144015,2340755,"FILTERED WATER, ORGAIN ORGANIC PROTEIN BLEND (..."
144016,2340756,"FILTERED WATER, ORGAIN ORGANIC PROTEIN BLEND (..."
144017,2340757,"FILTERED WATER, ORGAIN ORGANIC PROTEIN BLEND (..."
144018,2340758,"FILTERED WATER, ORGAIN ORGANIC PROTEIN BLEND (..."
144019,2340759,"COLOMBIAN COFFEE, MILK PROTEIN ISOLATE, COCOA ..."


In [26]:
# Save the sorted frame as CSV (without the index column). to_csv does not
# create missing folders, so make sure the target directory exists first.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
sorted_df.to_csv(output_path, index=False)