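Goal: Retrieve the `fdc_id` and `description` columns of the `usda_2022_food_branded_experimental` table from PostgreSQL and save them to a CSV file.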

In [1]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load variables from a local .env file so the os.getenv() calls in the next
# cell can read the database credentials.
load_dotenv()  # Take environment variables from .env

True

In [2]:
# Credentials are read from the environment (see the os.getenv() calls below).
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DATABASE")
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")

# Fail fast with a readable message instead of building a URL that contains
# the literal text "None" for an unset variable.
missing_vars = [
    name
    for name, value in (
        ("POSTGRES_HOST", host),
        ("POSTGRES_DATABASE", database),
        ("POSTGRES_USER", user),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# Create the connection engine.
# URL.create() escapes each component, so a password containing characters
# such as "@", ":", "/" or "%" can no longer corrupt the connection string
# (which plain f-string interpolation would allow).
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=user,
        password=password,
        host=host,
        port=int(port) if port else None,  # an unset port is omitted from the URL
        database=database,
    )
)

# Test the connection
try:
    with engine.connect() as conn:
        print("✅ Connected successfully!")
except Exception as e:
    print("❌ Connection failed:", e)
    # Re-raise so "Run All" stops here, at the real cause, rather than
    # continuing into the query cell with an unusable engine.
    raise

✅ Connected successfully!


In [3]:
# The table has many columns, so select only the two that are needed and
# filter out rows with a missing id or description on the database side.
# (No trailing ";" or backslash continuation inside the literal: a single
# statement passed to the driver needs neither.)
query = """
    SELECT fdc_id,
           description
    FROM usda_2022_food_branded_experimental
    WHERE fdc_id IS NOT NULL
      AND description IS NOT NULL
"""
df = pd.read_sql(query, engine)

In [4]:
# Sanity check: show which columns the query returned.
df.columns

Index(['fdc_id', 'description'], dtype='object')

In [5]:
df.shape[0]  # number of rows returned by the query

1766279

In [6]:
# Order rows by id, then by description, purely so the exported file is easier to read.
sorted_df = df.sort_values(["fdc_id", "description"])

In [7]:
# Save the sorted table as CSV (without the DataFrame index column).
output_path = "data/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"
# to_csv() raises OSError when the parent directory is missing, so create it
# first; exist_ok=True makes re-running this cell safe.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
sorted_df.to_csv(output_path, index=False)

In [8]:
sorted_df.iloc[-5:]  # last five rows; note the ids are in ascending order

Unnamed: 0,fdc_id,description
1766274,2353625,Feed efficiency of lactating Holstein cows is ...
1766275,2353626,Contrast Study on Secondary Metabolite Profile...
1766276,2353627,Deriving information from complex data sets: I...
1766277,2353628,Identification of High and Low Branched-Chain ...
1766278,2353629,Effects of Differences in Resistant Starch Con...
