# Instacart Market Basket Database Setup

## Download Dataset from Kagglehub

In [1]:
import kagglehub

# Kaggle dataset handle passed to kagglehub.
DATASET_HANDLE = "psparks/instacart-market-basket-analysis"

# Download the dataset; the returned value is the local folder holding the files.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/psparks/instacart-market-basket-analysis?dataset_version_number=1...


100%|██████████| 197M/197M [00:18<00:00, 11.1MB/s]

Extracting files...





Path to dataset files: C:\Users\rk\.cache\kagglehub\datasets\psparks\instacart-market-basket-analysis\versions\1


In [1]:
import os

# Local folder the dataset was downloaded into (machine-specific absolute path).
dataset_path = r"C:\Users\rk\.cache\kagglehub\datasets\psparks\instacart-market-basket-analysis\versions\1"

# Show the name of every entry found in the dataset folder, one per line.
entry_names = os.listdir(dataset_path)
for entry_name in entry_names:
    print(entry_name)

aisles.csv
departments.csv
orders.csv
order_products__prior.csv
order_products__train.csv
products.csv


## Load Dataset to PostgreSQL

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read DB config from environment
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME")

# Validate env vars early so a missing setting fails here, not deep inside the driver
missing = [k for k in ["DB_USER", "DB_PASSWORD", "DB_NAME"] if not os.getenv(k)]
if missing:
    raise RuntimeError(f"Missing env vars: {missing}")

# Create SQLAlchemy engine.
# The user and password are URL-encoded: characters such as '@', ':', '/' or '%'
# in a password would otherwise be parsed as URL delimiters and break the connection.
engine = create_engine(
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# Dataset path (machine-specific absolute path to the downloaded CSV files)
dataset_path = r"C:\Users\rk\.cache\kagglehub\datasets\psparks\instacart-market-basket-analysis\versions\1"

# (CSV file name, destination table name), listed lookup tables first.
files = [
    ('aisles.csv', 'aisles'),
    ('departments.csv', 'departments'),
    ('products.csv', 'products'),
    ('orders.csv', 'orders'),
    ('order_products__prior.csv', 'order_products_prior'),
    ('order_products__train.csv', 'order_products_train'),
]

# Rows per INSERT batch for the order_products_* tables.
LARGE_TABLE_CHUNKSIZE = 50_000

for csv_file, table_name in files:
    file_path = os.path.join(dataset_path, csv_file)
    print(f"Loading {csv_file} → {table_name}")

    df = pd.read_csv(file_path)

    # Options shared by every table. if_exists='replace' drops and recreates the table.
    # NOTE(review): presumably the drop fails once other tables hold foreign keys
    # to this one - verify before re-running this cell after constraints exist.
    to_sql_options = {"if_exists": "replace", "index": False}

    # The order_products_* tables are written in multi-row INSERT batches.
    if 'order_products' in table_name:
        to_sql_options.update(chunksize=LARGE_TABLE_CHUNKSIZE, method="multi")

    df.to_sql(table_name, engine, **to_sql_options)

    print(f"{len(df):,} rows loaded")

print("\nInstacart data loaded successfully!")

Loading aisles.csv → aisles
134 rows loaded
Loading departments.csv → departments
21 rows loaded
Loading products.csv → products
49,688 rows loaded
Loading orders.csv → orders
3,421,083 rows loaded
Loading order_products__prior.csv → order_products_prior
32,434,489 rows loaded
Loading order_products__train.csv → order_products_train
1,384,617 rows loaded

Instacart data loaded successfully!


## Database Constraints and Performance Indexes

In [None]:
import psycopg2

# Open a direct psycopg2 connection using the DB_* settings defined earlier in
# the notebook. The connection and cursor are left open on purpose: the
# test-query cell that follows reuses and then closes them.
conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD
)

cursor = conn.cursor()

# DDL statements, ordered so that every referenced primary key exists before
# the foreign keys that point at it.
sql_commands = [
    # Primary keys
    "ALTER TABLE aisles ADD PRIMARY KEY (aisle_id);",
    "ALTER TABLE departments ADD PRIMARY KEY (department_id);",
    "ALTER TABLE products ADD PRIMARY KEY (product_id);",
    "ALTER TABLE orders ADD PRIMARY KEY (order_id);",
    
    # Foreign keys - products
    "ALTER TABLE products ADD FOREIGN KEY (aisle_id) REFERENCES aisles(aisle_id);",
    "ALTER TABLE products ADD FOREIGN KEY (department_id) REFERENCES departments(department_id);",
    
    # Foreign keys - order_products_prior
    "ALTER TABLE order_products_prior ADD FOREIGN KEY (order_id) REFERENCES orders(order_id);",
    "ALTER TABLE order_products_prior ADD FOREIGN KEY (product_id) REFERENCES products(product_id);",
    
    # Foreign keys - order_products_train
    "ALTER TABLE order_products_train ADD FOREIGN KEY (order_id) REFERENCES orders(order_id);",
    "ALTER TABLE order_products_train ADD FOREIGN KEY (product_id) REFERENCES products(product_id);",
    
    # Indexes for performance
    "CREATE INDEX idx_orders_user ON orders(user_id);",
    "CREATE INDEX idx_op_prior_order ON order_products_prior(order_id);",
    "CREATE INDEX idx_op_prior_product ON order_products_prior(product_id);",
    "CREATE INDEX idx_products_aisle ON products(aisle_id);",
    "CREATE INDEX idx_products_dept ON products(department_id);",
]

print("Adding constraints and indexes...")
failed_commands = []
for i, cmd in enumerate(sql_commands, 1):
    try:
        cursor.execute(cmd)
        # psycopg2 runs statements inside an implicit transaction. Without a
        # commit the DDL is discarded when the connection is closed, and a
        # later rollback() would also undo every earlier successful statement.
        # Committing per statement makes each one durable on its own.
        conn.commit()
        print(f"[{i}/{len(sql_commands)}] {cmd[:70]}...")
    except psycopg2.Error as e:
        print(f"Error on command {i}: {e}")
        # Clear the aborted transaction so the remaining statements can run.
        conn.rollback()
        failed_commands.append(i)

# Only report success when every statement was actually applied.
if failed_commands:
    print(f"\nDatabase setup finished with errors on commands: {failed_commands}")
else:
    print("\nDatabase setup complete!")

Adding constraints and indexes...
✅ [1/15] ALTER TABLE aisles ADD PRIMARY KEY (aisle_id);...
✅ [2/15] ALTER TABLE departments ADD PRIMARY KEY (department_id);...
✅ [3/15] ALTER TABLE products ADD PRIMARY KEY (product_id);...
✅ [4/15] ALTER TABLE orders ADD PRIMARY KEY (order_id);...
✅ [5/15] ALTER TABLE products ADD FOREIGN KEY (aisle_id) REFERENCES aisles(aisl...
✅ [6/15] ALTER TABLE products ADD FOREIGN KEY (department_id) REFERENCES depart...
✅ [7/15] ALTER TABLE order_products_prior ADD FOREIGN KEY (order_id) REFERENCES...
✅ [8/15] ALTER TABLE order_products_prior ADD FOREIGN KEY (product_id) REFERENC...
✅ [9/15] ALTER TABLE order_products_train ADD FOREIGN KEY (order_id) REFERENCES...
✅ [10/15] ALTER TABLE order_products_train ADD FOREIGN KEY (product_id) REFERENC...
✅ [11/15] CREATE INDEX idx_orders_user ON orders(user_id);...
✅ [12/15] CREATE INDEX idx_op_prior_order ON order_products_prior(order_id);...
✅ [13/15] CREATE INDEX idx_op_prior_product ON or

## Test with a Sample Query

In [None]:
# Sanity check: count order_products_prior rows per product name and keep the top 5.
top_products_sql = """
    SELECT p.product_name, COUNT(*) as order_count
    FROM order_products_prior op
    JOIN products p ON op.product_id = p.product_id
    GROUP BY p.product_name
    ORDER BY order_count DESC
    LIMIT 5;
"""

print("TEST QUERY - Top 5 Most Ordered Products:")
cursor.execute(top_products_sql)

# Each result row is a (product_name, order_count) pair, in the SELECT's column order.
for product_name, order_count in cursor.fetchall():
    print(f"  {product_name:45} {order_count:>10,} orders")

# Done with the database: release the cursor, then the connection.
cursor.close()
conn.close()

print("\nDatabase ready for text-to-SQL agent!")


TEST QUERY - Top 5 Most Ordered Products:
  Banana                                           472,565 orders
  Bag of Organic Bananas                           379,450 orders
  Organic Strawberries                             264,683 orders
  Organic Baby Spinach                             241,921 orders
  Organic Hass Avocado                             213,584 orders

Database ready for text-to-SQL agent!
