In [None]:
!!pip install --upgrade pip
!!pip install psycopg2-binary
!!pip install SQLAlchemy
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import os
import socket

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [None]:
# Quick TCP-level reachability probe for the database server, run before any
# database driver is involved so network problems are easy to tell apart from
# authentication problems.
host = "awesome-hw.sdsc.edu"
port = 5432  # PostgreSQL default port

try:
    # The `with` block closes the probe socket as soon as the check is done;
    # without it the connection stays open until garbage collection.
    with socket.create_connection((host, port), timeout=5):
        print("✅ Host is reachable — network connection works.")
except OSError as e:
    # DNS failures, refused connections and timeouts are all OSError subclasses.
    print("❌ Cannot reach host:", e)

In [None]:
# Connection parameters
host = "awesome-hw.sdsc.edu"
port = 5432
database = "nourish"
# Credentials can be supplied through the NOURISH_DB_USER / NOURISH_DB_PASSWORD
# environment variables so real secrets never have to be saved in the notebook.
# The fallbacks are the original placeholders: replace them (or set the
# variables) with the values you were given.
user = os.environ.get("NOURISH_DB_USER", "username")  # <–– given this by TA
password = os.environ.get("NOURISH_DB_PASSWORD", "password")  # <–– your PostgreSQL password

# Build the URL from its components instead of an f-string: URL.create escapes
# special characters (e.g. "@", "/", ":") in the user name and password, which
# would otherwise corrupt the connection string.
connection_url = URL.create(
    drivername="postgresql+psycopg2",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database,
)

# Create the connection engine (no connection is opened until first use)
engine = create_engine(connection_url)

# Test the connection: open one and release it immediately
try:
    with engine.connect():
        print("✅ Connected successfully!")
except Exception as e:
    print("❌ Connection failed:", e)

In [None]:
# List every table in the `public` schema, sorted by name.
# (The stray backslash line-continuation that used to sit inside the SQL
# literal has been removed; it only glued trailing whitespace onto the query.)
query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'public'
        ORDER BY table_name;
        """

tables = pd.read_sql(query, engine)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text (and possibly truncated) print() form.
tables

In [None]:
# Widen pandas' display limits so the preview below is not truncated.
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.max_rows = 50  # allow up to 50 rows

# Pull a 50-row sample; the double quotes preserve the mixed-case table name.
query = 'SELECT * FROM "FoodKG" LIMIT 50;'
df = pd.read_sql(sql=query, con=engine)

# Render the sample as a rich table.
display(df)

In [None]:
# LIMIT 0 returns an empty result set that still carries the column
# metadata, so the column names are obtained without transferring any rows.
empty_frame = pd.read_sql(sql='SELECT * FROM "FoodKG" LIMIT 0;', con=engine)
columns = empty_frame.columns
print(columns)

In [None]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")

# Common measurement units to remove
units = ['c', 'cup', 'cups', 'tbsp', 'tsp', 'oz', 'g', 'kg', 'ml', 'l', 'pinch', 'slice', 'clove', 'cloves']
pattern_units = r'\b(?:' + '|'.join(units) + r')\b'

# Compile the patterns once, outside the per-ingredient loop.
_NUMBERS_RE = re.compile(r'\d+\/\d+|\d+')       # fractions such as 1/2, then plain integers
_UNITS_RE = re.compile(pattern_units)           # whole-word measurement units
_NON_LETTERS_RE = re.compile(r'[^a-zA-Z\s]')    # punctuation and any other non-letter
_WHITESPACE_RE = re.compile(r'\s+')             # runs of whitespace


def _extract_ingredient_nouns(raw_ingredient):
    """Reduce one raw ingredient string to its nouns.

    The text is lowercased; numbers/fractions, measurement units and
    non-letter characters are stripped; whitespace is collapsed; and the
    tokens spaCy tags as ``NOUN`` are joined with single spaces.

    Parameters
    ----------
    raw_ingredient : str
        One ingredient entry, e.g. ``"2 cups chopped onions"``.

    Returns
    -------
    str
        The space-joined nouns, or ``''`` when the entry is not a string,
        is empty after cleaning, or contains no token tagged ``NOUN``.
    """
    if not isinstance(raw_ingredient, str):
        # e.g. a NULL element inside the list: nothing to clean.
        return ''

    text = raw_ingredient.lower()
    text = _NUMBERS_RE.sub('', text)
    text = _UNITS_RE.sub('', text)
    text = _NON_LETTERS_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    if not text:
        # Nothing left after cleaning, so skip the (comparatively slow) spaCy call.
        return ''

    # use SpaCy to extract nouns
    return ' '.join(token.text for token in nlp(text) if token.pos_ == 'NOUN')


# Process every recipe
# NOTE(review): assumes each `ingredients` value is a list of strings (or NULL);
# a plain string would be iterated character by character — confirm the column type.
all_cleaned_ingredients = []

for ingredient_list in df['ingredients']:
    recipe_ingredients = []
    # A NULL `ingredients` value arrives as None; treat it as an empty recipe
    # instead of crashing on iteration.
    for ing in (ingredient_list if ingredient_list is not None else []):
        nouns = _extract_ingredient_nouns(ing)
        if nouns:
            recipe_ingredients.append(nouns)

    all_cleaned_ingredients.append(recipe_ingredients)

# Add it back to the dataframe as a new column
df['cleaned_ingredients'] = all_cleaned_ingredients

# Display first 5 recipes
df[['ingredients', 'cleaned_ingredients']].head()


In [None]:
# Ask the database for the total number of rows in "FoodKG".
query = 'SELECT COUNT(*) AS total_rows FROM "FoodKG";'
result = pd.read_sql(sql=query, con=engine)

# The result is a single-row frame; pull the count out of its first row.
total_rows_column = result["total_rows"]
print(total_rows_column.iloc[0])
