In [0]:
import os

# Absolute Workspace locations of the two input CSVs; report whether each one is present
menu_items_path = "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/menu_items.csv"
order_details_path = "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/order_details.csv"

for csv_path in (menu_items_path, order_details_path):
    # The label is the file's base name, e.g. "menu_items.csv exists: True"
    print(f"{os.path.basename(csv_path)} exists:", os.path.exists(csv_path))

In [0]:
%pip freeze > requirements.txt
# The line above redirects the output of `pip freeze` into requirements.txt
# (a relative path, so the file is written to the current working directory).

In [0]:
import os
import random
import numpy as np

# Pin every source of randomness used by this notebook to the same fixed seed (0).
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so this
# assignment presumably only influences processes launched afterwards — confirm.
os.environ.update({'PYTHONHASHSEED': '0'})
np.random.seed(0)
random.seed(0)

print("Random seeds set for reproducibility.")

In [0]:
import hashlib
import json
import os
from datetime import datetime

# Absolute Workspace paths of the input files to fingerprint
csv_files = [
    "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/menu_items.csv",
    "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/order_details.csv"
]

# Map each file's base name to the SHA-256 hex digest of its raw bytes,
# or to None when the file is not there.
data_hashes = {}
for csv_file in csv_files:
    base_name = os.path.basename(csv_file)
    if not os.path.exists(csv_file):
        data_hashes[base_name] = None
        print(f"File not found: {csv_file}")
        continue
    with open(csv_file, 'rb') as f:
        data_hashes[base_name] = hashlib.sha256(f.read()).hexdigest()

# A timestamp in the file name keeps earlier hash manifests from being overwritten
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'data_hashes_{timestamp}.json'

# Serialise once and reuse the same text for the file and the console
hashes_as_json = json.dumps(data_hashes, indent=2)
with open(filename, 'w') as f:
    f.write(hashes_as_json)

print(f"SHA-256 hashes computed and saved to {filename}:")
print(hashes_as_json)

In [0]:
import pandas as pd

# Absolute Workspace paths of the two source CSVs
menu_items_path = "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/menu_items.csv"
order_details_path = "/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/data/order_details.csv"

# Each file is loaded independently: a failure is reported and leaves that frame
# as None, which the later cells test for before using it.
try:
    menu_items_df = pd.read_csv(menu_items_path)
except Exception as load_error:
    menu_items_df = None
    print(f"Failed to load {menu_items_path}: {load_error}")
else:
    print(f"Loaded {menu_items_path} with shape {menu_items_df.shape}")

try:
    order_details_df = pd.read_csv(order_details_path)
except Exception as load_error:
    order_details_df = None
    print(f"Failed to load {order_details_path}: {load_error}")
else:
    print(f"Loaded {order_details_path} with shape {order_details_df.shape}")

In [0]:
def clean_dataframe(df):
    """Normalise a freshly loaded DataFrame in place and return it.

    Three passes are applied, in this order:

    1. Every ``object`` column has its non-null values converted to ``str``
       and stripped of surrounding whitespace. Missing values stay missing
       instead of being turned into the literal text ``'nan'``.
    2. Every column whose name contains ``date`` or ``time``
       (case-insensitive) is parsed with ``pd.to_datetime(errors='coerce')``,
       so unparseable entries become ``NaT``.
    3. Every column that is still ``object`` afterwards is converted with
       ``pd.to_numeric`` when all of its values parse as numbers; otherwise
       the column is left exactly as it was.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so callers can write
        ``df = clean_dataframe(df)``.
    """
    # Trim whitespace from string columns
    for col in df.select_dtypes(include=['object']).columns:
        values = df[col]
        stripped = values.astype(str).str.strip()
        # astype(str) would turn NaN/None into the text 'nan'; put the original
        # missing markers back so nulls remain detectable downstream.
        df[col] = stripped.where(values.notna(), values)

    # Convert date columns to datetime (selected purely by column name)
    for col in df.columns:
        lowered = col.lower()
        if 'date' in lowered or 'time' in lowered:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                print(f"Converted {col} to datetime.")
            except Exception as e:
                print(f"Could not convert {col} to datetime: {e}")

    # Convert numeric columns to appropriate types.
    # errors='ignore' is deprecated in pandas; catching the parse failure
    # explicitly gives the same "leave the column alone" result without it.
    for col in df.select_dtypes(include=['object']).columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # At least one value is not numeric: keep the column as text
            continue
    return df

# Clean whichever frames were loaded successfully and preview the first rows of each
if menu_items_df is not None:
    menu_items_df = clean_dataframe(menu_items_df)
    print("menu_items_df cleaned sample:", menu_items_df.head(), sep="\n")
if order_details_df is not None:
    order_details_df = clean_dataframe(order_details_df)
    print("order_details_df cleaned sample:", order_details_df.head(), sep="\n")


In [0]:
# Inner-join the menu onto the order lines, matching
# menu_items_df.menu_item_id against order_details_df.item_id
if menu_items_df is None or order_details_df is None:
    print("Cannot join: one or both DataFrames are missing.")
else:
    joined_df = menu_items_df.merge(
        order_details_df,
        how='inner',
        left_on='menu_item_id',
        right_on='item_id',
    )
    print(f"Joined DataFrame shape: {joined_df.shape}")
    print("Joined DataFrame sample:", joined_df.head(), sep="\n")


In [0]:
if 'joined_df' not in locals():
    print("joined_df is not defined. Please run the join cell first.")
else:
    # Order frequency = number of joined rows per item; keep the five largest counts
    rows_per_item = joined_df.groupby(['item_name', 'item_id', 'menu_item_id']).size()
    top_items = rows_per_item.sort_values(ascending=False).head(5)
    top_items_df = top_items.reset_index(name='order_count')
    print("Top 5 items by order frequency:", top_items_df, sep="\n")


In [0]:
if 'joined_df' not in locals():
    print("joined_df is not defined. Please run the join cell first.")
else:
    # There is no quantity column, so each joined row is assumed to be one unit
    # sold and its revenue is simply the item's price.
    joined_df['revenue'] = joined_df['price']
    # Total revenue per category, largest first
    category_totals = joined_df.groupby('category')['revenue'].sum()
    revenue_by_category = category_totals.sort_values(ascending=False)
    revenue_by_category_df = revenue_by_category.reset_index()
    print("Revenue by category:", revenue_by_category_df, sep="\n")


In [0]:
if 'joined_df' not in locals():
    print("joined_df is not defined. Please run the join cell first.")
else:
    # 'order_time' takes precedence; 'order_date' is only a fallback
    hour_source = next(
        (name for name in ('order_time', 'order_date') if name in joined_df.columns),
        None,
    )
    if hour_source is None:
        print("No time or date column found in joined_df.")
        joined_df['hour'] = None
    else:
        # Unparseable values become NaT and therefore yield a missing hour
        joined_df['hour'] = pd.to_datetime(joined_df[hour_source], errors='coerce').dt.hour

    if not joined_df['hour'].notnull().any():
        print("Could not extract hour information from time/date columns.")
    else:
        # Count joined rows per hour and keep the single largest group
        rows_per_hour = joined_df.groupby('hour').size()
        busiest_hour = rows_per_hour.sort_values(ascending=False).head(1)
        busiest_hour_df = busiest_hour.reset_index(name='order_count')
        print("Busiest hour of day:", busiest_hour_df, sep="\n")


In [0]:
import os
from datetime import datetime

# Export the busiest-hour metric as a CSV in the Workspace output folder.
# The timestamp in the file name keeps earlier exports from being overwritten.
output_dir = '/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/notebooks/lab2_4_etl_output_metrics'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = f'{output_dir}/busiest_hour_of_day_{timestamp}.csv'

# busiest_hour_df is only created when the metric cell managed to extract an
# hour, so referencing it unconditionally raises NameError on a run where that
# step was skipped. Guard it the same way the top-5 export does.
if 'busiest_hour_df' in locals() and not busiest_hour_df.empty:
    # Create the folder only when there is actually something to write
    os.makedirs(output_dir, exist_ok=True)
    busiest_hour_df.to_csv(output_path, index=False)
    print(f"Busiest hour of day saved to {output_path}")
else:
    print("busiest_hour_df is not defined or empty. Please run the metric cell first.")


In [0]:
import os
from datetime import datetime

# Export the revenue-by-category metric as a CSV in the Workspace output folder.
# The timestamp in the file name keeps earlier exports from being overwritten.
output_dir = '/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/notebooks/lab2_4_etl_output_metrics'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = f'{output_dir}/revenue_by_category_{timestamp}.csv'

# revenue_by_category_df only exists if the revenue cell ran against a joined
# frame; referencing it unconditionally raises NameError otherwise. Guard it
# the same way the top-5 export does.
if 'revenue_by_category_df' in locals() and not revenue_by_category_df.empty:
    # Create the folder only when there is actually something to write
    os.makedirs(output_dir, exist_ok=True)
    revenue_by_category_df.to_csv(output_path, index=False)
    print(f"Revenue by category saved to {output_path}")
else:
    print("revenue_by_category_df is not defined or empty. Please run the metric cell first.")


In [0]:
import os
from datetime import datetime

# Destination folder inside the Workspace (created if it does not exist yet) and
# a timestamped file name so earlier exports are never overwritten
output_dir = '/Workspace/Users/bab045@ensign.edu/-csai382_lab_2_4_-Batchimeg-/notebooks/lab2_4_etl_output_metrics'
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = '/'.join([output_dir, f'top_5_items_by_order_frequency_{timestamp}.csv'])

# Write the CSV only when the metric cell produced a non-empty result
if 'top_items_df' not in locals() or top_items_df.empty:
    print("top_items_df is not defined or empty. Please run the metric cell first.")
else:
    top_items_df.to_csv(output_path, index=False)
    print(f"Top 5 items by order frequency saved to {output_path}")
