# Exercise 4.4 – Data Wrangling & Subsetting
This notebook practices key data wrangling skills: dropping columns, renaming columns, checking data types, handling missing values, and creating subsets. It also creates a data dictionary from a transposed CSV file.

## 1. Imports and Folder Path

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

# Set up the project path.
# The INSTACART_PROJECT_PATH environment variable, when set, overrides the
# default location so the notebook can run on machines other than the author's.
path = os.environ.get('INSTACART_PROJECT_PATH', '/Users/josephadamski/Instacart Basket Analysis')

## 2. Importing Orders and Products Data

In [28]:
# Folder that holds the untouched source CSV files
original_dir = os.path.join(path, 'Data', 'Original Data')

# Columns to read from orders.csv; any other column in the file is not loaded
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]
df_ords = pd.read_csv(os.path.join(original_dir, 'orders.csv'), usecols=vars_list)

# Read products.csv with all of its columns
df_prods = pd.read_csv(os.path.join(original_dir, 'products.csv'))

## 3. Initial Exploration

In [32]:
# Preview the first five rows of the orders data
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [33]:
# Preview the first 20 rows of the products data
df_prods.head(n=20)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
5,6,Dry Nose Oil,11,11,2.6
6,7,Pure Coconut Water With Orange,98,7,4.4
7,8,Cut Russet Potatoes Steam N' Mash,116,1,1.1
8,9,Light Strawberry Blueberry Yogurt,120,16,7.0
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4


In [34]:
# Preview the last 35 rows of the products data
df_prods.tail(n=35)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49658,49654,"Teriyaki Sauce, Sesame, Original",5,13,4.0
49659,49655,Apple Cider,98,7,10.7
49660,49656,Masada Kosher Pocket Bread,128,3,7.1
49661,49657,Cabernet Tomatoes,83,4,8.3
49662,49658,Brie with Herbs Foil Wedge,2,16,3.9
49663,49659,Organic Creamed Coconut,17,13,3.1
49664,49660,Professionals Sleek Shampoo,22,11,6.7
49665,49661,Porto,134,5,8.2
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49667,49663,Ultra Protein Power Crunch Peanut Butter N' Ho...,57,14,10.2


In [35]:
# List the column labels of the products DataFrame
df_prods.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices'], dtype='object')

In [36]:
# Show the dimensions of the products DataFrame as (number of rows, number of columns)
df_prods.shape

(49693, 5)

## 4. Data Cleaning and Transformation

In [37]:
# Check for missing values in 'days_since_prior_order' (NaN is counted too).
# print() is needed because only the last expression of a cell is displayed;
# as a bare expression in the middle of the cell the result was discarded.
print(df_ords['days_since_prior_order'].value_counts(dropna=False))

# Rename 'order_dow' to 'orders_day_of_week' (reassign instead of mutating in place)
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

# Convert the identifier columns 'order_id' and 'user_id' to string
df_ords = df_ords.astype({'order_id': 'str', 'user_id': 'str'})

## 5. Transposing Departments Data and Creating a Dictionary

In [38]:
# Read departments.csv and swap its rows and columns; reset_index() moves the
# transposed row labels into an ordinary first column
departments_file = os.path.join(path, 'Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_file)
df_dep_t = df_dep.T.reset_index()

# Promote the first row to column headers and keep the remaining rows as data
new_header = df_dep_t.iloc[0]
df_dep_t_new = df_dep_t.iloc[1:]
df_dep_t_new.columns = new_header

# Build the data dictionary: one entry per row, keyed by that row's index label.
# NOTE(review): the labels are the integers left by reset_index() (1, 2, ...),
# not the 'department_id' values - confirm this is the intended key.
data_dict = df_dep_t_new.to_dict(orient='index')

## 6. Subsetting DataFrames

In [39]:
# Breakfast subset: find the department's id in the lookup table, then keep matching products
breakfast_mask = df_dep_t_new['department'] == 'breakfast'
breakfast_id = int(df_dep_t_new.loc[breakfast_mask, 'department_id'].iloc[0])
df_breakfast = df_prods.loc[df_prods['department_id'] == breakfast_id]

# Dinner party subset: same approach for several departments at once
dinner_departments = ['alcohol', 'deli', 'beverages', 'meat/seafood']
dinner_mask = df_dep_t_new['department'].isin(dinner_departments)
dinner_ids = df_dep_t_new.loc[dinner_mask, 'department_id'].astype(int).tolist()
df_dinner = df_prods.loc[df_prods['department_id'].isin(dinner_ids)]

# Tasks

### Task 4 – Busiest Hour

In [40]:
# Tally orders for each hour of the day and list them in hour order (0-23)
df_ords['order_hour_of_day'].value_counts(sort=False).sort_index()

order_hour_of_day
0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: count, dtype: int64

### Task 5 – Lookup for Department ID 4

In [41]:
# Use data dictionary to look up department 4.
# data_dict is keyed by the integer row labels left by reset_index(), so the
# string '4' never matches and .get('4') silently returns None. Try the
# integer key first and fall back to the string form.
dept_id = 4
data_dict.get(dept_id, data_dict.get(str(dept_id)))

### Task 10 – Behavior Summary for User 1

In [42]:
# Define safe_mode to avoid empty idxmax error
def safe_mode(series):
    """Return the most frequent non-null value of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise; missing entries (NaN/None) are ignored.

    Returns
    -------
    object or None
        The value with the highest count, as a native Python scalar when the
        underlying value is a NumPy scalar (e.g. ``np.int64`` -> ``int``).
        ``None`` when no non-null values remain.
    """
    counts = series.dropna().value_counts()
    if counts.empty:
        # idxmax() raises on an empty Series, so report "no mode" instead
        return None
    top = counts.idxmax()
    # Unwrap NumPy scalars so callers get plain int/float values that print
    # and serialise cleanly; other label types are returned unchanged
    return top.item() if isinstance(top, np.generic) else top

# Collect every order placed by user '1' (user_id is compared as a string)
user1 = df_ords.loc[df_ords['user_id'] == '1']
user1_has_orders = not user1.empty
user1_gaps = user1['days_since_prior_order']

# Summary of the user's ordering behaviour; the gap statistics are None when
# the user has no orders at all
stats_user1 = dict(
    n_orders=int(len(user1)),
    mean_days_between_orders=float(user1_gaps.mean()) if user1_has_orders else None,
    median_days_between_orders=float(user1_gaps.median()) if user1_has_orders else None,
    busiest_day_of_week=safe_mode(user1['orders_day_of_week']),
    busiest_hour=safe_mode(user1['order_hour_of_day']),
)
stats_user1

{'n_orders': 11,
 'mean_days_between_orders': 19.0,
 'median_days_between_orders': 19.5,
 'busiest_day_of_week': np.int64(4),
 'busiest_hour': np.int64(8)}

## 7. Exporting Cleaned Data

In [44]:
# Make sure the output folder exists, then write both wrangled tables as CSV
# without the DataFrame index
prepared_dir = os.path.join(path, 'Data', 'Prepared Data')
os.makedirs(prepared_dir, exist_ok=True)

orders_out = os.path.join(prepared_dir, 'orders_wrangled.csv')
departments_out = os.path.join(prepared_dir, 'departments_wrangled.csv')
df_ords.to_csv(orders_out, index=False)
df_dep_t_new.to_csv(departments_out, index=False)

## 8. Summary
This notebook completes the core wrangling and subsetting tasks for Exercise 4.4. I followed the steps from the course PDF to transform, clean, and organize the data for further analysis.