# Section 1: Wrangling Procedures

In [1]:
import pandas as pd
import numpy as np
import os


In [3]:
path = '/Users/dela/Documents/15-01-2025 Instacart Basket Analysis'


In [5]:
# Load orders.csv
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'))

# Load products.csv
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'))

# Load departments.csv
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '4.4_departments.csv'))


In [7]:
# Drop the 'eval_set' column
df_ords = df_ords.drop(columns=['eval_set'])


In [9]:
# Rename 'order_dow' to 'order_day_of_week'
df_ords.rename(columns={'order_dow': 'order_day_of_week'}, inplace=True)


In [11]:
# Convert 'order_id' to string
df_ords['order_id'] = df_ords['order_id'].astype('str')


In [13]:
# Transpose the departments dataframe
df_dep_t = df_dep.T

# Set the first row as the new header
new_header = df_dep_t.iloc[0]
df_dep_t_new = df_dep_t[1:]
df_dep_t_new.columns = new_header


In [15]:
# Convert the departments dataframe to a dictionary
data_dict = df_dep_t_new.to_dict('index')


# Section 2: Task-Specific Steps


## Step 2: Identify and Convert a Numeric Identifier

In [17]:
# Convert 'order_id' to string
df_ords['order_id'] = df_ords['order_id'].astype('str')


## Step 3: Rename an Unintuitive Column

In [19]:
# Rename 'order_dow' to 'order_day_of_week'
df_ords_renamed = df_ords.rename(columns={'order_dow': 'order_day_of_week'})


## Step 4: Identify the Busiest Hour for Orders

In [21]:
# Find the frequency of each hour
hourly_orders = df_ords['order_hour_of_day'].value_counts()

# Identify the busiest hour and its frequency
busiest_hour = hourly_orders.idxmax()
busiest_hour_frequency = hourly_orders.max()

print(f"The busiest hour for orders is {busiest_hour} with {busiest_hour_frequency} orders.")


The busiest hour for orders is 10 with 288418 orders.


## Step 5: Use Data Dictionary to Find Meaning of department_id = 4

In [23]:
# Find the meaning of department_id 4
department_4 = data_dict.get('4', {}).get('department', 'Unknown')
print(f"Department 4 refers to: {department_4}")


Department 4 refers to: produce


## Step 6: Create a Subset for Breakfast Items

In [25]:
# Subset for breakfast department
df_breakfast = df_prods[df_prods['department_id'] == 4]

# Check the first few rows of the subset
df_breakfast.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
30,31,White Pearl Onions,123,4,7.5
42,43,Organic Clementines,123,4,11.5
44,45,European Cucumber,83,4,14.3
65,66,European Style Spring Mix,123,4,11.7
88,89,Yogurt Fruit Dip Sliced Apples,123,4,12.6


## Step 7: Create a Subset for Dinner Party Items

In [27]:
# Define the target departments
target_departments = ['alcohol', 'deli', 'beverages', 'meat/seafood']

# Get department_ids for the target departments
target_ids = [key for key, value in data_dict.items() if value.get('department') in target_departments]

# Subset for dinner party items
df_dinner_party = df_prods[df_prods['department_id'].isin(target_ids)]

# Check the first few rows of the subset
df_dinner_party.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices


## Step 8: Count Rows in the Last Subset

In [29]:
# Count the total rows in the dinner party subset
total_rows = df_dinner_party.shape[0]
print(f"The dinner party subset contains {total_rows} rows.")


The dinner party subset contains 0 rows.


## Step 9: Extract Information for user_id = 1

In [31]:
# Subset for user_id = 1
df_user_1 = df_ords[df_ords['user_id'] == 1]

# Display all information for this user
df_user_1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## Step 10: Provide Basic Stats About User Behavior

In [33]:
# Basic statistics for user_id = 1
total_orders = df_user_1.shape[0]
busiest_order_hour = df_user_1['order_hour_of_day'].value_counts().idxmax()
most_common_day = df_user_1['order_day_of_week'].value_counts().idxmax()

print(f"User 1 has placed a total of {total_orders} orders.")
print(f"The busiest hour for User 1's orders is {busiest_order_hour}.")
print(f"The most common day for User 1's orders is {most_common_day}.")


User 1 has placed a total of 11 orders.
The busiest hour for User 1's orders is 8.
The most common day for User 1's orders is 4.


## Step 12: Export df_ords

In [35]:
# Export df_ords
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index=False)
print("df_ords exported as 'orders_wrangled.csv'.")


df_ords exported as 'orders_wrangled.csv'.


## Step 13: Export df_dep_t_new

In [37]:
# Export df_dep_t_new
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'), index=False)
print("df_dep_t_new exported as 'departments_wrangled.csv'.")


df_dep_t_new exported as 'departments_wrangled.csv'.
