#STRATIFY

DECODING THE BASKET 

TASK 1 Exploratory Data Analysis

This task will help you understand customer purchasing behavior, product popularity, and
temporal ordering patterns, which helps in optimizing inventory, personalizing marketing
strategies, and improving customer retention through better product recommendations and
timing of promotions. The analysis covers:
1. Total number of orders and unique customers
2. Number of unique products, aisles, and departments
3. Top 20 most frequently ordered products
4. Top departments and aisles by order volume
5. Identify most reordered products (products with highest reorder rate)
6. Average basket size (number of items per order)
7. Order trends by hour of the day, day of the week, and days since prior order



In [1]:
# =========================
# Task 1: Exploratory Data Analysis
# =========================

!pip show pandas

# Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print(pd.__version__)


# Load datasets
orders = pd.read_csv("order_metadata.csv")
order_items = pd.read_csv("order_items_train.csv")
products = pd.read_csv("product_catalog.csv")
departments = pd.read_csv("department_info.csv")
aisles = pd.read_csv("aisle_info.csv")

# Merge datasets for easier analysis
# Every join below is a lookup: many left-hand rows map to at most one
# right-hand row. validate="many_to_one" makes pandas raise MergeError if a key
# is duplicated on the right side, instead of silently duplicating line items
# and inflating every count computed from `order_items` afterwards.
product_merged = (
    products
    .merge(departments, on="department_id", how="left", validate="many_to_one")
    .merge(aisles, on="aisle_id", how="left", validate="many_to_one")
)

# NOTE: `order_items` is rebound to the enriched frame (product, department,
# aisle and order columns attached); the raw line items are re-read from disk
# whenever this cell is re-run.
order_items = (
    order_items
    .merge(product_merged, on="product_id", how="left", validate="many_to_one")
    .merge(orders, on="order_id", how="left", validate="many_to_one")
)


Name: pandas
Version: 2.2.2
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2011-2023, Open source contributors.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be u

ModuleNotFoundError: No module named 'pandas'

In [None]:
#1. Total number of orders and unique customers

# Distinct ids for both columns in one call; .tolist() yields plain Python ints.
total_orders, unique_customers = orders[["order_id", "user_id"]].nunique().tolist()

print(f"Total Orders: {total_orders}")
print(f"Unique Customers: {unique_customers}")


In [None]:
#2. Number of unique products, aisles, and departments

# All three counts come from the id columns of the product catalogue itself.
catalog_id_counts = products[["product_id", "aisle_id", "department_id"]].nunique()
unique_products = int(catalog_id_counts["product_id"])
unique_aisles = int(catalog_id_counts["aisle_id"])
unique_departments = int(catalog_id_counts["department_id"])

print(f"Unique Products: {unique_products}")
print(f"Unique Aisles: {unique_aisles}")
print(f"Unique Departments: {unique_departments}")


In [None]:
#3.Top 20 most frequently ordered products

# value_counts() sorts most-frequent first, so the leading 20 rows are the top 20.
product_frequency = order_items["product_name"].value_counts()
top_products = product_frequency.iloc[:20]
print(top_products)


In [None]:
#4. Top departments and aisles by order volume (PLOT)

def _plot_top_groups(group_col, title, palette, top_n=10):
    """Bar-plot the `top_n` values of `group_col` by number of ordered items.

    Reads the merged `order_items` frame from the notebook namespace and
    returns the plotted counts (a Series indexed by `group_col`, largest first).
    """
    # Rows of `order_items` are line items (one product within one order), so a
    # row count per group measures items ordered, not distinct orders.
    counts = (
        order_items.groupby(group_col)["order_id"]
        .count()
        .sort_values(ascending=False)
        .head(top_n)
    )
    plt.figure(figsize=(10, 5))
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue`; confirm the installed version before changing this call.
    sns.barplot(x=counts.values, y=counts.index, palette=palette)
    plt.title(title)
    # Label matches what is counted: line items, not orders.
    plt.xlabel("Number of Items Ordered")
    plt.show()
    return counts


# Departments
dept_orders = _plot_top_groups("department", "Top Departments by Order Volume", "Blues_r")

# Aisles
aisle_orders = _plot_top_groups("aisle", "Top Aisles by Order Volume", "Greens_r")


In [None]:
#5. Identify most reordered products (highest reorder rate)

# A product bought only once or twice can reach a 100% reorder rate by chance,
# and such products would otherwise fill the whole top 20. Rank only products
# with at least this many purchases. Tune to the dataset size; 1 ranks every
# product.
MIN_PRODUCT_ORDERS = 40

# Per product: number of purchases ("count") and share that were reorders ("mean").
product_stats = order_items.groupby("product_name")["reordered"].agg(["count", "mean"])
well_supported = product_stats[product_stats["count"] >= MIN_PRODUCT_ORDERS]

reorder_rate = (
    well_supported["mean"]
    .rename("reordered")  # keep the Series name the plain groupby-mean would have
    .sort_values(ascending=False)
    .head(20)
)
print(reorder_rate)


In [None]:
#6. Average basket size (number of items per order)

# Non-null product ids per order, then the mean of those per-order counts.
items_per_order = order_items.groupby("order_id")["product_id"].count()
basket_size = items_per_order.mean()
print("Average Basket Size:", basket_size)


In [None]:
#7. Order trends by hour, day of week, and days since prior order (PLOTS)

# The two count plots differ only in column, bar colour, title and x label,
# so they are drawn from a small spec table: hour of the day, then day of week.
count_plot_specs = (
    ("order_hour_of_day", "skyblue", "Orders by Hour of the Day", "Hour"),
    ("order_dow", "lightgreen", "Orders by Day of Week (0=Sunday)", "Day of Week"),
)
for column, bar_color, plot_title, x_label in count_plot_specs:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=orders, x=column, color=bar_color)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel("Order Count")
    plt.show()

# Days since prior order: missing values are dropped before the histogram.
days_between_orders = orders["days_since_prior_order"].dropna()
plt.figure(figsize=(8, 5))
sns.histplot(days_between_orders, bins=30, color="salmon", kde=False)
plt.title("Distribution of Days Since Prior Order")
plt.xlabel("Days Since Prior Order")
plt.ylabel("Count")
plt.show()

