In [2]:
# Goal

# Learn how to:
# Turn assumptions into testable statements
# Validate ideas with data
# Avoid common analytical biases
# Dataset: Retail Transactions

In [4]:
# Core Rule (MEMORIZE)

# Never trust an assumption until data confirms it.

In [6]:
# Step 1: What Is an Assumption?

# An assumption is a belief not yet proven by data.
# Examples:
# “Electronics customers spend more.”
# “Repeat buyers are more profitable.”
# “Discounts increase revenue.”

In [8]:
# Step 2: Convert Assumption → Testable Statement
# Bad
# “Young customers spend more.”

# Good
# “Average transaction value of Young Adults is higher than other age groups.”
# Now it’s measurable.

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Relative path to the retail transactions CSV.
retail_sales_csv = 'Datasets/retail_sales_dataset.csv'

# Load the transactions into the DataFrame that the analysis cells query.
customer_df = pd.read_csv(filepath_or_buffer=retail_sales_csv)

In [18]:
# Step 3: Test Assumptions with Data
# Assumption 1
# “Electronics drives higher transaction value.”

# Average 'Total Amount' per product category, so Electronics can be compared
# against every other category rather than judged on its own.
amount_by_category = customer_df.groupby('Product Category')['Total Amount']
amount_by_category.mean()

Product Category
Beauty         467.475570
Clothing       443.247863
Electronics    458.786550
Name: Total Amount, dtype: float64

In [22]:
# Age Group
# Bucket each customer's age into a labelled band. pd.cut builds right-closed
# intervals by default, so the bands are (0, 18], (18, 35], (35, 60], (60, 100].
age_bin_edges = [0, 18, 35, 60, 100]
age_band_names = ['Teen', 'Young Adult', 'Adult', 'Senior']

customer_df['age_group'] = pd.cut(customer_df['Age'], bins=age_bin_edges, labels=age_band_names)

In [24]:
# Assumption 2
# “Adults generate more revenue than other age groups.”

# Total revenue per age band. 'age_group' is a categorical column (built with
# pd.cut); grouping on a categorical without an explicit `observed=` argument
# emits a FutureWarning in recent pandas releases. Passing observed=False pins
# the current behaviour: every age band is listed, even one with no rows.
customer_df.groupby('age_group', observed=False)['Total Amount'].sum()

  customer_df.groupby('age_group')['Total Amount'].sum()


age_group
Teen            11215
Young Adult    171815
Adult          239745
Senior          33225
Name: Total Amount, dtype: int64

In [26]:
# Step 4: Understand Biases (VERY IMPORTANT)
# 1. Confirmation Bias
# Looking only for data that supports your belief
# “Electronics is best, so I only analyzed Electronics.”

In [28]:
# 2. Survivorship Bias

# Ignoring failed / missing cases
# Example:
# Looking only at repeat buyers and ignoring churned customers

In [30]:
# 3. Aggregation Bias (Simpson’s Paradox)
# The overall trend can hide — or even reverse — what happens inside subgroups
# Example:

# A notebook cell only echoes its LAST bare expression, so the overall view is
# passed to display() explicitly; otherwise it is computed and silently
# discarded, leaving nothing to compare the subgroup table against.
# display() is made available automatically by the Jupyter/IPython kernel.

# Overall view: mean transaction value per gender.
display(customer_df.groupby('Gender')['Total Amount'].mean())

# Subgroup view: the same metric split by product category within each gender.
customer_df.groupby(['Gender', 'Product Category'])['Total Amount'].mean()

# Different stories → dig deeper.

Gender  Product Category
Female  Beauty              450.783133
        Clothing            467.097701
        Electronics         451.382353
Male    Beauty              487.127660
        Clothing            419.802260
        Electronics         466.104651
Name: Total Amount, dtype: float64

In [32]:
# Step 5: Use Control Comparisons

# Always compare against:
# Another segment
# A baseline
# Previous period

# Example:
# “Repeat buyers spend more compared to one-time buyers.”