In [0]:
import pandas as pd
import numpy as np

# Assignment 1: Groupby

Can you return a table containing the top 10 stores by total transactions in the data?

Make sure they’re sorted from highest to lowest.

Thanks!

In [0]:
# Load the transactions data; "date" is parsed to datetime at read time
# because later cells use the .dt accessor on it

transactions = pd.read_csv(
    "../retail/transactions.csv",
    parse_dates=["date"],
)

transactions.head()

In [0]:
# Total transactions per store, keeping only the ten largest totals

(
    transactions
    .groupby("store_nbr")[["transactions"]]
    .agg("sum")
    .sort_values(by="transactions", ascending=False)  # largest totals first
    .head(10)  # first ten rows after the descending sort = top 10 stores
)

# Assignment 2: Groupby Multiple Columns

Can you get me the total transactions by store and month? 

Sort the table from first month to last, then by highest transactions to lowest within each month. 


In [0]:
# Derive the month number from the date column and store it as "month"

transactions["month"] = transactions.date.dt.month

transactions.head()

In [0]:
# Total transactions for every store/month pair

(
    transactions
    .groupby(["store_nbr", "month"])[["transactions"]]
    .agg("sum")
    .sort_values(
        by=["month", "transactions"],
        ascending=[True, False],  # months ascending, totals descending within a month
    )
)


# Assignment 3: Multi-Index DataFrames


Can you help me access rows and columns with multiple indices? I’ve been struggling with multi-index DataFrames.

Access:
* Grab Store 3, Month 1
* Then, select the column storing the mean of transactions

Fix:
* Drop the outer layer of the column Index
* Reset the row index so it is the default integer index

In [0]:
# Ross' grouped DataFrame code, run this first
# Sum and mean of transactions per store/month; the result has a two-level
# column index: ("transactions", "sum") and ("transactions", "mean")

grouped = (
    transactions
    .groupby(["store_nbr", "month"])[["transactions"]]
    .agg(["sum", "mean"])
    .sort_values(
        by=["month", ("transactions", "sum")],
        ascending=[True, False],  # months ascending, sums descending within a month
    )
)

In [0]:
# Preview the store/month frame: two row-index levels, two aggregation columns

grouped.head(5)

In [0]:
# Row for store 3 in month 1: the tuple addresses both row-index levels
# (integer labels), and the explicit ":" keeps every column

grouped.loc[(3, 1), :]

In [0]:
# Select the mean column by its full (outer, inner) column key;
# the list wrapper keeps the result a DataFrame

grouped[[("transactions", "mean")]]

In [0]:
# Remove the outer column level ("transactions"), leaving "sum"/"mean",
# then move store_nbr and month out of the index into ordinary columns

grouped.droplevel(level=0, axis="columns").reset_index()

# Assignment 4: The Agg Method

Calculate the mean of target met by store, and the sum of bonuses to be paid to each store.

Sort them by highest to lowest bonus payout.

Then, do the same for day of week and month.

In [0]:
# Add the target and bonus columns that the agg cells below aggregate

transactions = transactions.assign(
    # share of the 2500-transaction target reached on each row
    target_pct=lambda df: df["transactions"] / 2500,
    # True when the target was reached or exceeded
    met_target=lambda df: df["target_pct"] >= 1,
    # boolean * 100 -> 100 for rows that met the target, otherwise 0
    bonus_payable=lambda df: df["met_target"] * 100,
    month=lambda df: df["date"].dt.month,
    day_of_week=lambda df: df["date"].dt.dayofweek,
)

transactions.head()

In [0]:
# Store-level stats: mean of met_target (share of rows that met the target)
# and sum of bonus_payable, sorted with the largest bonus payout first

(
    transactions
    .groupby("store_nbr")
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .sort_values(by="bonus_payable", ascending=False)
)

In [0]:
# Month-level stats: mean of met_target (share of rows that met the target)
# and sum of bonus_payable, sorted with the largest bonus payout first

(
    transactions
    .groupby("month")
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .sort_values(by="bonus_payable", ascending=False)
)

In [0]:
# Day-of-week stats: mean of met_target (share of rows that met the target)
# and sum of bonus_payable, sorted with the largest bonus payout first

(
    transactions
    .groupby("day_of_week")
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .sort_values(by="bonus_payable", ascending=False)
)

# Assignment 4: Transform

Calculate the mean of transactions by store number and day of week while keeping the original rows (one value per row, not one row per group).

Then compare each row to the average for its store and day of week (the difference between that row's transactions and the average).

In [0]:
# Reuse the transactions table that already carries the day_of_week column
# added in the Agg Method assignment

transactions.head(5)

In [0]:
# Build both columns with assign; the result is displayed, not stored

transactions.assign(
    # transform("mean") returns one value per original row: the mean transactions
    # of that row's (store_nbr, day_of_week) group
    avg_store_transactions=lambda df: (
        df.groupby(["store_nbr", "day_of_week"])["transactions"].transform("mean")
    ),
    # gap between a row's transactions and its group average
    difference=lambda df: df["transactions"] - df["avg_store_transactions"],
)

In [0]:
# Sanity check: store 25's mean transactions per day of week, to compare
# against the avg_store_transactions values computed for that store

(
    transactions
    .loc[transactions["store_nbr"] == 25]
    .groupby("day_of_week")[["transactions"]]
    .mean()
)

# Assignment 5: Pivot

Pivot transactions with store number as index, columns day of week, with the sum of bonus payable as cells.

Filter to stores that had a non-zero bonus payable and create a heatmap.

Then unpivot (melt) the table so we have one row for each store and day of the week with the corresponding total owed. 


In [0]:
# Reuse the transactions table; only store_nbr, day_of_week and bonus_payable
# are needed below, so any extra columns are harmless

transactions.head(5)

In [0]:
# Keep only rows with a non-zero bonus, then pivot to a store x day-of-week grid

(
    transactions
    .query("bonus_payable != 0")
    .pivot_table(
        values="bonus_payable",   # cell values: bonus payable ...
        index="store_nbr",        # one row per store
        columns="day_of_week",    # one column per day of week
        aggfunc="sum",            # ... summed per store/day
    )
    .head(10)                     # optional: show just the first ten stores
    .style
    .background_gradient(cmap="RdYlGn", axis=1)  # RdYlGn gradient applied to each row (axis=1)
)

In [0]:
# Total bonus owed per store and day of week, in long (melted) form.
# Rows with a zero bonus are dropped first, so only stores that earned at
# least one bonus appear in the result.

(
    transactions
    .loc[transactions["bonus_payable"] != 0]
    .pivot_table(
        index="store_nbr",
        columns="day_of_week",
        values="bonus_payable",
        aggfunc="sum",
        fill_value=0,  # a store/day pair with no bonus rows owes 0, not NaN
    )
    .reset_index()  # store_nbr must be a regular column to serve as id_vars
    .melt(
        id_vars="store_nbr",
        var_name="day_of_week",      # explicit name for the unpivoted column labels
        value_name="bonus_payable",  # total owed for that store/day
    )
)