# Aggregating DataFrames

You'll be working with sales data from Walmart, a chain of department stores in the US. 

The dataset contains weekly sales in US dollars in various stores. Each store has an ID number and a specific store type. The sales are also separated by department ID. 

Along with weekly sales, there is information about whether it was a holiday week or not, the average temperature during the week in that location, the average fuel price in dollars per liter that week, and the national unemployment rate that week.

In [None]:
import numpy as np
import pandas as pd
# Load the sales data; index_col=0 makes the first CSV column the row index.
# NOTE(review): the directory name ends in "pandasw" - confirm this is the real
# folder name and not a typo before moving the notebook.
sales = pd.read_csv(
    filepath_or_buffer='/work/data_science_notes/3. Data manipulation with pandasw/data/sales_subset.csv',
    index_col=0,
)

## Summary statistics

### Mean and median

In [None]:
# Preview the first five rows of sales to get a feel for the columns
sales.head(5)

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [None]:
# Print a summary of sales: column names, non-null counts, dtypes and memory usage
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   store                 10774 non-null  int64  
 1   type                  10774 non-null  object 
 2   department            10774 non-null  int64  
 3   date                  10774 non-null  object 
 4   weekly_sales          10774 non-null  float64
 5   is_holiday            10774 non-null  bool   
 6   temperature_c         10774 non-null  float64
 7   fuel_price_usd_per_l  10774 non-null  float64
 8   unemployment          10774 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 768.1+ KB


In [None]:
# Average of weekly_sales across every row of sales
sales.weekly_sales.mean()

23843.95014850566

In [None]:
# Middle value (median) of weekly_sales across every row of sales
sales.weekly_sales.median()

12049.064999999999

### Summarizing dates

In [None]:
# The date column is read as strings, so convert it to datetimes first,
# then show the most recent date in the data
sales['date'] = pd.to_datetime(sales.date)
sales['date'].max()

Timestamp('2012-10-26 00:00:00')

In [None]:
# Earliest date in the data
sales['date'].min()

Timestamp('2010-02-05 00:00:00')

### Efficient summaries

In [None]:
# Use the custom iqr function to print the IQR of the temperature_c column of sales

def iqr(column):
    """Return the inter-quartile range of ``column``.

    The result is the 75th percentile minus the 25th percentile, both
    obtained from the object's own ``.quantile()`` method.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# Aggregate the temperature column with the custom iqr function
sales.temperature_c.agg(iqr)

16.583333333333336

In [None]:
# IQR of temperature_c, fuel_price_usd_per_l and unemployment (in that order),
# computed by passing the custom iqr function to .agg()
sales.loc[:, ['temperature_c', 'fuel_price_usd_per_l', 'unemployment']].agg(iqr)

temperature_c           16.583333
fuel_price_usd_per_l     0.073176
unemployment             0.565000
dtype: float64

In [None]:
# Apply two aggregations at once - the custom iqr first, then NumPy's median -
# to the same three columns; each function becomes one row of the result
import numpy as np

sales.loc[:, ['temperature_c', 'fuel_price_usd_per_l', 'unemployment']].agg([iqr, np.median])

Unnamed: 0,temperature_c,fuel_price_usd_per_l,unemployment
iqr,16.583333,0.073176,0.565
median,16.966667,0.743381,8.099


### Cumulative statistics

In [None]:
# Keep only the rows for department 1 of store 1
sales_1_1 = sales.loc[sales['store'].eq(1) & sales['department'].eq(1)]

In [None]:
# Order sales_1_1 chronologically (oldest date first)
sales_1_1 = sales_1_1.sort_values(by='date', ascending=True)

In [None]:
# Running total of weekly_sales, stored as the new column cum_weekly_sales
sales_1_1['cum_weekly_sales'] = sales_1_1.weekly_sales.cumsum()

In [None]:
# Running maximum of weekly_sales, stored as the new column cum_max_sales
sales_1_1['cum_max_sales'] = sales_1_1.weekly_sales.cummax()

In [None]:
# Show the raw weekly sales next to the two cumulative columns
sales_1_1.loc[:, ['date', 'weekly_sales', 'cum_weekly_sales', 'cum_max_sales']]

Unnamed: 0,date,weekly_sales,cum_weekly_sales,cum_max_sales
0,2010-02-05,24924.5,24924.5,24924.5
1,2010-03-05,21827.9,46752.4,24924.5
2,2010-04-02,57258.43,104010.83,57258.43
3,2010-05-07,17413.94,121424.77,57258.43
4,2010-06-04,17558.09,138982.86,57258.43
5,2010-07-02,16333.14,155316.0,57258.43
6,2010-08-06,17508.41,172824.41,57258.43
7,2010-09-03,16241.78,189066.19,57258.43
8,2010-10-01,20094.19,209160.38,57258.43
9,2010-11-05,34238.88,243399.26,57258.43


## Counting

### Dropping duplicates

In [None]:
# One row per unique (store, type) pair: the first occurrence of each pair is kept.
# Save as store_types and preview it.
store_types = sales.drop_duplicates(subset=['store', 'type'])
store_types.head(5)

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
901,2,A,1,2010-02-05,35034.06,False,4.55,0.679451,8.324
1798,4,A,1,2010-02-05,38724.42,False,6.533333,0.686319,8.623
2699,6,A,1,2010-02-05,25619.0,False,4.683333,0.679451,7.259
3593,10,B,1,2010-02-05,40212.84,False,12.411111,0.782478,9.765


In [None]:
# One row per unique (store, department) pair: the first occurrence of each pair is kept.
# Save as store_depts and preview it.
store_depts = sales.drop_duplicates(subset=['store', 'department'])
store_depts.head(5)

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
12,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
24,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
36,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
48,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106


In [None]:
# Filter to holiday weeks with the boolean is_holiday column, then keep a single
# row per distinct date; save as holiday_dates
holiday_dates = sales.loc[sales['is_holiday']].drop_duplicates(subset='date')

In [None]:
# Display just the dates of the holiday weeks
holiday_dates.loc[:, 'date']

498    2010-09-10
691    2011-11-25
2315   2010-02-12
6735   2012-09-07
6810   2010-12-31
6815   2012-02-10
6820   2011-09-09
Name: date, dtype: datetime64[ns]

### Counting categorical variables

In [None]:
# How many stores there are of each type (store_types has one row per store)
store_counts = store_types.loc[:, 'type'].value_counts()
store_counts

A    11
B     1
Name: type, dtype: int64

In [None]:
# Share of stores belonging to each type; normalize=True turns counts into fractions
store_props = store_types.loc[:, 'type'].value_counts(normalize=True)
store_props

A    0.916667
B    0.083333
Name: type, dtype: float64

In [None]:
# Number of stores carrying each department, largest count first
dept_counts_sorted = store_depts.loc[:, 'department'].value_counts(sort=True, ascending=False)
dept_counts_sorted

1     12
55    12
72    12
71    12
67    12
      ..
37    10
48     8
50     6
39     4
43     2
Name: department, Length: 80, dtype: int64

In [None]:
# Proportion of (store, department) rows belonging to each department, largest first.
# Stored under its own name so the department *counts* computed earlier are not
# overwritten by proportions.
dept_props_sorted = store_depts['department'].value_counts(normalize=True, sort=True)
dept_props_sorted

1     0.012917
55    0.012917
72    0.012917
71    0.012917
67    0.012917
        ...   
37    0.010764
48    0.008611
50    0.006459
39    0.004306
43    0.002153
Name: department, Length: 80, dtype: float64

## Grouped summary statistics

### What percent of sales occurred at each store type?

In [None]:
# Grand total of weekly_sales over every row in the dataset
sales_all = sales.weekly_sales.sum()
sales_all

256894718.89999998

In [None]:
# Total weekly sales for the rows whose store type is "A"
sales_A = sales.loc[sales['type'] == 'A', 'weekly_sales'].sum()
sales_A

233716315.01

In [None]:
# Total weekly sales for the rows whose store type is "B"
sales_B = sales.loc[sales['type'] == 'B', 'weekly_sales'].sum()
sales_B

23178403.89

In [None]:
# Total weekly sales for the rows whose store type is "C"
sales_C = sales.loc[sales['type'] == 'C', 'weekly_sales'].sum()
sales_C

0.0

In [None]:
# Proportion of total sales contributed by each store type (A, B, C in that order).
# The totals are wrapped in an explicit NumPy array: dividing a plain Python list
# by a scalar only works when the scalar happens to be a NumPy type, and raises
# TypeError for an ordinary float.
sales_propn_by_type = np.array([sales_A, sales_B, sales_C]) / sales_all
sales_propn_by_type

array([0.9097747, 0.0902253, 0.       ])

### Calculations with .groupby()

In [None]:
# Total weekly_sales for each store type, computed in one step with .groupby()
sales_by_type = sales.groupby('type')['weekly_sales'].sum()
sales_by_type

type
A    2.337163e+08
B    2.317840e+07
Name: weekly_sales, dtype: float64

In [None]:
# Convert the per-type totals into shares of the overall total
sales_propn_by_type = sales_by_type.div(sales_by_type.sum())
sales_propn_by_type

type
A    0.909775
B    0.090225
Name: weekly_sales, dtype: float64

In [None]:
# Total weekly_sales for every combination of store type and holiday flag
sales_by_type_is_holiday = sales.groupby(['type', 'is_holiday'])['weekly_sales'].sum()
sales_by_type_is_holiday

type  is_holiday
A     False         2.336927e+08
      True          2.360181e+04
B     False         2.317678e+07
      True          1.621410e+03
Name: weekly_sales, dtype: float64

### Multiple grouped summaries

In [None]:
# With numpy available as np, summarize weekly_sales per store type using
# four statistics at once (min, max, mean, median) via .groupby() and .agg().
# The resulting table is kept as sales_stats.

import numpy as np
sales_stats = sales.groupby('type')['weekly_sales'].agg([np.min, np.max, np.mean, np.median])
sales_stats

Unnamed: 0_level_0,amin,amax,mean,median
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-1098.0,293966.05,23674.667242,11943.92
B,-798.0,232558.51,25696.67837,13336.08


In [None]:
# Same four statistics (min, max, mean, median), this time for two columns -
# unemployment and fuel_price_usd_per_l - per store type.
# The resulting table is kept as unemp_fuel_stats.
unemp_fuel_stats = (
    sales
    .groupby('type')[['unemployment', 'fuel_price_usd_per_l']]
    .agg([np.min, np.max, np.mean, np.median])
)
unemp_fuel_stats

Unnamed: 0_level_0,unemployment,unemployment,unemployment,unemployment,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l
Unnamed: 0_level_1,amin,amax,mean,median,amin,amax,mean,median
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,3.879,8.992,7.972611,8.067,0.664129,1.10741,0.744619,0.735455
B,7.17,9.765,9.279323,9.199,0.760023,1.107674,0.805858,0.803348


## Pivot tables

### Pivoting on one variable

In [None]:
# Mean weekly_sales for each store type, stored as mean_sales_by_type.
# .pivot_table() aggregates with the mean when no aggfunc is given; it is
# spelled out here so the intent is visible in the code.
mean_sales_by_type = sales.pivot_table(values='weekly_sales', index='type', aggfunc='mean')
mean_sales_by_type

Unnamed: 0_level_0,weekly_sales
type,Unnamed: 1_level_1
A,23674.667242
B,25696.67837


In [None]:
# Mean and median (NumPy functions) of weekly_sales per store type in a single
# pivot table, stored as mean_med_sales_by_type
mean_med_sales_by_type = sales.pivot_table(
    index='type',
    values='weekly_sales',
    aggfunc=[np.mean, np.median],
)
mean_med_sales_by_type

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,weekly_sales,weekly_sales
type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,23674.667242,11943.92
B,25696.67837,13336.08


In [None]:
# Mean weekly_sales with store type on the rows and the holiday flag on the
# columns, stored as mean_sales_by_type_holiday
mean_sales_by_type_holiday = sales.pivot_table(
    index='type',
    columns='is_holiday',
    values='weekly_sales',
    aggfunc=[np.mean],
)
mean_sales_by_type_holiday

Unnamed: 0_level_0,mean,mean
is_holiday,False,True
type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,23768.583523,590.04525
B,25751.980533,810.705


### Fill in missing values and sum values with pivot tables

In [None]:
# Mean weekly_sales per department (rows) and store type (columns);
# combinations with no data are shown as 0 instead of NaN
sales.pivot_table(index='department', columns='type', values='weekly_sales', fill_value=0)

type,A,B
department,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30961.725379,44050.626667
2,67600.158788,112958.526667
3,17160.002955,30580.655000
4,44285.399091,51219.654167
5,34821.011364,63236.875000
...,...,...
95,123933.787121,77082.102500
96,21367.042857,9528.538333
97,28471.266970,5828.873333
98,12875.423182,217.428333


In [None]:
# Mean weekly_sales per department and store type with missing combinations
# filled with 0. margins=True appends an "All" row and column; these hold the
# overall mean for that row/column (not a sum), as the output shows.
sales.pivot_table(
    index='department',
    columns='type',
    values='weekly_sales',
    fill_value=0,
    margins=True,
)

type,A,B,All
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,30961.725379,44050.626667,32052.467153
2,67600.158788,112958.526667,71380.022778
3,17160.002955,30580.655000,18278.390625
4,44285.399091,51219.654167,44863.253681
5,34821.011364,63236.875000,37189.000000
...,...,...,...
96,21367.042857,9528.538333,20337.607681
97,28471.266970,5828.873333,26584.400833
98,12875.423182,217.428333,11820.590278
99,379.123659,0.000000,379.123659


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ccf93317-bea1-4146-b94a-20aafb784b0b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>