# Module 7 — Tidying I
Reshaping with `melt/pivot/stack/unstack` and handling missing values.


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 120)
pd.__version__

'2.2.2'

## 1) Starter datasets

In [2]:
# Wide quarterly sales dataset with missing values
sales_wide = pd.DataFrame({
    'id':[1,2,3,4],
    'product':['A','B','C','B'],
    'q1':[100,  90,  50, np.nan],
    'q2':[120, np.nan, 60, 130],
    'q3':[130, 110,  70, 140],
    'q4':[140, 120, np.nan, 150],
})
sales_wide

Unnamed: 0,id,product,q1,q2,q3,q4
0,1,A,100.0,120.0,130,140.0
1,2,B,90.0,,110,120.0
2,3,C,50.0,60.0,70,
3,4,B,,130.0,140,150.0


In [3]:
# MultiIndex-style summary from a pivot_table to use with stack/unstack
rng = pd.MultiIndex.from_product([
    ['East','West'],
    ['A','B'],
    ['q1','q2','q3']
], names=['region','product','quarter'])
np.random.seed(7)
summary_long = pd.DataFrame({
    'metric':'sales',
    'value': np.random.randint(50, 200, len(rng))
}, index=rng).reset_index()
summary_long.head()

Unnamed: 0,region,product,quarter,metric,value
0,East,A,q1,sales,75
1,East,A,q2,sales,117
2,East,A,q3,sales,153
3,East,B,q1,sales,142
4,East,B,q2,sales,192


## 2) Melt — wide → long

In [4]:
sales_wide

Unnamed: 0,id,product,q1,q2,q3,q4
0,1,A,100.0,120.0,130,140.0
1,2,B,90.0,,110,120.0
2,3,C,50.0,60.0,70,
3,4,B,,130.0,140,150.0


In [5]:
sales_long = pd.melt(
    sales_wide,
    id_vars=['id','product'],
    var_name='quarter',
    value_name='sales'
)
sales_long.sort_values(['id','quarter']).head(10)

Unnamed: 0,id,product,quarter,sales
0,1,A,q1,100.0
4,1,A,q2,120.0
8,1,A,q3,130.0
12,1,A,q4,140.0
1,2,B,q1,90.0
5,2,B,q2,
9,2,B,q3,110.0
13,2,B,q4,120.0
2,3,C,q1,50.0
6,3,C,q2,60.0


## 3) Pivot — long → wide

In [6]:
sales_pivot = sales_long.pivot(
    index=['id','product'], 
    columns='quarter', 
    values='sales').reset_index()
sales_pivot

quarter,id,product,q1,q2,q3,q4
0,1,A,100.0,120.0,130.0,140.0
1,2,B,90.0,,110.0,120.0
2,3,C,50.0,60.0,70.0,
3,4,B,,130.0,140.0,150.0


## 4) pivot_table with aggregation

In [7]:
# If duplicates exist, pivot_table handles them via aggfunc
dupes = pd.concat([sales_long, sales_long.iloc[:4]], ignore_index=True)
dupes

Unnamed: 0,id,product,quarter,sales
0,1,A,q1,100.0
1,2,B,q1,90.0
2,3,C,q1,50.0
3,4,B,q1,
4,1,A,q2,120.0
5,2,B,q2,
6,3,C,q2,60.0
7,4,B,q2,130.0
8,1,A,q3,130.0
9,2,B,q3,110.0


In [8]:
pt = pd.pivot_table(dupes, index=['id','product'], columns='quarter', values='sales', aggfunc='sum')
pt

Unnamed: 0_level_0,quarter,q1,q2,q3,q4
id,product,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,A,200.0,120.0,130.0,140.0
2,B,180.0,0.0,110.0,120.0
3,C,100.0,60.0,70.0,0.0
4,B,0.0,130.0,140.0,150.0


## 5) Stack and Unstack

In [9]:
summary_long

Unnamed: 0,region,product,quarter,metric,value
0,East,A,q1,sales,75
1,East,A,q2,sales,117
2,East,A,q3,sales,153
3,East,B,q1,sales,142
4,East,B,q2,sales,192
5,East,B,q3,sales,73
6,West,A,q1,sales,122
7,West,A,q2,sales,139
8,West,A,q3,sales,160
9,West,B,q1,sales,92


In [10]:
# Build a wide table with MultiIndex columns to demonstrate stack
pt_multi = pd.pivot_table(
    summary_long,
    index=['region','product'],
    columns=['quarter','metric'],
    values='value',
    aggfunc='sum'
)
pt_multi

Unnamed: 0_level_0,quarter,q1,q2,q3
Unnamed: 0_level_1,metric,sales,sales,sales
region,product,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
East,A,75,117,153
East,B,142,192,73
West,A,122,139,160
West,B,92,186,118


In [11]:
# Stack selected column levels into rows → long tidy table
tidy_from_multi = (
    pt_multi.stack(['quarter','metric'], future_stack=True)
           .rename('value')
           .reset_index()
)
tidy_from_multi

Unnamed: 0,region,product,quarter,metric,value
0,East,A,q1,sales,75
1,East,A,q2,sales,117
2,East,A,q3,sales,153
3,East,B,q1,sales,142
4,East,B,q2,sales,192
5,East,B,q3,sales,73
6,West,A,q1,sales,122
7,West,A,q2,sales,139
8,West,A,q3,sales,160
9,West,B,q1,sales,92


In [12]:
# Unstack: long → wide by chosen index level
long_idx = tidy_from_multi.set_index(['region','product','quarter','metric'])
wide_again = long_idx['value'].unstack(['quarter'])
wide_again.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter,q1,q2,q3
region,product,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East,A,sales,75,117,153
East,B,sales,142,192,73
West,A,sales,122,139,160
West,B,sales,92,186,118


In [13]:
# Utility: flatten MultiIndex columns if needed
def flatten_columns(df):
    if isinstance(df.columns, pd.MultiIndex):
        df = df.copy()
        df.columns = ['_'.join(map(str, c)).strip('_') for c in df.columns]
    return df

In [14]:
flatten_columns(wide_again).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter,q1,q2,q3
region,product,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East,A,sales,75,117,153
East,B,sales,142,192,73
West,A,sales,122,139,160
West,B,sales,92,186,118


## 6) Missing values — detection and strategies

In [15]:
# Detect
sales_long.isna().sum()

id         0
product    0
quarter    0
sales      3
dtype: int64

In [16]:
sales_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       16 non-null     int64  
 1   product  16 non-null     object 
 2   quarter  16 non-null     object 
 3   sales    13 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 644.0+ bytes


In [17]:
# Strategy 1: simple fill with 0
filled0 = sales_long.assign(sales=lambda d: d['sales'].fillna(0))
filled0.head(8)

Unnamed: 0,id,product,quarter,sales
0,1,A,q1,100.0
1,2,B,q1,90.0
2,3,C,q1,50.0
3,4,B,q1,0.0
4,1,A,q2,120.0
5,2,B,q2,0.0
6,3,C,q2,60.0
7,4,B,q2,130.0


In [18]:
# Strategy 2: impute per-product mean using groupby.transform
imputed_by_product = sales_long.copy()
imputed_by_product['sales'] = (
    imputed_by_product.groupby('product')['sales']
                      .transform(lambda s: s.fillna(s.mean()))
)
imputed_by_product.sort_values(['product','id','quarter']).head(12)

Unnamed: 0,id,product,quarter,sales
0,1,A,q1,100.0
4,1,A,q2,120.0
8,1,A,q3,130.0
12,1,A,q4,140.0
1,2,B,q1,90.0
5,2,B,q2,123.333333
9,2,B,q3,110.0
13,2,B,q4,120.0
3,4,B,q1,123.333333
7,4,B,q2,130.0


In [19]:
# Strategy 3: forward/backward fill within each id across ordered quarters
order = {'q1':1,'q2':2,'q3':3,'q4':4}
ffill_bfill = (
    sales_long.assign(q_order=lambda d: d['quarter'].map(order))
              .sort_values(['id','q_order'])
              .groupby('id', as_index=False, group_keys=False)
              .apply(lambda g: g.assign(sales=g['sales'].ffill().bfill()), include_groups=False)
              .drop(columns='q_order')
)
ffill_bfill.head(12)

Unnamed: 0,product,quarter,sales
0,A,q1,100.0
4,A,q2,120.0
8,A,q3,130.0
12,A,q4,140.0
1,B,q1,90.0
5,B,q2,90.0
9,B,q3,110.0
13,B,q4,120.0
2,C,q1,50.0
6,C,q2,60.0


In [20]:
date_rng = pd.date_range('2024-01-01', periods=8, freq='W')
date_rng

DatetimeIndex(['2024-01-07', '2024-01-14', '2024-01-21', '2024-01-28', '2024-02-04', '2024-02-11', '2024-02-18',
               '2024-02-25'],
              dtype='datetime64[ns]', freq='W-SUN')

In [21]:
ts = pd.DataFrame({'date': date_rng, 'value':[1.0, np.nan, 2.5, np.nan, np.nan, 5.0, 6.0, np.nan]})
ts

Unnamed: 0,date,value
0,2024-01-07,1.0
1,2024-01-14,
2,2024-01-21,2.5
3,2024-01-28,
4,2024-02-04,
5,2024-02-11,5.0
6,2024-02-18,6.0
7,2024-02-25,


In [22]:
# Strategy 4: time-series interpolate example
ts_interp = ts.set_index('date').interpolate().reset_index()
ts_interp

Unnamed: 0,date,value
0,2024-01-07,1.0
1,2024-01-14,1.75
2,2024-01-21,2.5
3,2024-01-28,3.333333
4,2024-02-04,4.166667
5,2024-02-11,5.0
6,2024-02-18,6.0
7,2024-02-25,6.0


## 7) Integrity checks and assertions

In [23]:
# Example integrity rules
assert sales_long['id'].notna().all(), 'IDs should not be missing'
assert set(sales_long['quarter'].dropna().unique()) <= {'q1','q2','q3','q4'}
sales_long.shape

(16, 4)

## 8) In-class exercises
Each task should be solved in the cell provided. Keep solutions in method-chaining style when possible.

### Exercise 1 — Melt and pivot back

In [24]:
# TODO: 1) From sales_wide, melt to long; 2) impute missing by product median; 3) pivot back to q1..q4 wide
ex1_long = pd.melt(sales_wide, id_vars=['id','product'], var_name='quarter', value_name='sales')
ex1_long['sales'] = ex1_long.groupby('product')['sales'].transform(lambda s: s.fillna(s.median()))
ex1_wide = ex1_long.pivot(index=['id','product'], columns='quarter', values='sales').reset_index()
ex1_wide

quarter,id,product,q1,q2,q3,q4
0,1,A,100.0,120.0,130.0,140.0
1,2,B,90.0,125.0,110.0,120.0
2,3,C,50.0,60.0,70.0,60.0
3,4,B,125.0,130.0,140.0,150.0


### Exercise 2 — Stack/unstack with MultiIndex

In [25]:
# TODO: Convert pt_multi to tidy long with stack on both levels then compute mean by region-product-quarter
ex2_long = (pt_multi
            .stack(['quarter','metric'], future_stack=True)
            .rename('value')
            .reset_index())
ex2_summary = ex2_long.groupby(['region','product','quarter'], as_index=False)['value'].mean()
ex2_summary.head()

Unnamed: 0,region,product,quarter,value
0,East,A,q1,75.0
1,East,A,q2,117.0
2,East,A,q3,153.0
3,East,B,q1,142.0
4,East,B,q2,192.0


### Exercise 3 — Missing-data report

In [26]:
# TODO: Create a function missing_report(df) that returns a DataFrame with columns: col, n_missing, pct_missing
def missing_report(df):
    n = len(df)
    return (
        df.isna().sum()
          .rename('n_missing')
          .to_frame()
          .assign(pct_missing=lambda d: d['n_missing'] / n)
          .reset_index()
          .rename(columns={'index':'col'})
          .sort_values('n_missing', ascending=False)
    )

missing_report(sales_long)

Unnamed: 0,col,n_missing,pct_missing
3,sales,3,0.1875
0,id,0,0.0
1,product,0,0.0
2,quarter,0,0.0
