# Module 7 — Tidying I
Reshaping with `melt/pivot/stack/unstack` and handling missing values.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
# Keep printed frames compact: at most 20 rows, wrapped at 120 characters.
pd.options.display.max_rows = 20
pd.options.display.width = 120
pd.__version__

## 1) Starter datasets

In [None]:
# Quarterly sales in wide layout: one row per id, one column per quarter.
# NaN marks a quarter with no recorded figure.
sales_wide = pd.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'product': ['A', 'B', 'C', 'B'],
        'q1': [100, 90, 50, np.nan],
        'q2': [120, np.nan, 60, 130],
        'q3': [130, 110, 70, 140],
        'q4': [140, 120, np.nan, 150],
    }
)
sales_wide

In [None]:
# Long-format summary with one row per region x product x quarter combination;
# it feeds the pivot_table / stack / unstack examples.
rng = pd.MultiIndex.from_product(
    [['East', 'West'], ['A', 'B'], ['q1', 'q2', 'q3']],
    names=['region', 'product', 'quarter'],
)
np.random.seed(7)  # fixed seed so the generated values are reproducible
summary_long = (
    pd.DataFrame(index=rng)
      .assign(metric='sales', value=np.random.randint(50, 200, len(rng)))
      .reset_index()
)
summary_long.head()

## 2) Melt — wide → long

In [None]:
# Unpivot the quarter columns so each (id, product, quarter) becomes one row.
sales_long = sales_wide.melt(
    id_vars=['id', 'product'],
    var_name='quarter',
    value_name='sales',
)
sales_long.sort_values(by=['id', 'quarter']).head(n=10)

## 3) Pivot — long → wide

In [None]:
# Spread the quarter labels back into columns; reset_index turns the
# (id, product) row index back into ordinary columns.
sales_pivot = pd.pivot(
    sales_long,
    index=['id', 'product'],
    columns='quarter',
    values='sales',
).reset_index()
sales_pivot

## 4) pivot_table with aggregation

In [None]:
# pivot() raises on repeated (index, column) pairs; pivot_table() instead
# collapses them with aggfunc. Re-append the first four rows to create repeats.
dupes = pd.concat([sales_long, sales_long.head(4)], ignore_index=True)
pt = dupes.pivot_table(
    index=['id', 'product'],
    columns='quarter',
    values='sales',
    aggfunc='mean',
)
pt

## 5) Stack and Unstack

In [None]:
# Wide table whose columns are a (quarter, metric) MultiIndex; this is the
# input for the stack demonstration.
pt_multi = summary_long.pivot_table(
    values='value',
    index=['region', 'product'],
    columns=['quarter', 'metric'],
    aggfunc='sum',
)
pt_multi

In [None]:
# Move both column levels into the row index (giving a Series), then turn
# every index level into a column and label the values 'value'.
tidy_from_multi = (
    pt_multi.stack(['quarter', 'metric'], future_stack=True)
            .reset_index(name='value')
)
tidy_from_multi.head()

In [None]:
# Unstack: pivot the 'quarter' index level out into columns (long -> wide).
long_idx = tidy_from_multi.set_index(['region', 'product', 'quarter', 'metric'])
wide_again = long_idx.loc[:, 'value'].unstack(level='quarter')
wide_again.head()

In [None]:
# Utility: flatten MultiIndex columns if needed
def flatten_columns(df: pd.DataFrame, sep: str = '_') -> pd.DataFrame:
    """Collapse MultiIndex column labels into single-level string labels.

    Each column tuple is converted element-wise to ``str`` and joined with
    ``sep``; leading and trailing ``sep`` characters are then stripped, so a
    tuple with an empty outer level such as ``('total', '')`` becomes
    ``'total'``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns may be a ``pd.MultiIndex``.
    sep : str, default '_'
        Text placed between the level values of each column.

    Returns
    -------
    pd.DataFrame
        A copy with flattened column labels when ``df`` has MultiIndex
        columns; otherwise ``df`` itself, unchanged and not copied.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df
    flat = df.copy()  # never rename the caller's frame in place
    flat.columns = [sep.join(map(str, levels)).strip(sep) for levels in flat.columns]
    return flat

In [None]:
# pt_multi has (quarter, metric) column levels, so the flattening is visible
# here (e.g. 'q1_sales'). wide_again has single-level columns and would be
# returned unchanged.
flatten_columns(pt_multi).head()

## 6) Missing values — detection and strategies

In [None]:
# Detect
# info() prints its report and returns None, so call it on its own line
# rather than inside a tuple (which would display a stray None).
sales_long.info()
# Per-column count of missing values, shown as the cell output.
sales_long.isna().sum()

In [None]:
# Strategy 1: replace missing sales with 0, leaving the other columns untouched
filled0 = sales_long.fillna({'sales': 0})
filled0.head(8)

In [None]:
# Strategy 2: fill each missing sale with the mean of its own product.
# transform('mean') broadcasts the per-product mean back onto every row, so
# fillna can align it with the original column.
imputed_by_product = sales_long.assign(
    sales=lambda d: d['sales'].fillna(
        d.groupby('product')['sales'].transform('mean')
    )
)
imputed_by_product.sort_values(by=['product', 'id', 'quarter']).head(12)

In [None]:
# Strategy 3: forward/backward fill within each id across ordered quarters
# Map quarter labels to their position so rows can be sorted chronologically.
order = {'q1':1,'q2':2,'q3':3,'q4':4}
ffill_bfill = (
    sales_long.assign(q_order=lambda d: d['quarter'].map(order))
              .sort_values(['id','q_order'])
              # transform rewrites only `sales` and keeps every other column.
              # groupby.apply(..., include_groups=False) would return frames
              # without the grouping column, silently dropping `id`.
              .assign(sales=lambda d: d.groupby('id')['sales']
                                       .transform(lambda s: s.ffill().bfill()))
              .drop(columns='q_order')
)
ffill_bfill.head(12)

In [None]:
# Strategy 4: linear interpolation over a weekly series.
# The default method is positional-linear, so interior gaps are filled evenly
# between neighbours and the trailing gap repeats the last observed value.
date_rng = pd.date_range(start='2024-01-01', periods=8, freq='W')
ts = pd.DataFrame(
    {
        'date': date_rng,
        'value': [1.0, np.nan, 2.5, np.nan, np.nan, 5.0, 6.0, np.nan],
    }
)
ts_interp = ts.assign(value=ts['value'].interpolate())
ts, ts_interp

## 7) Integrity checks and assertions

In [None]:
# Integrity rules: every row carries an id, and quarter labels come only
# from the known set.
assert not sales_long['id'].isna().any(), 'IDs should not be missing'
assert set(sales_long['quarter'].dropna().unique()).issubset({'q1', 'q2', 'q3', 'q4'})
sales_long.shape

## 8) In-class exercises
Each task should be solved in the cell provided. Keep solutions in method-chaining style when possible.

### Exercise 1 — Melt and pivot back

In [None]:
# TODO: 1) From sales_wide, melt to long; 2) impute missing by product median; 3) pivot back to q1..q4 wide


### Exercise 2 — Stack/unstack with MultiIndex

In [None]:
# TODO: Convert pt_multi to tidy long with stack on both levels then compute mean by region-product-quarter


### Exercise 3 — Missing-data report

In [None]:
# TODO: Create a function missing_report(df) that returns a DataFrame with columns: col, n_missing, pct_missing
