# Tidying II: Deduplication, Type Consistency, & Categorical Integrity

*Hands-on notebook with demos and exercises*

## 0. Setup

In [None]:
# Libraries used throughout the notebook
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
# Show the installed pandas version as this cell's output
pd.__version__

## 1. Create Synthetic Datasets
We will create small but realistic datasets with deliberate issues: duplicates, mixed types, messy categories.

In [None]:
# Customers: one tuple per record, with deliberate defects -- repeated ids,
# stray whitespace, inconsistent letter case, and two different date separators
customer_rows = [
    (101, 'a@x.com', 'tn', '2025-09-01'),
    (101, 'A@x.com ', 'TN ', '2025/09/01'),
    (102, 'b@x.com', 'GA', '2025-09-05'),
    (103, 'c@x.com', 'ga', '2025-09-07'),
    (103, 'c@x.com', 'GA', '2025-09-07'),
    (104, 'd@x.com', 'AL', '2025-09-10'),
    (105, ' e@x.com', 'Tn', '2025-09-12'),
]
customers = pd.DataFrame(customer_rows, columns=['id', 'email', 'state', 'created_at'])
customers

In [None]:
# Transactions: one tuple per record. Some rows repeat the (id, ts) key, prices are
# strings (one carries a currency symbol, one is the word 'free'), and id 106 has
# no matching customer.
transaction_rows = [
    (101, '2025-09-02 10:00', '$10.00', 'paid'),
    (101, '2025-09-02 10:00', '10', 'paid'),
    (101, '2025-09-03 09:00', '9.5', 'paid'),
    (102, '2025-09-06 16:30', '7.25', 'paid'),
    (103, '2025-09-08 08:00', '0', 'refunded'),
    (103, '2025-09-08 08:00', 'free', 'refunded'),
    (106, '2025-09-12 12:00', '12.5', 'paid'),
]
transactions = pd.DataFrame(transaction_rows, columns=['id', 'ts', 'price', 'status'])
transactions

## 2. Deduplication: Detecting Exact vs Partial Duplicates

In [None]:
# Exact duplicates: rows that match another row in every column.
# keep=False flags every member of a duplicate group, not just the repeats.
transactions_exact_dups_mask = transactions.duplicated(keep=False)
transactions.loc[transactions_exact_dups_mask]

In [None]:
# Partial duplicates: rows that share the business key (id, ts) even if other columns differ
subset_cols = ['id', 'ts']
dups_subset_mask = transactions.duplicated(subset=subset_cols, keep=False)
partial_dups = transactions.loc[dups_subset_mask]
partial_dups.sort_values(by=subset_cols)

**Note:** `keep=False` marks all occurrences of a duplicate as `True`. Use it to inspect every row involved.

## 3. Deduplication: Resolution Strategies

In [None]:
# 3.1 Resolve duplicates on (id, ts) by retaining the first row encountered for each key
first_wins_key = ['id', 'ts']
tx_keep_first = transactions.drop_duplicates(subset=first_wins_key, keep='first')
tx_keep_first

In [None]:
# 3.2 Resolve duplicates on (id, ts) by retaining the final row encountered for each key
last_wins_key = ['id', 'ts']
tx_keep_last = transactions.drop_duplicates(subset=last_wins_key, keep='last')
tx_keep_last

In [None]:
# 3.3 Aggregate duplicates by key: total the numeric price per (id, ts) and keep the last status.
# `ts` is a grouping key here, so it passes through unchanged rather than being aggregated.
tx_tmp = transactions.copy()
tx_tmp['price_num'] = pd.to_numeric(
    tx_tmp['price'].str.replace('$', '', regex=False),
    errors='coerce',  # unparseable prices such as 'free' become NaN
)
tx_agg = (
    tx_tmp.groupby(['id','ts'], as_index=False)
          .agg(
              # min_count=1: a group in which no price could be parsed stays NaN
              # instead of being reported as a misleading total of 0
              price_total=('price_num', lambda s: s.sum(min_count=1)),
              status_last=('status', 'last'),
          )
)
tx_agg

**Guideline:** Document which rule is applied and why. Aggregation is appropriate when each row is a component of a single logical event.

## 4. Practical Deduplication Recipe with Assertions

In [None]:
# Order rows chronologically, then keep the last row seen for each (id, ts) key.
# Rows that share a key also share `ts`, so the sort cannot rank them against each other;
# a stable sort keeps such ties in their original order, which makes keep='last' deterministic.
tx_sorted = transactions.sort_values('ts', kind='stable')
tx_unique = tx_sorted.drop_duplicates(subset=['id','ts'], keep='last')

# Verify no duplicates remain on the key
assert not tx_unique.duplicated(subset=['id','ts']).any(), "duplicate (id, ts) keys remain after dedup"
tx_unique

## 5. Type Conversion: Inspecting and Enforcing Types

In [None]:
# Inspect: print column dtypes and non-null counts for both frames
for frame in (customers, transactions):
    frame.info()

In [None]:
# Convert to datetime.
# `created_at` mixes 'YYYY-MM-DD' and 'YYYY/MM/DD'. Recent pandas releases infer a single
# format from the first value and, with errors='coerce', silently turn non-matching rows
# into NaT. Normalizing the separator and stating the format keeps every valid date.
created_raw = customers['created_at']
if not pd.api.types.is_datetime64_any_dtype(created_raw):  # already converted on a re-run
    created_raw = created_raw.str.replace('/', '-', regex=False)
customers['created_at'] = pd.to_datetime(created_raw, format='%Y-%m-%d', errors='coerce')
transactions['ts'] = pd.to_datetime(transactions['ts'], errors='coerce')

customers.dtypes

In [None]:
# Convert price to numeric: drop the currency symbol first, then parse.
# Values that are not numbers (e.g. 'free') are coerced to NaN.
price_text = transactions['price'].str.replace('$', '', regex=False)
transactions['price_num'] = pd.to_numeric(price_text, errors='coerce')

transactions.dtypes

In [None]:
# Alternatives to 'coerce' for error handling
mixed_values = pd.Series(['1','two','3'])

# errors='raise' stops at the first value that cannot be parsed
try:
    pd.to_numeric(mixed_values, errors='raise')
except (ValueError, TypeError) as e:
    print("errors='raise' example ->", repr(e))

# errors='ignore' is deprecated in pandas and removed in newer releases.
# The same "leave data unchanged if invalid" behaviour is written explicitly:
# attempt the conversion and fall back to the untouched input on failure.
try:
    ignored_result = pd.to_numeric(mixed_values, errors='raise')
except (ValueError, TypeError):
    ignored_result = mixed_values
print(ignored_result)

## 6. `convert_dtypes()` and Explicit `astype()`

In [None]:
# Let pandas infer the best nullable extension dtype for every customers column
auto_customers = customers.convert_dtypes()
auto_customers.dtypes

In [None]:
# Same dtype inference, applied to the transactions frame
auto_transactions = transactions.convert_dtypes()
auto_transactions.dtypes

In [None]:
# Explicit casting through a column -> dtype mapping
# (add more entries to the mapping to cast several columns in one call)
target_dtypes = {'price_num': 'float64'}
casted = transactions.astype(target_dtypes)
casted.dtypes

## 7. Validating Type Integrity and Logical Constraints

In [None]:
import pandas.api.types as ptypes

# Type integrity: each converted column must carry the expected dtype family
assert ptypes.is_datetime64_any_dtype(customers['created_at'])
assert ptypes.is_datetime64_any_dtype(transactions['ts'])
assert ptypes.is_numeric_dtype(transactions['price_num'])

# Logical constraint: prices are nonnegative (missing prices count as 0 for this check)
prices_filled = transactions['price_num'].fillna(0)
assert prices_filled.ge(0).all()
print("Type and logical checks passed.")

## 8. Categorical Data: Normalization, Enforcement, and Drift

In [None]:
# Normalize state values: upper-case, then trim surrounding whitespace
customers['state_norm'] = customers['state'].str.upper().str.strip()

# Enforce the allowed category set
allowed_states = ['TN', 'GA', 'AL']
state_type = CategoricalDtype(categories=allowed_states, ordered=False)
customers['state_cat'] = customers['state_norm'].astype(state_type)

# Baseline check on the *raw* column: rows whose un-normalized value is outside the allowed set
invalid_mask = ~customers['state'].isin(state_type.categories)
customers.loc[invalid_mask, ['id', 'state', 'state_norm', 'state_cat']]

In [None]:
# Detect drift after normalization: normalized values that are still outside the allowed set
invalid_mask = ~customers['state_norm'].isin(state_type.categories)
customers.loc[invalid_mask, ['id', 'state', 'state_norm', 'state_cat']]

In [None]:
# Map gender example: collapse known spelling variants onto canonical labels.
# Values without an entry in the mapping (e.g. 'unknown') are left as they are.
gender_aliases = {'male':'Male','M':'Male','female':'Female','F':'Female'}
demo = pd.DataFrame({'gender': ['Male','male','M','Female','F','unknown', np.nan]})
demo['gender_std'] = demo['gender'].replace(gender_aliases)

demo

In [None]:
# Rows whose standardized label is not one of the allowed values
# (a missing value is not in the list either, so it is flagged too)
allowed = ['Male','Female']
invalid = ~demo['gender_std'].isin(allowed)

demo.loc[invalid]

## 9. Clean Joins After Dedup and Type Fixes

In [None]:
# Prepare customers: deduplicate by (id, email after trim+lower).
# A stable sort keeps rows with equal `created_at` in their original order, so the row
# selected by keep='last' is deterministic.
cust_norm = customers.assign(
    email_norm = customers['email'].str.strip().str.lower()
).sort_values('created_at', kind='stable')

cust_unique = cust_norm.drop_duplicates(subset=['id','email_norm'], keep='last')

# Verify uniqueness on the join key. Dedup used (id, email_norm), so an id that carries
# two different emails would still appear twice and must be caught here.
assert not cust_unique.duplicated(subset=['id']).any(), "customer id is not unique after dedup"

# Safe join with transactions. validate='many_to_one' makes pandas raise a MergeError
# if the customer side ever contains a repeated id, so the join cannot fan out rows silently.
tx_clean = transactions[['id','ts','price_num','status']].drop_duplicates(subset=['id','ts'], keep='last')
fact = tx_clean.merge(
    cust_unique[['id','email_norm','state_cat']],
    on='id',
    how='left',
    validate='many_to_one',
)
fact.head()

## 10. Documenting Cleaning Decisions
| Step | Action                 | Columns                    | Notes                                  |
|-----:|------------------------|----------------------------|----------------------------------------|
| 1    | Drop duplicates        | id, ts                     | Keep last per (id, ts)                 |
| 2    | Convert numeric/date   | price → price_num, ts      | `errors='coerce'` for robustness       |
| 3    | Normalize categories   | state → state_norm/state_cat | Uppercase + strip whitespace, strict category set |
| 4    | Validate constraints   | price_num, ts              | Nonnegative prices, valid datetimes    |
| 5    | Reproducible joins     | id                         | Verified unique keys                   |

## 11. Exercises
Complete the tasks below. Answers are provided in the subsequent section.

**Exercise 1.** Identify all partial duplicates in `transactions` by keys `(id, ts)` and return a deduplicated frame keeping the **highest** `price_num` per key.

In [None]:
# TODO: Your code here
# Goal: one row per (id, ts) key, carrying the largest price_num seen for that key.
# 1) Mark duplicates by subset
# 2) Aggregate by max(price_num)
# 3) Merge back or compute directly
# expected columns: id, ts, price_num_max
pass

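**Exercise 2.** Convert `customers['created_at']` to datetime with `errors='raise'`. Catch and display the error, then convert correctly with a strict `format` specification. *Note:* Section 5 already converted this column in place, so start from the original string values (re-run the `customers` cell from Section 1, or work on a fresh copy of the raw data); otherwise there is nothing left to fail.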

In [None]:
# TODO: Your code here
# Note: Section 5 overwrote customers['created_at'] with datetimes; use the raw strings
#       from the Section 1 cell as input for this exercise.
# 1) Try strict conversion and handle exception
# 2) Then convert again using format='%Y-%m-%d' for the rows that match, coerce others
pass

**Exercise 3.** Enforce a categorical dtype for `customers['state_norm']` limited to `['TN','GA','AL']`. Show rows that become NaN after enforcement.

In [None]:
# TODO: Your code here
# Requires customers['state_norm'], which is created in Section 8.
# 1) Define CategoricalDtype
# 2) astype to that type
# 3) filter rows where value is NaN in enforced column
pass

**Exercise 4.** Prove that `cust_unique['id']` is unique using two independent checks.

In [None]:
# TODO: Your code here
# Requires cust_unique, which is built in Section 9.
# e.g., use .is_unique and duplicated().any()
pass