# Tidying II: Deduplication, Type Consistency, & Categorical Integrity

*Hands-on notebook with demos and exercises*

## 0. Setup

In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
pd.__version__

'2.2.2'

## 1. Create Synthetic Datasets
We will create small but realistic datasets with deliberate issues: duplicates, mixed types, messy categories.

In [2]:
# Customers: duplicates, whitespace, case inconsistencies, and missing values
customers = pd.DataFrame({
    'id': [101, 101, 102, 103, 103, 104, 105],
    'email': ['a@x.com', 'A@x.com ', 'b@x.com', 'c@x.com', 'c@x.com', 'd@x.com', ' e@x.com'],
    'state': ['tn', 'TN ', 'GA', 'ga', 'GA', 'AL', 'Tn'],
    'created_at': ['2025-09-01', '2025/09/01', '2025-09-05', '2025-09-07', '2025-09-07', '2025-09-10', '2025-09-12']
})
customers

Unnamed: 0,id,email,state,created_at
0,101,a@x.com,tn,2025-09-01
1,101,A@x.com,TN,2025/09/01
2,102,b@x.com,GA,2025-09-05
3,103,c@x.com,ga,2025-09-07
4,103,c@x.com,GA,2025-09-07
5,104,d@x.com,AL,2025-09-10
6,105,e@x.com,Tn,2025-09-12


In [3]:
# Transactions: partial duplicates by (id, ts), price as strings with symbols, and bad rows
transactions = pd.DataFrame({
    'id': [101, 101, 101, 102, 103, 103, 106],
    'ts': ['2025-09-02 10:00', '2025-09-02 10:00', '2025-09-03 09:00', '2025-09-06 16:30', '2025-09-08 08:00', '2025-09-08 08:00', '2025-09-12 12:00'],
    'price': ['$10.00', '10', '9.5', '7.25', '0', 'free', '12.5'],
    'status': ['paid', 'paid', 'paid', 'paid', 'refunded', 'refunded', 'paid']
})
transactions

Unnamed: 0,id,ts,price,status
0,101,2025-09-02 10:00,$10.00,paid
1,101,2025-09-02 10:00,10,paid
2,101,2025-09-03 09:00,9.5,paid
3,102,2025-09-06 16:30,7.25,paid
4,103,2025-09-08 08:00,0,refunded
5,103,2025-09-08 08:00,free,refunded
6,106,2025-09-12 12:00,12.5,paid


## 2. Deduplication: Detecting Exact vs Partial Duplicates

In [4]:
# Exact duplicates across all columns in transactions
transactions_exact_dups_mask = transactions.duplicated(keep=False)
transactions[transactions_exact_dups_mask]

Unnamed: 0,id,ts,price,status


In [5]:
# Partial duplicates by a key subset: (id, ts)
subset_cols = ['id', 'ts']
dups_subset_mask = transactions.duplicated(subset=subset_cols, keep=False)
transactions[dups_subset_mask].sort_values(subset_cols)

Unnamed: 0,id,ts,price,status
0,101,2025-09-02 10:00,$10.00,paid
1,101,2025-09-02 10:00,10,paid
4,103,2025-09-08 08:00,0,refunded
5,103,2025-09-08 08:00,free,refunded


**Note:** `keep=False` marks all occurrences of a duplicate as `True`. Use it to inspect every row involved.

## 3. Deduplication: Resolution Strategies

In [6]:
# 3.1 Keep first occurrence by key
tx_keep_first = transactions.drop_duplicates(subset=['id','ts'], keep='first')
tx_keep_first

Unnamed: 0,id,ts,price,status
0,101,2025-09-02 10:00,$10.00,paid
2,101,2025-09-03 09:00,9.5,paid
3,102,2025-09-06 16:30,7.25,paid
4,103,2025-09-08 08:00,0,refunded
6,106,2025-09-12 12:00,12.5,paid


In [7]:
# 3.2 Keep last occurrence by key
tx_keep_last = transactions.drop_duplicates(subset=['id','ts'], keep='last')
tx_keep_last

Unnamed: 0,id,ts,price,status
1,101,2025-09-02 10:00,10,paid
2,101,2025-09-03 09:00,9.5,paid
3,102,2025-09-06 16:30,7.25,paid
5,103,2025-09-08 08:00,free,refunded
6,106,2025-09-12 12:00,12.5,paid


In [8]:
transactions

Unnamed: 0,id,ts,price,status
0,101,2025-09-02 10:00,$10.00,paid
1,101,2025-09-02 10:00,10,paid
2,101,2025-09-03 09:00,9.5,paid
3,102,2025-09-06 16:30,7.25,paid
4,103,2025-09-08 08:00,0,refunded
5,103,2025-09-08 08:00,free,refunded
6,106,2025-09-12 12:00,12.5,paid


In [9]:
# 3.3 Aggregate duplicates by key (sum price after converting to numeric; keep max timestamp string for demo)
tx_tmp = transactions.copy()
tx_tmp['price_num'] = pd.to_numeric(tx_tmp['price'].str.replace('$','', regex=False), errors='coerce')
tx_agg = (
    tx_tmp.groupby(['id','ts'], as_index=False)
          .agg(price_total=('price_num','sum'), status_last=('status','last'))
)
tx_agg

Unnamed: 0,id,ts,price_total,status_last
0,101,2025-09-02 10:00,20.0,paid
1,101,2025-09-03 09:00,9.5,paid
2,102,2025-09-06 16:30,7.25,paid
3,103,2025-09-08 08:00,0.0,refunded
4,106,2025-09-12 12:00,12.5,paid


**Guideline:** Document which rule is applied and why. Aggregation is appropriate when each row is a component of a single logical event.

## 4. Practical Deduplication Recipe with Assertions

In [10]:
# Sort by time and keep latest per (id, ts)
tx_sorted = transactions.sort_values('ts')
tx_unique = tx_sorted.drop_duplicates(subset=['id','ts'], keep='last')

# Verify no duplicates remain on the key
assert not tx_unique.duplicated(subset=['id','ts']).any()
tx_unique

Unnamed: 0,id,ts,price,status
1,101,2025-09-02 10:00,10,paid
2,101,2025-09-03 09:00,9.5,paid
3,102,2025-09-06 16:30,7.25,paid
5,103,2025-09-08 08:00,free,refunded
6,106,2025-09-12 12:00,12.5,paid


## 5. Type Conversion: Inspecting and Enforcing Types

In [11]:
# Inspect
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7 non-null      int64 
 1   email       7 non-null      object
 2   state       7 non-null      object
 3   created_at  7 non-null      object
dtypes: int64(1), object(3)
memory usage: 356.0+ bytes


In [12]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7 non-null      int64 
 1   ts      7 non-null      object
 2   price   7 non-null      object
 3   status  7 non-null      object
dtypes: int64(1), object(3)
memory usage: 356.0+ bytes


In [13]:
# Convert to datetime
customers['created_at'] = pd.to_datetime(customers['created_at'], errors='coerce')
transactions['ts'] = pd.to_datetime(transactions['ts'], errors='coerce')

customers.dtypes

id                     int64
email                 object
state                 object
created_at    datetime64[ns]
dtype: object

In [14]:
# Convert price to numeric
transactions['price_num'] = pd.to_numeric(transactions['price'].str.replace('$','', regex=False), errors='coerce')

transactions.dtypes

id                    int64
ts           datetime64[ns]
price                object
status               object
price_num           float64
dtype: object

In [15]:
# Alternatives to 'coerce' for error handling
try:
    pd.to_numeric(pd.Series(['1','two','3']), errors='raise')
except Exception as e:
    print("errors='raise' example ->", repr(e))

# errors='ignore' leaves data unchanged if invalid
print(pd.to_numeric(pd.Series(['1','two','3']), errors='ignore'))

errors='raise' example -> ValueError('Unable to parse string "two" at position 1')
0      1
1    two
2      3
dtype: object


  print(pd.to_numeric(pd.Series(['1','two','3']), errors='ignore'))


## 6. `convert_dtypes()` and Explicit `astype()`

In [16]:
# convert_dtypes picks best nullable dtypes
auto_customers = customers.convert_dtypes()

auto_customers.dtypes

id                     Int64
email         string[python]
state         string[python]
created_at    datetime64[ns]
dtype: object

In [17]:
auto_transactions = transactions.convert_dtypes()
auto_transactions.dtypes

id                    Int64
ts           datetime64[ns]
price        string[python]
status       string[python]
price_num           Float64
dtype: object

In [18]:
# Explicit casting of multiple columns
casted = transactions.astype({'price_num': 'float64'})
casted.dtypes

id                    int64
ts           datetime64[ns]
price                object
status               object
price_num           float64
dtype: object

## 7. Validating Type Integrity and Logical Constraints

In [19]:
import pandas.api.types as ptypes
assert ptypes.is_datetime64_any_dtype(customers['created_at'])
assert ptypes.is_datetime64_any_dtype(transactions['ts'])
assert ptypes.is_numeric_dtype(transactions['price_num'])

# Logical constraints
assert (transactions['price_num'].fillna(0) >= 0).all()  # Nonnegative
print("Type and logical checks passed.")

Type and logical checks passed.


## 8. Categorical Data: Normalization, Enforcement, and Drift

In [20]:
# Normalize state values
customers['state_norm'] = customers['state'].str.upper().str.strip()

# Enforce allowed categories
state_type = CategoricalDtype(categories=['TN','GA','AL'], ordered=False)
customers['state_cat'] = customers['state_norm'].astype(state_type)

# Detect drift
invalid_mask = ~customers['state'].isin(state_type.categories)
customers[['id','state','state_norm','state_cat']][invalid_mask]

Unnamed: 0,id,state,state_norm,state_cat
0,101,tn,TN,TN
1,101,TN,TN,TN
3,103,ga,GA,GA
6,105,Tn,TN,TN


In [21]:
# Detect drift
invalid_mask = ~customers['state_norm'].isin(state_type.categories)
customers[['id','state','state_norm','state_cat']][invalid_mask]

Unnamed: 0,id,state,state_norm,state_cat


In [22]:
# Map gender example and enforce categories
demo = pd.DataFrame({'gender': ['Male','male','M','Female','F','unknown', np.nan]})
demo['gender_std'] = demo['gender'].replace({'male':'Male','M':'Male','female':'Female','F':'Female'})

demo

Unnamed: 0,gender,gender_std
0,Male,Male
1,male,Male
2,M,Male
3,Female,Female
4,F,Female
5,unknown,unknown
6,,


In [23]:
allowed = ['Male','Female']
invalid = ~demo['gender_std'].isin(allowed)

demo[invalid]

Unnamed: 0,gender,gender_std
5,unknown,unknown
6,,


## 9. Clean Joins After Dedup and Type Fixes

In [24]:
# Prepare customers: deduplicate by (id, email after trim+lower)
cust_norm = customers.assign(
    email_norm = customers['email'].str.strip().str.lower()
).sort_values('created_at')

cust_unique = cust_norm.drop_duplicates(subset=['id','email_norm'], keep='last')

# Verify uniqueness
assert not cust_unique.duplicated(subset=['id']).any()

# Safe join with transactions
tx_clean = transactions[['id','ts','price_num','status']].drop_duplicates(subset=['id','ts'], keep='last')
fact = tx_clean.merge(cust_unique[['id','email_norm','state_cat']], on='id', how='left')
fact.head()

Unnamed: 0,id,ts,price_num,status,email_norm,state_cat
0,101,2025-09-02 10:00:00,10.0,paid,a@x.com,TN
1,101,2025-09-03 09:00:00,9.5,paid,a@x.com,TN
2,102,2025-09-06 16:30:00,7.25,paid,b@x.com,GA
3,103,2025-09-08 08:00:00,,refunded,c@x.com,GA
4,106,2025-09-12 12:00:00,12.5,paid,,


## 10. Documenting Cleaning Decisions
| Step | Action                 | Columns                    | Notes                                  |
|-----:|------------------------|----------------------------|----------------------------------------|
| 1    | Drop duplicates        | id, ts                     | Keep last per (id, ts)                 |
| 2    | Convert numeric/date   | price → price_num, ts      | `errors='coerce'` for robustness       |
| 3    | Normalize categories   | state → state_norm/state_cat | Uppercase + strict category set      |
| 4    | Validate constraints   | price_num, ts              | Nonnegative prices, valid datetimes    |
| 5    | Reproducible joins     | id                         | Verified unique keys                   |

## 11. Exercises
Complete the tasks below. Answers are provided in the subsequent section.

**Exercise 1.** Identify all partial duplicates in `transactions` by keys `(id, ts)` and return a deduplicated frame keeping the **highest** `price_num` per key.

In [25]:
# TODO: Your code here
# 1) Mark duplicates by subset
# 2) Aggregate by max(price_num)
# 3) Merge back or compute directly
# expected columns: id, ts, price_num_max
pass

In [26]:
ans1 = (
    transactions
      .assign(price_num=pd.to_numeric(transactions['price'].str.replace('$','', regex=False), errors='coerce'))
      .groupby(['id','ts'], as_index=False)
      .agg(price_num_max=('price_num','max'))
)
ans1.sort_values(['id','ts']).head(10)

Unnamed: 0,id,ts,price_num_max
0,101,2025-09-02 10:00:00,10.0
1,101,2025-09-03 09:00:00,9.5
2,102,2025-09-06 16:30:00,7.25
3,103,2025-09-08 08:00:00,0.0
4,106,2025-09-12 12:00:00,12.5


**Exercise 2.** Convert `customers['created_at']` to datetime with `errors='raise'`. Catch and display the error, then convert correctly with a strict `format` specification.

In [27]:
# TODO: Your code here
# 1) Try strict conversion and handle exception
# 2) Then convert again using format='%Y-%m-%d' for the rows that match, coerce others
pass

In [28]:
try:
    pd.to_datetime(customers['created_at'], errors='raise')
except Exception as e:
    print("Strict conversion failed:", e)

# Use a mask to strictly parse only ISO-like 'YYYY-MM-DD' rows
mask_iso = customers['created_at'].astype(str).str.match(r'^\d{4}-\d{2}-\d{2}$', na=False)
created_fixed = pd.Series(pd.NaT, index=customers.index, dtype='datetime64[ns]')
created_fixed[mask_iso] = pd.to_datetime(customers.loc[mask_iso, 'created_at'], format='%Y-%m-%d', errors='coerce')
created_fixed[~mask_iso] = pd.to_datetime(customers.loc[~mask_iso, 'created_at'], errors='coerce')  # fallback
created_fixed

0   2025-09-01
1          NaT
2   2025-09-05
3   2025-09-07
4   2025-09-07
5   2025-09-10
6   2025-09-12
dtype: datetime64[ns]

**Exercise 3.** Enforce a categorical dtype for `customers['state_norm']` limited to `['TN','GA','AL']`. Show rows that become NaN after enforcement.

In [29]:
# TODO: Your code here
# 1) Define CategoricalDtype
# 2) astype to that type
# 3) filter rows where value is NaN in enforced column
pass

In [30]:
state_type2 = CategoricalDtype(categories=['TN','GA','AL'], ordered=False)
state_enforced = customers['state_norm'].astype(state_type2)
invalid_rows = customers[state_enforced.isna()][['id','state','state_norm']]
state_enforced.dtype, invalid_rows

(CategoricalDtype(categories=['TN', 'GA', 'AL'], ordered=False, categories_dtype=object),
 Empty DataFrame
 Columns: [id, state, state_norm]
 Index: [])

**Exercise 4.** Prove that `cust_unique['id']` is unique using two independent checks.

In [31]:
# TODO: Your code here
# e.g., use .is_unique and duplicated().any()
pass

In [32]:
check1 = cust_unique['id'].is_unique
check2 = not cust_unique['id'].duplicated().any()
check1, check2

(True, True)