# 03 — Data Cleaning
**Data Analysis Portfolio**

Topics: missing values, duplicates, invalid values, outlier capping, type fixing, string standardization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global random generator so the synthetic data drawn in the
# following cells is the same on every run.
SEED = 42
np.random.seed(SEED)
print("Ready.")

## 1. Create Dirty Dataset

In [None]:
# Build a deliberately messy employee table: n synthetic rows followed by
# 3 rows that reuse existing emp_ids, with inconsistently spelled
# department/gender labels and out-of-range ratings mixed in.
n = 200
df_raw = pd.DataFrame({
    'emp_id':     list(range(1001,1001+n)) + [1010,1025,1040],
    'name':       ['Emp_'+str(i) for i in range(n)] + ['Emp_9','Emp_24','Emp_39'],
    'age':        list(np.random.randint(22,60,n)) + [22,35,45],
    'department': list(np.random.choice(['IT','HR','Finance','IT ','hr','FINANCE'], n))
                  + ['IT','HR','Finance'],
    'salary':     list(np.random.normal(55000,15000,n).round(0)) + [55000,48000,62000],
    'experience': list(np.random.randint(0,35,n)) + [5,10,15],
    'gender':     list(np.random.choice(['Male','Female','M','F','male','FEMALE',None], n))
                  + ['Male','Female','Male'],
    'rating':     list(np.random.choice([1,2,3,4,5,None,999,-1], n)) + [3,4,5],
})

# NaN cannot be stored in an integer column. Writing it with .loc forces an
# implicit int -> float upcast, which recent pandas versions warn about and
# are slated to reject. Cast the columns that will receive NaN up front.
nan_cols = ['salary','age','experience']
df_raw = df_raw.astype({col: 'float64' for col in nan_cols})

# Blank out roughly 8% of each numeric column, drawn independently per column.
for col in nan_cols:
    mask = np.random.choice([True,False], len(df_raw), p=[0.08,0.92])
    df_raw.loc[mask, col] = np.nan

# Plant a few specific bad values for the cleaning steps to catch.
df_raw.loc[10,'salary']     = 950000   # extreme outlier
df_raw.loc[25,'salary']     = -5000    # invalid negative
df_raw.loc[50,'age']        = 135      # impossible
df_raw.loc[75,'experience'] = 99       # impossible

print("Raw shape:", df_raw.shape)
print(df_raw.head(5))

## 2. Audit — Understand the Mess

In [None]:
# Per-column missing-value counts and their share of all rows (in percent).
missing = df_raw.isna().sum()
pct = missing.div(len(df_raw)).mul(100).round(1)
audit = pd.DataFrame({'count': missing, '%': pct})

# Show only the columns that actually have gaps.
has_gaps = audit['count'].gt(0)
print("Missing Values:")
print(audit[has_gaps], end="\n\n")

# Rows whose emp_id already appeared earlier in the table.
repeated_ids = df_raw.duplicated(subset='emp_id').sum()
print("Duplicates:", repeated_ids, end="\n\n")

# List the raw label variants of each categorical column.
for col in ('department', 'gender'):
    print(f"{col} unique: {df_raw[col].unique()}")

## 3. Step-by-Step Cleaning

In [None]:
# Work on a copy so df_raw stays available for the before/after comparison.
df = df_raw.copy()

# STEP 1 — Remove duplicates
# Keep the first row seen for each emp_id and drop any later repeats.
before = df.shape[0]
df = df[~df.duplicated(subset='emp_id', keep='first')]
print(f"Step 1 — Removed {before-len(df)} duplicates | shape: {df.shape}")

In [None]:
# STEP 2 — Standardize categories
# Departments: trim whitespace and title-case, then restore the acronyms
# that title-casing turned into 'It' / 'Hr'.
df['department'] = df['department'].str.strip().str.title().replace({'It':'IT','Hr':'HR'})

# Genders: fold case before mapping so every spelling variant ('m', 'MALE',
# 'FeMale', ...) is recognised, not only the ones listed literally.
# Anything that is not a known label (including missing values) becomes NaN
# and is left for the imputation step.
gender_map = {'m':'Male','male':'Male','f':'Female','female':'Female'}
df['gender'] = df['gender'].str.strip().str.lower().map(gender_map)
print("Step 2 — Departments:", df['department'].unique())
print("         Genders:    ", df['gender'].unique())

In [None]:
# STEP 3 — Fix invalid values
# Values that break a domain rule are blanked to NaN here and imputed later.
age_out_of_range = (df['age'] > 80) | (df['age'] < 18)
df.loc[age_out_of_range, 'age'] = np.nan

negative_salary = df['salary'].lt(0)
df.loc[negative_salary, 'salary'] = np.nan

too_much_experience = df['experience'].gt(45)
df.loc[too_much_experience, 'experience'] = np.nan

# Ratings must be one of 1..5; everything else (999, -1, None) is blanked.
valid_rating = df['rating'].isin([1,2,3,4,5])
df.loc[~valid_rating, 'rating'] = np.nan

# Cap the upper salary tail at Q3 + 3*IQR; there is no lower cap because
# negative salaries were already blanked above.
Q1, Q3 = df['salary'].quantile([0.25, 0.75])
cap = Q3 + 3*(Q3-Q1)
df['salary'] = df['salary'].clip(upper=cap)
print(f"Step 3 — Salary capped at {cap:,.0f}")

In [None]:
# STEP 4 — Fill missing values
# Age and experience: fill with the column-wide median.
for col in ['age','experience']:
    df[col] = df[col].fillna(df[col].median())

# Salary: use the median of the employee's department first. Rows with a
# missing department, or in a department with no observed salary, get no
# group median, so fall back to the overall median for those.
dept_median = df.groupby('department')['salary'].transform('median')
df['salary']  = df['salary'].fillna(dept_median).fillna(df['salary'].median())

# Rating is an object column (it held None and out-of-range values), so make
# it numeric before filling instead of relying on fillna to infer a dtype.
df['rating']  = pd.to_numeric(df['rating'], errors='coerce')

# Categorical-style columns: fill with the most frequent value. mode() is
# empty when a column has no observed values; skip the fill in that case so
# the gap shows up in the count below rather than raising IndexError.
for col in ['gender','rating']:
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])
print("Step 4 — Missing values remaining:", df.isnull().sum().sum())

In [None]:
# STEP 5 — Fix types
# These columns hold whole numbers again once the gaps are filled.
for col in ('age', 'experience', 'rating'):
    df[col] = df[col].astype(int)

# Salary stays floating point, rounded to two decimals.
df['salary'] = df['salary'].round(2)

# Numeric encoding of gender: Male -> 0, Female -> 1.
gender_codes = {'Male':0,'Female':1}
df['gender_code'] = df['gender'].map(gender_codes)
print("Step 5 — dtypes:\n", df.dtypes)

## 4. Before vs After

In [None]:
# Side-by-side summary of the raw table (left) and the cleaned table (right).
report_lines = [
    "BEFORE | AFTER",
    f"Shape:      {df_raw.shape}  |  {df.shape}",
    f"Duplicates: {df_raw.duplicated(subset='emp_id').sum()} | {df.duplicated(subset='emp_id').sum()}",
    f"Missing:    {df_raw.isnull().sum().sum()} | {df.isnull().sum().sum()}",
    f"Salary max: {df_raw['salary'].max():,.0f} | {df['salary'].max():,.0f}",
]
print("\n".join(report_lines))

## 5. Visualize Cleaned Data

In [None]:
# 2x2 overview of the cleaned data: salary and age histograms on the left,
# department and rating counts on the right.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('Cleaned Dataset Overview', fontsize=13, fontweight='bold')
(ax_salary, ax_dept), (ax_age, ax_rating) = axes

# Histograms of the continuous columns.
ax_salary.hist(df['salary'], bins=25, color='steelblue', edgecolor='white')
ax_salary.set_title('Salary Distribution')
ax_age.hist(df['age'], bins=20, color='mediumseagreen', edgecolor='white')
ax_age.set_title('Age Distribution')

# Bar charts of the categorical columns.
dept = df['department'].value_counts()
ax_dept.bar(dept.index, dept.values, color='coral', edgecolor='white')
ax_dept.set_title('Department Counts')

rating = df['rating'].value_counts().sort_index()  # bars in rating order
ax_rating.bar(rating.index, rating.values, color='mediumpurple', edgecolor='white')
ax_rating.set_title('Rating Distribution')

fig.tight_layout()
# NOTE: the output location is a hard-coded absolute path.
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/03_cleaning_plot.png', dpi=100)
plt.show()
print("Plot saved.")

---
## ✅ Cleaning Checklist
| Issue | Fix |
|-------|-----|
| Duplicates | `drop_duplicates()` |
| Inconsistent strings | `.str.strip().str.title()` + `replace()` |
| Invalid values | Domain-rule checks; violations set to `np.nan` |
| Outliers | IQR capping (upper tail capped at Q3 + 3×IQR) |
| Missing numeric | Median / group median |
| Missing categorical | Mode |
| Wrong dtypes | `.astype()` |