# 05 — Outlier Detection & Treatment
**Data Analysis Portfolio**

- **Methods:** IQR, Z-Score, Modified Z-Score
- **Treatments:** Remove, Cap, Log Transform

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Seed NumPy's global RNG so the np.random draws in later cells are reproducible.
np.random.seed(42)
print("Ready.")

## 1. Dataset with Injected Outliers

In [None]:
# Synthetic sample: `n` normally-distributed rows plus five hand-picked extreme
# rows appended at the end. Salary is drawn before age so the sequence of RNG
# draws stays the same.
n = 300
injected_salaries = [200000, 220000, -5000, 185000, 3000]
injected_ages = [120, 130, 2, 1, 125]

salary = np.concatenate([np.random.normal(60000, 12000, n), injected_salaries])
age = np.concatenate([np.clip(np.random.normal(35, 8, n), 18, 65), injected_ages])

df = pd.DataFrame({
    'salary': np.round(salary),
    'age': np.round(age).astype(int),
})
print(f"Dataset: {len(df)} rows (includes injected outliers)")
print(df.describe())

## 2. Visual Detection — Boxplot

In [None]:
# Four-panel visual check: salary boxplot + histogram, age boxplot, and a
# salary-vs-age scatter with rule-of-thumb outliers highlighted.
fig, axes = plt.subplots(2, 2, figsize=(13, 9))
fig.suptitle('Outlier Detection — Visual', fontsize=14, fontweight='bold')

# Flier markers are drawn without a connecting line, so a bare `color=` has no
# visible effect; the marker face/edge colours must be set explicitly for the
# outlier points to actually render red (as the panel title promises).
flier_style = dict(marker='o', markerfacecolor='red', markeredgecolor='red', markersize=7)

axes[0,0].boxplot(df['salary'], vert=False, patch_artist=True,
                  boxprops=dict(facecolor='lightblue'),
                  flierprops=flier_style)
axes[0,0].set_title('Salary Boxplot (red = outliers)')
axes[0,0].set_xlabel('salary')

axes[0,1].hist(df['salary'], bins=30, color='steelblue', edgecolor='white')
axes[0,1].set_title('Salary Distribution')
axes[0,1].set_xlabel('salary')
axes[0,1].set_ylabel('count')

axes[1,0].boxplot(df['age'], vert=False, patch_artist=True,
                  boxprops=dict(facecolor='lightcoral'),
                  flierprops=flier_style)
axes[1,0].set_title('Age Boxplot')
axes[1,0].set_xlabel('age')

# Hand-picked plausibility thresholds, used only to colour the scatter plot.
normal = (df['salary']>0)&(df['salary']<150000)&(df['age']<80)
axes[1,1].scatter(df.loc[normal, 'age'],  df.loc[normal, 'salary'],
                  color='steelblue', alpha=0.4, s=15, label='Normal')
axes[1,1].scatter(df.loc[~normal, 'age'], df.loc[~normal, 'salary'],
                  color='red', s=60, marker='X', label='Outlier')
axes[1,1].set_title('Salary vs Age')
axes[1,1].set_xlabel('age')
axes[1,1].set_ylabel('salary')
axes[1,1].legend()

plt.tight_layout()
# Save next to the notebook (relative to the working directory) rather than to
# a machine-specific absolute path that does not exist elsewhere.
plt.savefig('05_outlier_visual.png', dpi=100)
plt.show()

## 3. IQR Method

In [None]:
def detect_iqr(series, mult=1.5):
    """Flag values lying outside the Tukey fences of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.
    mult : float, default 1.5
        How many interquartile ranges beyond Q1/Q3 the fences are placed.

    Returns
    -------
    tuple
        ``(mask, low, high)`` where ``mask`` is a boolean Series that is True
        for values strictly below ``low`` or strictly above ``high``.
    """
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    reach = mult * (upper_quartile - lower_quartile)

    low = lower_quartile - reach
    high = upper_quartile + reach

    is_outlier = (series < low) | (series > high)
    return is_outlier, low, high

# Apply the IQR detector to each column, then report the fences and the
# flagged values in ascending order.
sal_mask, s_low, s_high = detect_iqr(df['salary'])
age_mask, a_low, a_high = detect_iqr(df['age'])

salary_outliers = df.loc[sal_mask, 'salary'].sort_values().tolist()
age_outliers = df.loc[age_mask, 'age'].sort_values().tolist()

print(f"IQR Salary — range: [{s_low:,.0f}, {s_high:,.0f}] | outliers: {sal_mask.sum()}")
print(f"  Outlier values: {salary_outliers}")
print(f"IQR Age    — range: [{a_low:.1f}, {a_high:.1f}] | outliers: {age_mask.sum()}")
print(f"  Outlier values: {age_outliers}")

## 4. Z-Score Method

In [None]:
def detect_zscore(series, threshold=3.0):
    """Flag values whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    series : array-like
        Numeric values to screen (a pandas Series in this notebook).
    threshold : float, default 3.0
        Cut-off on the absolute z-score; values strictly above it are flagged.

    Returns
    -------
    tuple
        ``(mask, z)`` where ``z`` holds the absolute z-scores and ``mask`` is
        the boolean result of ``z > threshold``.

    Notes
    -----
    Missing values are left out of the mean/standard deviation
    (``nan_policy='omit'``). They keep a NaN z-score and are never flagged.
    Under scipy's default ``'propagate'`` policy a single NaN would turn every
    z-score into NaN, so no outlier would be reported at all.
    """
    z = np.abs(stats.zscore(series, nan_policy='omit'))
    return z > threshold, z

# Score both columns, print how many rows cross the threshold, then list the
# five salaries with the largest absolute z-score.
sal_z_mask, sal_z = detect_zscore(df['salary'])
age_z_mask, age_z = detect_zscore(df['age'])

print(f"Z-Score (threshold=3) — Salary outliers: {sal_z_mask.sum()}")
print(f"Z-Score (threshold=3) — Age outliers:    {age_z_mask.sum()}")

top = df.assign(z_salary=sal_z.round(2))
print("\nTop salary outliers by z-score:")
print(top[['salary', 'z_salary']].nlargest(5, 'z_salary'))

## 5. Modified Z-Score (Robust for non-normal data)

In [None]:
def modified_z(series, threshold=3.5):
    """Flag outliers using the MAD-based modified z-score.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median of
    the absolute deviations from the median.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.
    threshold : float, default 3.5
        Cut-off on the absolute modified z-score.

    Returns
    -------
    tuple
        ``(mask, mz)`` where ``mz`` is the *signed* modified z-score Series
        and ``mask`` is True where ``|mz| > threshold``.

    Notes
    -----
    Both the median and the MAD are computed with pandas, which skips missing
    values. Computing the MAD with ``np.median`` instead yields NaN as soon as
    one value is missing, which makes every score NaN and hides all outliers.
    Missing inputs keep a NaN score and are never flagged.
    """
    median = series.median()
    deviation = series - median
    mad = deviation.abs().median()
    # The small epsilon keeps the division finite when MAD is exactly zero.
    mz = 0.6745 * deviation / (mad + 1e-10)
    return np.abs(mz) > threshold, mz

# Run the MAD-based detector on salary, then line up the three methods' flags
# and show the first rows that at least one method marked.
sal_mz_mask, _ = modified_z(df['salary'])
print(f"Modified Z-Score — Salary outliers: {sal_mz_mask.sum()}")

comp = pd.DataFrame({
    'salary': df['salary'],
    'IQR': sal_mask,
    'Z-Score': sal_z_mask,
    'Mod-Z': sal_mz_mask,
})
flagged_by_any = comp[['IQR', 'Z-Score', 'Mod-Z']].any(axis=1)
print("\nOutliers flagged by each method:")
print(comp.loc[flagged_by_any].head(8))

## 6. Treatment: Remove / Cap / Log Transform

In [None]:
# REMOVE — keep only rows that neither IQR mask flagged.
is_iqr_outlier = sal_mask | age_mask
df_rem = df.loc[~is_iqr_outlier].copy()
print(f"Remove: {len(df)} → {len(df_rem)} rows")

# CAP (Winsorization) — pull values back to the IQR fences, keeping every row.
df_cap = df.assign(
    salary=df['salary'].clip(lower=s_low, upper=s_high),
    age=df['age'].clip(lower=a_low, upper=a_high),
)
print(f"Capped salary max: {df_cap['salary'].max():,.0f}")

# LOG TRANSFORM — restricted to strictly positive salaries.
positive_salary = df['salary'] > 0
df_log = df.loc[positive_salary].copy()
df_log['salary_log'] = np.log1p(df_log['salary'])
print(f"Skew: original={df['salary'].skew():.2f} → log={df_log['salary_log'].skew():.2f}")

In [None]:
# Side-by-side salary histograms for the raw, capped and log-transformed data,
# each titled with its sample skewness.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
fig.suptitle('Salary: Original vs Capped vs Log', fontsize=12, fontweight='bold')

# (title prefix, values, bar colour) — one entry per panel, left to right.
panels = [
    ('Original  ', df['salary'],         'salmon'),
    ('Capped    ', df_cap['salary'],     'steelblue'),
    ('Log       ', df_log['salary_log'], 'mediumseagreen'),
]
for ax, (label, values, colour) in zip(axes, panels):
    ax.hist(values, bins=25, color=colour, edgecolor='white')
    ax.set_title(f'{label}(skew={values.skew():.2f})')

plt.tight_layout()
# Save next to the notebook (relative to the working directory) rather than to
# a machine-specific absolute path that does not exist elsewhere.
plt.savefig('05_treatment.png', dpi=100)
plt.show()

---
## ✅ Summary
| Method | Best For | Code |
|--------|----------|------|
| IQR | Non-normal | Q1-1.5×IQR, Q3+1.5×IQR |
| Z-Score | Normal data | `scipy.stats.zscore` |
| Modified Z | Skewed/robust | MAD-based |
| Remove | Clear errors | `df[~mask]` |
| Cap | Keep rows | `.clip(low, high)` |
| Log | Right skew, non-negative values | `np.log1p()` |