# Goal

Detect outliers in the Titanic `Age` column using two statistical rules (z-score and IQR) and one algorithmic method (Isolation Forest).


In [1]:
import pandas as pd
import numpy as np

# Source of the Titanic passenger table; kept in a named constant so the
# location is easy to find and change.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(TITANIC_CSV_URL)


In [3]:
# Z-Score Method
from scipy.stats import zscore

# Missing ages are replaced by the median age before standardizing, so every
# row receives a score.
# NOTE(review): the filled rows sit at the median and therefore feed into the
# mean/std used for scoring -- confirm this is the intended baseline.
age_filled = df["Age"].fillna(df["Age"].median())
df["Age_z"] = zscore(age_filled)

# Keep the rows whose standardized age has an absolute value above 3.
is_extreme_z = df["Age_z"].abs() > 3
outliers_z = df.loc[is_extreme_z]
outliers_z.shape


(7, 13)

In [4]:
# IQR Method
# Quartiles of the raw Age column and the spread between them.
Q1, Q3 = df["Age"].quantile(0.25), df["Age"].quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is flagged when its age falls strictly outside either fence.
# Missing ages compare False on both sides, so they are never flagged.
below_lower = df["Age"].lt(lower_fence)
above_upper = df["Age"].gt(upper_fence)
outliers_iqr = df.loc[below_lower | above_upper]


In [6]:
# Isolation Forest (Advanced)
from sklearn.ensemble import IsolationForest

# Single-feature input: Age with missing values replaced by the median age,
# reshaped to the one-column frame the estimator is fitted on.
age_features = df["Age"].fillna(df["Age"].median()).to_frame()

# contamination and random_state are passed explicitly so the run is repeatable.
iso = IsolationForest(contamination=0.05, random_state=42)

# Store the labels returned by fit_predict unchanged, then tally them.
df["outlier"] = iso.fit_predict(age_features)
df["outlier"].value_counts()


outlier
 1    846
-1     45
Name: count, dtype: int64

# Final Insight

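- Not all outliers are errors; some are genuine but rare observations.
- Domain knowledge should guide the decision; do not remove outliers blindly.
- Some models benefit from keeping outliers in the data.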
