# Stage 07 — Outliers, Risk & Assumptions

In [None]:

import pandas as pd, numpy as np
from pathlib import Path

# Path to the raw CSV input; it is relative, so it resolves against the
# working directory the notebook is run from.
DATA = Path("../data/raw/outliers_homework.csv")
# Load the CSV into the DataFrame that the following cells operate on.
df = pd.read_csv(DATA)
# Preview the first rows (shown as the cell output).
df.head()


In [None]:

def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Flag values lying outside the IQR fences of ``series``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pd.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence multiplier. Larger values widen the fences and flag fewer
        points; the default reproduces the previous hard-coded behaviour.

    Returns
    -------
    pd.Series
        Boolean mask with the same index as ``series``; ``True`` marks a
        value strictly below the lower fence or strictly above the upper
        fence. Values exactly on a fence are not flagged.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - k * iqr, q3 + k * iqr
    return (series < lower) | (series > upper)

def detect_outliers_zscore(series: pd.Series, threshold=3.0) -> pd.Series:
    """Flag values whose absolute z-score exceeds ``threshold``.

    Each value is standardised by subtracting the series mean and dividing
    by the series standard deviation (as returned by ``Series.std``).

    Parameters
    ----------
    series : pd.Series
        Numeric values to screen.
    threshold : float, default 3.0
        Cut-off on the absolute z-score; the comparison is strict.

    Returns
    -------
    pd.Series
        Boolean mask with the same index as ``series``; ``True`` marks an
        outlier.
    """
    centered = series.sub(series.mean())
    z_scores = centered.div(series.std())
    return z_scores.abs().gt(threshold)

def winsorize_series(series: pd.Series, lower=0.05, upper=0.95) -> pd.Series:
    """Cap the tails of ``series`` at the given quantiles.

    Parameters
    ----------
    series : pd.Series
        Numeric values to winsorize.
    lower : float, default 0.05
        Quantile whose value becomes the floor.
    upper : float, default 0.95
        Quantile whose value becomes the ceiling.

    Returns
    -------
    pd.Series
        A clipped copy of ``series``; values below the floor are raised to
        it and values above the ceiling are lowered to it.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    return series.clip(lower=floor, upper=ceiling)


In [None]:

# Flag outliers in `target_y` with both detectors and store the boolean masks
# as new columns, so the two methods can be compared row by row.
df['outlier_iqr'] = detect_outliers_iqr(df['target_y'])
df['outlier_z'] = detect_outliers_zscore(df['target_y'])
# Preview the frame including the new flag columns (shown as the cell output).
df.head()


In [None]:

def summarize(series):
    """Return the mean, median and standard deviation of ``series``.

    The result is a ``pd.Series`` indexed by ``'mean'``, ``'median'`` and
    ``'std'``, in that order.
    """
    labels = ['mean', 'median', 'std']
    values = [series.mean(), series.median(), series.std()]
    return pd.Series(values, index=labels)

# Summary statistics of `target_y` on all rows, then with the rows flagged by
# each detector removed (`~` inverts the boolean outlier mask).
summary_full = summarize(df['target_y'])
summary_no_iqr = summarize(df.loc[~df['outlier_iqr'],'target_y'])
summary_no_z = summarize(df.loc[~df['outlier_z'],'target_y'])
# Side-by-side table: one column per scenario, one row per statistic.
pd.DataFrame({'full':summary_full,'no_iqr':summary_no_iqr,'no_z':summary_no_z})
