In [2]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# -----------------------------
# CONFIG PATHS
# -----------------------------
# Raw inputs are read from one folder; generated artifacts go to another.
DATA_DIR_RAW = "./data/raw"
DATA_DIR_PROCESSED = "./data/processed"

# Create both folders up front. exist_ok=True turns this into a no-op when the
# script is re-run and the folders are already there.
for _data_dir in (DATA_DIR_RAW, DATA_DIR_PROCESSED):
    os.makedirs(_data_dir, exist_ok=True)

# Expected location of the input CSV inside the raw-data folder.
raw_file = os.path.join(DATA_DIR_RAW, "outliers_homework.csv")

# -----------------------------
# LOAD DATA (fallback to synthetic)
# -----------------------------
# Use the CSV on disk when it is present. Otherwise fabricate a reproducible
# sample: 100 normally distributed points per column with two extreme values
# appended to each, so the outlier detectors have something to find.
if not os.path.exists(raw_file):
    np.random.seed(42)  # fixed seed -> identical synthetic data on every run
    # x is drawn before y: both calls consume NumPy's global random stream, so
    # the order of the draws determines the generated values.
    synthetic_x = np.concatenate([np.random.normal(50, 10, 100), [200, 220]])
    synthetic_y = np.concatenate([np.random.normal(100, 20, 100), [500, 550]])
    df = pd.DataFrame({"x": synthetic_x, "y": synthetic_y})
else:
    df = pd.read_csv(raw_file)

# Quick look at the first rows of whichever dataset was loaded.
print("===== RAW DATA HEAD =====")
print(df.head())

# -----------------------------
# OUTLIER FUNCTIONS
# -----------------------------
def detect_outliers_iqr(series):
    """Flag the values of ``series`` that fall outside the 1.5 * IQR fences.

    Args:
        series: Values to screen. Must provide ``quantile`` and elementwise
            comparison, as a pandas Series does.

    Returns:
        Boolean mask that is True where a value is strictly below
        ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``. For a pandas
        Series input the mask is a Series on the same index.
    """
    first_quartile, third_quartile = map(series.quantile, (0.25, 0.75))
    spread = third_quartile - first_quartile
    # Values exactly on a fence are not flagged (strict comparisons).
    below_fence = series < first_quartile - 1.5 * spread
    above_fence = series > third_quartile + 1.5 * spread
    return below_fence | above_fence

def detect_outliers_zscore(series, threshold=3.0):
    """Flag values whose absolute z-score is greater than ``threshold``.

    Z-scores come from ``scipy.stats.zscore`` with ``nan_policy="omit"``, so
    missing values are left out of the statistics used for standardising.

    Args:
        series: Numeric values to screen.
        threshold: Cut-off on the absolute z-score; only values strictly
            beyond it are flagged. Defaults to 3.0.

    Returns:
        Boolean mask with one entry per input value.
    """
    scores = stats.zscore(series, nan_policy="omit")
    distance_from_mean = np.abs(scores)
    # A NaN score compares False here, so missing values are never flagged.
    return distance_from_mean > threshold

def winsorize_series(series, lower=0.05, upper=0.95):
    """Clip ``series`` to the range spanned by two of its own quantiles.

    Args:
        series: Values to clip. Must provide ``quantile`` and ``clip``, as a
            pandas Series does.
        lower: Quantile level used as the floor. Defaults to 0.05.
        upper: Quantile level used as the ceiling. Defaults to 0.95.

    Returns:
        The result of ``series.clip``: values below the ``lower`` quantile are
        raised to it, values above the ``upper`` quantile are lowered to it,
        and everything in between is unchanged.
    """
    floor, ceiling = (series.quantile(level) for level in (lower, upper))
    return series.clip(lower=floor, upper=ceiling)

# -----------------------------
# APPLY TO NUMERIC COLUMNS
# -----------------------------
# Column that is screened for outliers.
numeric_col = "x"
screened_values = df[numeric_col]

# Store both detector flags and a clipped copy of the column alongside the raw
# data so the later steps can pick whichever treatment they need.
df["outlier_iqr"] = detect_outliers_iqr(screened_values)
df["outlier_zscore"] = detect_outliers_zscore(screened_values)
df["x_winsor"] = winsorize_series(screened_values)

# Summing a boolean flag column counts the rows that were flagged.
print("\nOutlier counts:")
print("IQR method:", df["outlier_iqr"].sum())
print("Z-score method:", df["outlier_zscore"].sum())

# -----------------------------
# MODELING: With and Without Outliers
# -----------------------------
def _fit_and_score(features, target):
    """Fit a LinearRegression and score it on the same rows it was fit on.

    Returns a ``(model, predictions, mae, r2)`` tuple. Both metrics are
    in-sample: they are computed on the training rows themselves.
    """
    model = LinearRegression().fit(features, target)
    predictions = model.predict(features)
    return (
        model,
        predictions,
        mean_absolute_error(target, predictions),
        r2_score(target, predictions),
    )


X = df[["x"]]
y = df["y"]

# Baseline: every row, outliers included.
model_full, y_pred_full, mae_full, r2_full = _fit_and_score(X, y)

# Variant 1: drop the rows flagged by the IQR rule, then refit.
df_no_outliers = df[~df["outlier_iqr"]]
X_no = df_no_outliers[["x"]]
y_no = df_no_outliers["y"]
model_no, y_pred_no, mae_no, r2_no = _fit_and_score(X_no, y_no)

# Variant 2: keep every row but use the winsorized predictor; y is untouched.
X_win = df[["x_winsor"]]
model_win, y_pred_win, mae_win, r2_win = _fit_and_score(X_win, y)

print("\n=== Model Comparison ===")
print(f"Full data: MAE={mae_full:.2f}, R2={r2_full:.2f}")
print(f"No outliers: MAE={mae_no:.2f}, R2={r2_no:.2f}")
print(f"Winsorized: MAE={mae_win:.2f}, R2={r2_win:.2f}")

# -----------------------------
# PLOTS
# -----------------------------
# One row of three panels so the three fits can be compared side by side.
# The explicit Figure/Axes objects avoid relying on pyplot's "current axes"
# state and let us close exactly this figure at the end.
fig, (ax_raw, ax_trimmed, ax_winsor) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: all rows with the full-data fit.
# Inliers and IQR-flagged rows are drawn as two separate series so that each
# gets its own legend entry. A single scatter labelled "Outliers" would attach
# that label to every point in the panel.
iqr_flag = df["outlier_iqr"].astype(bool)
ax_raw.scatter(
    df.loc[~iqr_flag, "x"], df.loc[~iqr_flag, "y"],
    color="tab:blue", label="Inliers",
)
ax_raw.scatter(
    df.loc[iqr_flag, "x"], df.loc[iqr_flag, "y"],
    color="tab:red", label="Outliers (IQR)",
)
ax_raw.plot(df["x"], y_pred_full, color="green", label="Fit")
ax_raw.set_title("With Outliers (IQR flagged)")
ax_raw.set_xlabel("x")
ax_raw.set_ylabel("y")
ax_raw.legend()

# Panel 2: rows remaining after the IQR-flagged ones were dropped, with the
# model refit on those rows.
ax_trimmed.scatter(X_no["x"], y_no, color="blue", alpha=0.6)
ax_trimmed.plot(X_no["x"], y_pred_no, color="red", label="Fit (no outliers)")
ax_trimmed.set_title("Outliers Removed (IQR)")
ax_trimmed.set_xlabel("x")
ax_trimmed.set_ylabel("y")
ax_trimmed.legend()

# Panel 3: every row, plotted against the winsorized predictor.
ax_winsor.scatter(df["x_winsor"], y, color="purple", alpha=0.6)
ax_winsor.plot(df["x_winsor"], y_pred_win, color="orange", label="Fit (winsorized)")
ax_winsor.set_title("Winsorized X")
ax_winsor.set_xlabel("x")
ax_winsor.set_ylabel("y")
ax_winsor.legend()

fig.tight_layout()
plot_path = os.path.join(DATA_DIR_PROCESSED, "outlier_models.png")
fig.savefig(plot_path)
plt.close(fig)  # release the figure once it has been written to disk

print(f"\n✅ Plots saved to {plot_path}")


===== RAW DATA HEAD =====
           x          y
0  54.967142  71.692585
1  48.617357  91.587094
2  56.476885  93.145710
3  65.230299  83.954455
4  47.658466  96.774286

Outlier counts:
IQR method: 3
Z-score method: 2

=== Model Comparison ===
Full data: MAE=24.34, R2=0.76
No outliers: MAE=15.12, R2=0.02
Winsorized: MAE=27.91, R2=0.04

✅ Plots saved to ./data/processed\outlier_models.png
