Coyomo - ML_Project #2

In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

In [172]:
# Load the dataset and take a first look at its structure.
df = pd.read_csv('usina_with_outliers.csv')
# A bare `df.head()` in the middle of a cell is evaluated but never rendered;
# display() is required for anything that is not the cell's last expression.
display(df.head())

print("Columns:", df.columns.tolist())
print("\nShape:", df.shape)

print("\nSummary statistics:")
display(df.describe(include="all"))

Columns: ['AT', 'V', 'AP', 'RH', 'PE']

Shape: (9568, 5)

Summary statistics:


Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.618518,54.250021,1013.288871,73.308978,454.40782
std,8.256412,13.993655,6.636609,16.094499,18.760047
min,-39.174839,-38.397358,959.607298,-53.091613,327.52803
25%,13.48,41.67,1009.0775,63.2275,439.73
50%,20.32,52.08,1012.95,74.955,451.62
75%,25.7325,66.54,1017.32,84.8825,468.53
max,77.344839,155.117358,1064.772702,187.691613,590.09197


## Q1: Outlier Detection and Removal using Cook's Distance
- Why did you choose this model (Linear vs Ridge vs Lasso) for implementing Cook's Distance outlier detection?
    - I chose to work with linear regression because Cook's Distance is based on linear regression. It measures how much the fitted values change when each observation is removed one at a time.
- Why did you choose this library (Statsmodels vs scikit-learn)?
    - I chose Statsmodels. Since the goal is to computationally determine which data points are outliers and remove them, we need access to statistical summaries and influence diagnostics (leverage, Cook's Distance) to understand how outliers affect the model.

In [173]:
# Helper Functions for OLS Diagnostics and Plotting
def fit_ols_diagnostics(X, y):
    """Fit an OLS model with an intercept and tabulate per-observation diagnostics.

    Parameters
    ----------
    X : array-like, shape (n,) or (n, p)
        Predictors, supplied without an intercept column. One-dimensional
        input is treated as a single feature.
    y : array-like, shape (n,)
        Response values; flattened to 1-D before fitting.

    Returns
    -------
    model
        The fitted statsmodels OLS results object.
    diag : pandas.DataFrame
        One row per observation with the columns ``y``, ``y_hat``,
        ``residual``, ``leverage_hii`` and ``cooks_D``.
    """
    predictors = np.asarray(X)
    response = np.asarray(y).reshape(-1)
    if predictors.ndim == 1:
        # Promote a single feature vector to an (n, 1) column
        predictors = predictors.reshape(-1, 1)

    # statsmodels does not add an intercept on its own
    design = sm.add_constant(predictors)
    model = sm.OLS(response, design).fit()

    influence = OLSInfluence(model)
    columns = {
        "y": response,
        "y_hat": model.fittedvalues,
        "residual": model.resid,
        # diagonal entries of the hat matrix H
        "leverage_hii": influence.hat_matrix_diag,
        # first element of the cooks_distance result holds the distances
        "cooks_D": influence.cooks_distance[0],
    }
    diag = pd.DataFrame(columns)
    return model, diag


def plot_line_fit(x, y, model, title=""):
    """Scatter the observations and overlay the fitted line for 1-D x.

    Parameters
    ----------
    x : array-like, shape (n,)
        Single predictor; flattened to 1-D.
    y : array-like, shape (n,)
        Observed responses; flattened to 1-D so lists and column vectors
        are accepted as well as arrays.
    model
        Fitted model whose ``predict`` accepts the design matrix produced by
        ``sm.add_constant`` on the sorted x values.
    title : str, optional
        Title placed above the axes.
    """
    x = np.asarray(x).reshape(-1)
    y = np.asarray(y).reshape(-1)
    order = np.argsort(x)

    plt.figure(figsize=(7, 4))
    plt.scatter(x, y)

    # Predict on x in ascending order so the line is drawn left to right
    x_sorted = x[order]
    X_sm = sm.add_constant(x_sorted)
    yhat_sorted = model.predict(X_sm)
    plt.plot(x_sorted, yhat_sorted)

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()


def plot_curve_fit(x, y, pipeline, title=""):
    """Draw the data as a scatter and the pipeline's prediction as a curve (1-D x).

    Parameters
    ----------
    x : array-like
        Single predictor; reshaped to an (n, 1) column.
    y : array-like
        Observed responses; flattened to 1-D.
    pipeline
        Fitted estimator exposing ``predict`` on an (m, 1) array.
    title : str, optional
        Title placed above the axes.
    """
    xs = np.asarray(x).reshape(-1, 1)
    ys = np.asarray(y).reshape(-1)

    # 200 evenly spaced points across the observed x range give a smooth curve
    x_grid = np.linspace(xs.min(), xs.max(), 200).reshape(-1, 1)
    y_grid = pipeline.predict(x_grid)

    fig, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(xs, ys)
    ax.plot(x_grid, y_grid)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    plt.show()


In [174]:
# Separate the response column from the predictors, then hold out 30% for testing
TARGET_COL = 'PE'
target = df[TARGET_COL].values
features = df.drop(columns=[TARGET_COL]).copy()

# train_test_split yields (X_train, X_test, y_train, y_test) in that order;
# each piece is converted to a plain NumPy array for the later steps.
X_train, X_test, y_train, y_test = (
    np.asarray(part)
    for part in train_test_split(features, target, test_size=0.3, random_state=42)
)


In [190]:
# Q1: Outlier Detection and Removal using Cook's Distance
# fit_ols_diagnostics adds the intercept column itself, so X_train is passed as-is.
model_full, diag_full = fit_ols_diagnostics(X_train, y_train)

# Flag observations whose Cook's Distance exceeds 4 / n
threshold = 4 / len(X_train)

# The earlier version refit on sm.add_constant(X_train). Because the helper calls
# add_constant again (which by default leaves an existing constant column alone),
# that second fit reproduced model_full exactly. The names are kept as aliases
# so any later cell that refers to them still works.
X_sm = sm.add_constant(X_train)
model_w_constants, diag_w_constants = model_full, diag_full

diag_table = diag_full.copy()
diag_table["is outlier"] = diag_table["cooks_D"] > threshold

diag_table.sort_values(by='cooks_D', ascending=False)

Unnamed: 0,y,y_hat,residual,leverage_hii,cooks_D,is outlier
2397,370.64803,509.615713,-138.967683,0.037486,1.265702e+00,True
5670,364.32803,559.465487,-195.137457,0.018852,1.207862e+00,True
3919,541.43197,415.727283,125.704687,0.041370,1.152221e+00,True
3248,575.06197,408.386275,166.675695,0.024168,1.142049e+00,True
3477,358.96803,503.918492,-144.950462,0.030533,1.105591e+00,True
...,...,...,...,...,...,...
5449,463.70000,463.686856,0.013144,0.000276,7.718906e-11,False
4540,452.27000,452.258677,0.011323,0.000294,6.102539e-11,False
960,452.13000,452.124884,0.005116,0.000539,2.288271e-11,False
6353,463.46000,463.464109,-0.004109,0.000587,1.607772e-11,False


In [200]:
# Drop the flagged rows from the training split and refit on what remains.
idx_remove = diag_table[diag_table["is outlier"]].index.tolist()

# diag_table has one row per training observation in the same order as X_train,
# so a boolean mask selects rows positionally without relying on index labels.
keep_mask = ~diag_table["is outlier"].to_numpy()
X_train_clean = X_train[keep_mask]
y_train_clean = y_train[keep_mask]

model_clean, diag_clean = fit_ols_diagnostics(X_train_clean, y_train_clean)
print(model_clean.summary())

coef_full = np.asarray(model_full.params).reshape(-1)
coef_clean = np.asarray(model_clean.params).reshape(-1)

# Print every coefficient rather than a hard-coded b0..b4, so the comparison
# still works when the number of features changes.
for label, coefs in (("Full-data:   ", coef_full), ("Cleaned-data: ", coef_clean)):
    terms = ", ".join(f"b{i} = {b:.4f}" for i, b in enumerate(coefs))
    print(f"{label}{terms}")

# Persist the cleaned training rows.
# NOTE(review): only the training split is written here; the held-out test rows
# are not part of usina.csv. Confirm this is the intended content of the file.
df_clean = pd.DataFrame(X_train_clean, columns=features.columns)
df_clean[TARGET_COL] = y_train_clean
df_clean.to_csv('usina.csv', index=False)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.932
Model:                            OLS   Adj. R-squared:                  0.932
Method:                 Least Squares   F-statistic:                 2.261e+04
Date:                Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                        17:11:34   Log-Likelihood:                -19229.
No. Observations:                6612   AIC:                         3.847e+04
Df Residuals:                    6607   BIC:                         3.850e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        453.1256     11.444     39.594      0.0

## Q2: Train/Test Evaluation Before vs After Outlier Removal

In [None]:
# Train models with outlier data

# Train models with clean data

### Discussion Questions:

- Do outliers change train error? Test error?
- Which dataset (with outliers vs without outliers) shows better generalization?
- Do Ridge/Lasso appear to help relative to standard linear regression?