In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# ----------------------
# Gradient Descent Linear Regression
# ----------------------
class LinearRegressionGD:
    """Linear regression fitted by batch gradient descent on the 1/2 MSE loss.

    The model is ``y_hat = X @ theta``; no intercept is added, so callers that
    want one must include a column of ones in ``X``.

    Parameters
    ----------
    lr : float
        Step size applied to the gradient at every iteration.
    n_iter : int
        Maximum number of gradient steps.
    tol : float
        Training stops as soon as the absolute change in loss between two
        consecutive iterations drops below this value.
    """

    def __init__(self, lr=0.01, n_iter=1000, tol=1e-6):
        self.lr = lr
        self.n_iter = n_iter
        self.tol = tol
        # Coefficient vector; None until fit() has been called.
        self.theta = None
        # True when the last fit() stopped because of the tolerance test.
        self.converged = False
        # True when the last fit() was aborted because the loss or the
        # coefficients became inf/NaN (typically: learning rate too large).
        self.diverged = False
        # Iteration count at convergence, or n_iter when not converged.
        self.iters_to_converge = None

    def fit(self, X, y):
        """Estimate ``theta`` from the training data.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Design matrix.
        y : array-like of length m
            Target values; flattened to one dimension before use.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, has no rows, or its row count
            differs from the length of ``y``.

        Notes
        -----
        When the loss or the next coefficient vector is no longer finite the
        loop stops immediately, ``diverged`` is set and ``theta`` keeps its
        last finite value instead of being filled with NaN.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-shaped y cannot broadcast into an (m, m) error.
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (m, n)")
        m, n = X.shape
        if m == 0:
            raise ValueError("cannot fit on an empty dataset (0 rows)")
        if y.shape[0] != m:
            raise ValueError(
                f"X has {m} rows but y has {y.shape[0]} values"
            )

        # Reset all fit state so a second fit() never reports stale results.
        self.theta = np.zeros(n)
        self.converged = False
        self.diverged = False
        self.iters_to_converge = None
        prev_loss = float('inf')

        # Overflow / invalid-value warnings are silenced because divergence
        # is detected explicitly below.
        with np.errstate(over="ignore", invalid="ignore"):
            for i in range(self.n_iter):
                error = X.dot(self.theta) - y
                loss = (1/(2*m)) * np.sum(error**2)  # 1/2 MSE
                gradient = (1/m) * X.T.dot(error)
                candidate = self.theta - self.lr * gradient

                # A non-finite loss makes the tolerance test below always
                # False, so without this check the loop would run to n_iter.
                if not (np.isfinite(loss) and np.all(np.isfinite(candidate))):
                    self.diverged = True
                    break

                self.theta = candidate
                if abs(prev_loss - loss) < self.tol:
                    self.converged = True
                    self.iters_to_converge = i+1
                    break
                prev_loss = loss

        if not self.converged:
            self.iters_to_converge = self.n_iter

    def predict(self, X):
        """Return ``X.dot(theta)`` for the coefficients learned by ``fit``."""
        return X.dot(self.theta)


# ----------------------
# Load and Clean Data
# ----------------------
file_path = "california_housing.csv"  # replace with your dataset path

# Read the CSV and discard every row that contains a missing value
df = pd.read_csv(file_path).dropna()

# Remove the non-numeric / unwanted columns that are actually present
drop_cols = ["ocean_proximity", "latitude", "longitude"]
df = df.drop(columns=[name for name in drop_cols if name in df.columns])

# Target vector and raw feature frame
y = df["median_house_value"].to_numpy()
X = df.drop(columns="median_house_value")

# ----------------------
# Assignment 3: Non-linear Experiments
# ----------------------
# Standardize the features and put them back in a DataFrame that keeps the
# original column names (the result has a fresh 0..n-1 index).
# NOTE(review): the experiments derive log/ratio features from these already
# standardized columns - confirm that is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

def run_experiment(features_fn, run_id):
    """Fit gradient-descent regressions on an engineered feature set.

    The module-level feature frame ``X`` is copied and passed to
    ``features_fn``; rows where any resulting feature is NaN or infinite are
    removed together with the matching entries of the module-level target
    ``y``.  A column of ones is prepended as the intercept and one
    ``LinearRegressionGD`` model is trained per learning rate.

    Parameters
    ----------
    features_fn : callable
        Receives a copy of ``X`` and returns a DataFrame with the same rows.
    run_id : str
        Label stored in the ``"Run"`` column of every result row.

    Returns
    -------
    pandas.DataFrame
        One row per learning rate with the columns ``Run``, ``Learning Rate``,
        ``1/2 MSE``, ``R2``, ``Converged`` and ``Iterations``.  When a run
        produces non-finite predictions, ``1/2 MSE`` and ``R2`` are NaN.

    Raises
    ------
    ValueError
        If no row survives the NaN / infinity filter.
    """
    X_new = features_fn(X.copy())
    # Treat +/-inf like missing values so one mask removes both
    X_new = X_new.replace([np.inf, -np.inf], np.nan)
    keep = (~X_new.isna().any(axis=1)).to_numpy()
    X_new = X_new.loc[keep].reset_index(drop=True)
    # Positional masking keeps y aligned with X_new regardless of index labels
    y_new = np.asarray(y)[keep]

    if len(X_new) == 0:
        raise ValueError(
            f"{run_id}: every row contains NaN or infinite feature values"
        )

    X_mat = np.c_[np.ones(len(X_new)), X_new.values]

    results = []
    for lr in [0.001, 0.01, 0.1]:
        model = LinearRegressionGD(lr=lr, n_iter=5000)
        model.fit(X_mat, y_new)
        y_pred = np.asarray(model.predict(X_mat), dtype=float)

        if np.all(np.isfinite(y_pred)):
            mse_half = (1/(2*len(y_new))) * np.sum((y_pred - y_new)**2)
            r2 = r2_score(y_new, y_pred)
        else:
            # Gradient descent diverged: filtering these predictions out
            # would leave nothing to score (division by zero), so the
            # metrics are reported as NaN instead.
            mse_half = float("nan")
            r2 = float("nan")

        results.append({
            "Run": run_id,
            "Learning Rate": lr,
            "1/2 MSE": round(mse_half, 4),
            "R2": round(r2, 4),
            "Converged": model.converged,
            "Iterations": model.iters_to_converge
        })
    return pd.DataFrame(results)

def _exp1_features(frame):
    """Add squared income, log1p population and rooms per household."""
    return frame.assign(
        MedianIncome2=frame["median_income"] ** 2,
        LogPopulation=np.log1p(frame["population"]),
        RoomsPerHousehold=frame["total_rooms"] / (frame["households"] + 1e-6),
    )


def _exp2_features(frame):
    """Add income x rooms, bedrooms ratio and population per household."""
    return frame.assign(
        IncomeXRooms=frame["median_income"] * frame["total_rooms"],
        BedroomsRatio=frame["total_bedrooms"] / (frame["total_rooms"] + 1e-6),
        PopPerHousehold=frame["population"] / (frame["households"] + 1e-6),
    )


def _exp3_features(frame):
    """Add squared housing age, log1p rooms and squared income."""
    return frame.assign(
        Age2=frame["housing_median_age"] ** 2,
        LogRooms=np.log1p(frame["total_rooms"]),
        Income2=frame["median_income"] ** 2,
    )


# Run the three feature-engineering experiments
exp1 = run_experiment(_exp1_features, run_id="Assignment3_Exp1")
exp2 = run_experiment(_exp2_features, run_id="Assignment3_Exp2")
exp3 = run_experiment(_exp3_features, run_id="Assignment3_Exp3")

# Stack the per-experiment tables, show them and export to Excel
assignment3_results = pd.concat([exp1, exp2, exp3], ignore_index=True)
print("Assignment 3 Results:\n", assignment3_results)
assignment3_results.to_excel("assignment3_results.xlsx", index=False)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  loss = (1/(2*m)) * np.sum(error**2)  # 1/2 MSE
  if abs(prev_loss - loss) < self.tol:
  self.theta -= self.lr * gradient


ZeroDivisionError: division by zero