# In [None]:  (Jupyter cell prompt left over from the notebook export)
#Problem 1

import numpy as np

class ScratchLogisticRegression():
    """
    Scratch implementation of logistic regression (batch gradient descent)

    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    bias : bool
      True to prepend an intercept column of ones, False if no bias term is included
    verbose : bool
      True to output the learning process (every 100 iterations)

    Attributes
    ----------
    self.coef_ : ndarray, shape (n_features,), or (n_features + 1,) when bias=True
      Parameters; with bias=True index 0 is the intercept. None until fit() has run.
    self.loss : ndarray, shape (self.iter,)
      Training loss, measured before each weight update
    self.val_loss : ndarray, shape (self.iter,)
      Validation loss for the same weights as self.loss (all zeros when no
      validation data is given)
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False):
        # hyperparameters as attributes
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        # arrays to record loss
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
        self.coef_ = None

    def _sigmoid(self, z):
        """Sigmoid function, evaluated so that exp() can never overflow"""
        z = np.asarray(z, dtype=float)
        # exp is only ever applied to -|z| <= 0; the two branches are the
        # algebraically equivalent forms for z >= 0 and z < 0.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _add_bias(self, X):
        """Add bias term (a leading column of ones) to features"""
        return np.c_[np.ones(X.shape[0]), X]

    def _initialize_weights(self, n_features):
        """Initialize weights with small random values"""
        return np.random.randn(n_features) * 0.01

    def _compute_loss(self, y, y_pred):
        """Compute binary cross-entropy loss from labels and predicted probabilities"""
        epsilon = 1e-15  # to avoid log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Learn logistic regression. If validation data is entered, the loss for it
        is also calculated for each iteration.

        Both recorded losses at iteration i describe the weights *before* the
        i-th update, so the training and validation curves are aligned.

        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            Features of training data
        y : The following form of ndarray, shape (n_samples,)
            Correct answer value of training data (0 or 1)
        X_val : The following forms of ndarray, shape (n_samples, n_features)
            Features of verification data
        y_val : The following form of ndarray, shape (n_samples,)
            Correct value of verification data
        """
        X = np.asarray(X)
        # A column-vector y would broadcast (y_pred - y) into an (n, n) matrix.
        y = np.asarray(y).ravel()
        has_val = X_val is not None and y_val is not None
        if has_val:
            X_val = np.asarray(X_val)
            y_val = np.asarray(y_val).ravel()

        if self.bias:
            X = self._add_bias(X)
            if has_val:
                X_val = self._add_bias(X_val)

        n_samples, n_features = X.shape
        self.coef_ = self._initialize_weights(n_features)
        # Fresh buffers: a refit, or a self.iter changed after construction,
        # must not see stale values or an array that is too short.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

        for i in range(self.iter):
            y_pred = self._sigmoid(np.dot(X, self.coef_))

            # Record losses before touching the weights.
            self.loss[i] = self._compute_loss(y, y_pred)
            if has_val:
                val_pred = self._sigmoid(np.dot(X_val, self.coef_))
                self.val_loss[i] = self._compute_loss(y_val, val_pred)

            # Gradient of the mean cross-entropy, then the descent step.
            gradient = np.dot(X.T, y_pred - y) / n_samples
            self.coef_ -= self.lr * gradient

            if self.verbose and i % 100 == 0:
                message = f"Iteration {i}: Training Loss = {self.loss[i]:.4f}"
                if has_val:
                    message += f", Validation Loss = {self.val_loss[i]:.4f}"
                print(message)

    def predict_proba(self, X):
        """
        Estimate the probability using logistic regression.

        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            sample

        Returns
        -------
            The following form of ndarray, shape (n_samples,)
            Estimated probability by logistic regression

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        if self.coef_ is None:
            raise ValueError("Model not trained yet. Call fit() first.")
        X = np.asarray(X)
        if self.bias:
            X = self._add_bias(X)
        return self._sigmoid(np.dot(X, self.coef_))

    def predict(self, X, threshold=0.5):
        """
        Estimate the label using logistic regression.

        Parameters
        ----------
        X : The following forms of ndarray, shape (n_samples, n_features)
            sample
        threshold : float
            Threshold for classification (default 0.5)

        Returns
        -------
            The following form of ndarray, shape (n_samples,)
            Estimated result by logistic regression (1 where probability >= threshold)
        """
        return (self.predict_proba(X) >= threshold).astype(int)
    

    #Problem 2

import numpy as np

class ScratchLogisticRegression():
    """
    Scratch implementation of logistic regression with gradient descent
    and optional L2 regularization

    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    bias : bool
      True to prepend an intercept column of ones, False if no bias term is included
    verbose : bool
      True to output the learning process (every 100 iterations)
    lambda_ : float
      Regularization parameter (default: 0, no regularization)

    Attributes
    ----------
    self.coef_ : ndarray, shape (n_features,), or (n_features + 1,) when bias=True
      Parameters; with bias=True index 0 is the (unregularized) intercept.
      None until fit() has run.
    self.loss : ndarray, shape (self.iter,)
      Regularized training loss, measured before each weight update
    self.val_loss : ndarray, shape (self.iter,)
      Regularized validation loss for the same weights as self.loss
      (all zeros when no validation data is given)
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, lambda_=0):
        # hyperparameters as attributes
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        self.lambda_ = lambda_
        # arrays to record loss
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
        self.coef_ = None

    def _sigmoid(self, z):
        """Sigmoid function, evaluated so that exp() can never overflow"""
        z = np.asarray(z, dtype=float)
        # exp is only ever applied to -|z| <= 0; the two branches are the
        # algebraically equivalent forms for z >= 0 and z < 0.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _add_bias(self, X):
        """Add bias term (a leading column of ones) to features"""
        return np.c_[np.ones(X.shape[0]), X]

    def _initialize_weights(self, n_features):
        """Initialize weights with small random values"""
        return np.random.randn(n_features) * 0.01

    def _compute_loss(self, y, y_pred):
        """
        Compute binary cross-entropy loss with L2 regularization

        The penalty is taken from the current self.coef_, so y_pred must have
        been produced by those same weights for the value to be meaningful.
        """
        epsilon = 1e-15  # to avoid log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

        # Cross-entropy loss
        loss = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

        # L2 regularization (the bias weight at index 0 is never penalized)
        if self.lambda_ > 0:
            penalized = self.coef_[1:] if self.bias else self.coef_
            loss += (self.lambda_ / (2 * len(y))) * np.sum(penalized ** 2)

        return loss

    def _gradient_descent(self, X, y, y_pred):
        """
        Compute the gradient of the L2-regularized loss (the caller applies the step)

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Feature matrix (bias column already added when self.bias is True)
        y : ndarray, shape (n_samples,)
            Target values
        y_pred : ndarray, shape (n_samples,)
            Predicted probabilities

        Returns
        -------
        gradient : ndarray, shape (n_features,)
            Gradient vector
        """
        error = y_pred - y
        gradient = np.dot(X.T, error) / len(y)

        # regularization term
        if self.lambda_ > 0:
            reg_term = (self.lambda_ / len(y)) * self.coef_
            if self.bias:
                reg_term[0] = 0  # Don't regularize bias term
            gradient += reg_term

        return gradient

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Learn logistic regression using gradient descent.
        If validation data is provided, also calculate validation loss at each iteration.

        Losses at iteration i are recorded before the i-th update, so the
        cross-entropy and the L2 penalty always refer to the same weights.
        """
        X = np.asarray(X)
        # A column-vector y would broadcast (y_pred - y) into an (n, n) matrix.
        y = np.asarray(y).ravel()
        has_val = X_val is not None and y_val is not None
        if has_val:
            X_val = np.asarray(X_val)
            y_val = np.asarray(y_val).ravel()

        if self.bias:
            X = self._add_bias(X)
            if has_val:
                X_val = self._add_bias(X_val)

        n_samples, n_features = X.shape
        self.coef_ = self._initialize_weights(n_features)
        # Fresh buffers: a refit, or a self.iter changed after construction,
        # must not see stale values or an array that is too short.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

        for i in range(self.iter):
            # predictions
            y_pred = self._sigmoid(np.dot(X, self.coef_))

            # losses, recorded while self.coef_ still matches y_pred
            self.loss[i] = self._compute_loss(y, y_pred)
            if has_val:
                val_pred = self._sigmoid(np.dot(X_val, self.coef_))
                self.val_loss[i] = self._compute_loss(y_val, val_pred)

            # gradient and weights
            self.coef_ -= self.lr * self._gradient_descent(X, y, y_pred)

            if self.verbose and i % 100 == 0:
                message = f"Iteration {i}: Training Loss = {self.loss[i]:.4f}"
                if has_val:
                    message += f", Validation Loss = {self.val_loss[i]:.4f}"
                print(message)

    def predict_proba(self, X):
        """
        Estimate the probability using logistic regression.

        Raises ValueError if the model has not been fitted yet.
        """
        if self.coef_ is None:
            raise ValueError("Model not trained yet. Call fit() first.")
        X = np.asarray(X)
        if self.bias:
            X = self._add_bias(X)
        return self._sigmoid(np.dot(X, self.coef_))

    def predict(self, X, threshold=0.5):
        """
        Estimate the label using logistic regression (1 where probability >= threshold).
        """
        return (self.predict_proba(X) >= threshold).astype(int)
    

    #Problem 3

import numpy as np

class ScratchLogisticRegression():
    """
    Scratch implementation of logistic regression

    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    bias : bool
      True if bias term is included
    verbose : bool
      True to output the learning process (every 100 iterations)
    threshold : float
      Decision threshold for classification (default: 0.5)

    Attributes
    ----------
    self.coef_ : ndarray, shape (n_features,), or (n_features + 1,) when bias=True
      Learned weights; with bias=True index 0 is the intercept.
      None until fit() has run.
    self.loss : ndarray, shape (self.iter,)
      Training loss at each iteration (measured before that iteration's update)
    self.val_loss : ndarray, shape (self.iter,)
      Validation loss at each iteration (all zeros without validation data)
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, threshold=0.5):
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        self.threshold = threshold
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
        self.coef_ = None

    def _sigmoid(self, z):
        """Sigmoid function, evaluated so that exp() can never overflow"""
        z = np.asarray(z, dtype=float)
        # exp is only ever applied to -|z| <= 0; the two branches are the
        # algebraically equivalent forms for z >= 0 and z < 0.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _add_bias(self, X):
        """Add bias term (column of ones) to features"""
        return np.c_[np.ones(X.shape[0]), X]

    def _compute_loss(self, y, y_pred):
        """Binary cross-entropy between labels and predicted probabilities"""
        epsilon = 1e-15  # to avoid log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Train the model with batch gradient descent, starting from zero weights

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Training features
        y : ndarray, shape (n_samples,)
            Training labels (0 or 1)
        X_val : ndarray, shape (n_val_samples, n_features), optional
            Validation features
        y_val : ndarray, shape (n_val_samples,), optional
            Validation labels
        """
        X = np.asarray(X)
        # A column-vector y would broadcast (h - y) into an (n, n) matrix.
        y = np.asarray(y).ravel()
        has_val = X_val is not None and y_val is not None
        if has_val:
            X_val = np.asarray(X_val)
            y_val = np.asarray(y_val).ravel()

        if self.bias:
            X = self._add_bias(X)
            if has_val:
                X_val = self._add_bias(X_val)

        m, n = X.shape
        self.coef_ = np.zeros(n)
        # Fresh buffers so a refit never sees stale values or a wrong length.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

        for i in range(self.iter):
            h = self._sigmoid(np.dot(X, self.coef_))

            # Both losses describe the weights as they are before this step.
            self.loss[i] = self._compute_loss(y, h)
            if has_val:
                h_val = self._sigmoid(np.dot(X_val, self.coef_))
                self.val_loss[i] = self._compute_loss(y_val, h_val)

            # Gradient of the mean cross-entropy, then the descent step.
            self.coef_ -= self.lr * np.dot(X.T, h - y) / m

            if self.verbose and i % 100 == 0:
                msg = f"Iteration {i}: Training Loss = {self.loss[i]:.4f}"
                if has_val:
                    msg += f", Validation Loss = {self.val_loss[i]:.4f}"
                print(msg)

    def predict_proba(self, X):
        """
        Estimate class probabilities using logistic regression hypothesis function

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input features

        Returns
        -------
        ndarray, shape (n_samples,)
            Estimated probabilities for class 1

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        # Fail fast, before doing any work on X.
        if self.coef_ is None:
            raise ValueError("Model not trained yet. Call fit() first.")

        X = np.asarray(X)
        if self.bias:
            X = self._add_bias(X)

        # Linear score followed by the sigmoid link.
        return self._sigmoid(np.dot(X, self.coef_))

    def predict(self, X):
        """
        Predict class labels using learned logistic regression model

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input features

        Returns
        -------
        ndarray, shape (n_samples,)
            Predicted class labels (1 where probability >= self.threshold, else 0)
        """
        probabilities = self.predict_proba(X)
        return (probabilities >= self.threshold).astype(int)
    

    #Problem 4

import numpy as np

class ScratchLogisticRegression():
    """
    Scratch implementation of logistic regression with L2 regularization

    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    bias : bool
      True if bias term is included
    verbose : bool
      True to output the learning process (every 100 iterations)
    lambda_ : float
      Regularization strength (default: 0, no regularization)
    threshold : float
      Decision threshold for classification (default: 0.5)

    Attributes
    ----------
    self.coef_ : ndarray, shape (n_features,), or (n_features + 1,) when bias=True
      Learned weights; with bias=True index 0 is the (unregularized) intercept.
      None until fit() has run.
    self.loss : ndarray, shape (self.iter,)
      Training loss at each iteration (measured after that iteration's update)
    self.val_loss : ndarray, shape (self.iter,)
      Validation loss at each iteration (all zeros without validation data)
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, lambda_=0, threshold=0.5):
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        self.lambda_ = lambda_
        self.threshold = threshold
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
        self.coef_ = None

    def _sigmoid(self, z):
        """Sigmoid function, evaluated so that exp() can never overflow"""
        z = np.asarray(z, dtype=float)
        # exp is only ever applied to -|z| <= 0; the two branches are the
        # algebraically equivalent forms for z >= 0 and z < 0.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _add_bias(self, X):
        """Add bias term (column of ones) to features"""
        return np.c_[np.ones(X.shape[0]), X]

    def _compute_loss(self, X, y, coef):
        """
        Compute the logistic regression objective function with L2 regularization

        J(θ) = (1/m) * Σ[-yⁱlog(hθ(xⁱ)) - (1-yⁱ)log(1-hθ(xⁱ))] + (λ/2m) * Σθⱼ² (j=1 to n)

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input features (bias column already added when self.bias is True)
        y : ndarray, shape (n_samples,)
            Target labels
        coef : ndarray, shape (n_features,)
            Current model coefficients

        Returns
        -------
        float
            Computed loss value
        """
        m = len(y)
        h = self._sigmoid(np.dot(X, coef))

        epsilon = 1e-15  # to prevent log(0)
        h = np.clip(h, epsilon, 1 - epsilon)
        cross_entropy = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

        # L2 regularization term; the bias weight at index 0 is not penalized
        penalized = coef[1:] if self.bias else coef
        reg_term = (self.lambda_ / (2 * m)) * np.sum(penalized ** 2)

        return cross_entropy + reg_term

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Train logistic regression model with gradient descent

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Training features
        y : ndarray, shape (n_samples,)
            Training labels
        X_val : ndarray, shape (n_val_samples, n_features), optional
            Validation features
        y_val : ndarray, shape (n_val_samples,), optional
            Validation labels
        """
        has_val = X_val is not None and y_val is not None

        if self.bias:
            X = self._add_bias(X)
            if X_val is not None:
                X_val = self._add_bias(X_val)

        m, n = X.shape
        self.coef_ = np.zeros(n)
        # Fresh buffers so a refit never sees stale values or a wrong length.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

        for i in range(self.iter):
            # predictions
            h = self._sigmoid(np.dot(X, self.coef_))

            # gradient of the cross-entropy part
            gradient = np.dot(X.T, h - y) / m

            # regularization term
            if self.lambda_ > 0:
                reg_term = (self.lambda_ / m) * self.coef_
                if self.bias:
                    reg_term[0] = 0  # Don't regularize bias term
                gradient += reg_term

            # coefficients
            self.coef_ -= self.lr * gradient

            # losses of the freshly updated weights
            self.loss[i] = self._compute_loss(X, y, self.coef_)
            if has_val:
                self.val_loss[i] = self._compute_loss(X_val, y_val, self.coef_)

            if self.verbose and i % 100 == 0:
                msg = f"Iteration {i}: Training Loss = {self.loss[i]:.4f}"
                if has_val:
                    msg += f", Validation Loss = {self.val_loss[i]:.4f}"
                print(msg)

    def predict_proba(self, X):
        """
        Estimate the probability of class 1 for each sample

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input features

        Returns
        -------
        ndarray, shape (n_samples,)
            Estimated probabilities for class 1

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        if self.coef_ is None:
            raise ValueError("Model not trained yet. Call fit() first.")
        if self.bias:
            X = self._add_bias(X)
        return self._sigmoid(np.dot(X, self.coef_))

    def predict(self, X):
        """
        Predict class labels (1 where probability >= self.threshold, else 0)

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input features

        Returns
        -------
        ndarray, shape (n_samples,)
            Predicted class labels (0 or 1)
        """
        return (self.predict_proba(X) >= self.threshold).astype(int)


    #Problem 5

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

class ScratchLogisticRegression():
    """
    L2-regularized logistic regression trained by batch gradient descent.

    Weights start at zero. With bias=True a column of ones is prepended to the
    features and the weight at index 0 is left out of the penalty. Losses are
    recorded after every update in self.loss / self.val_loss.
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, lambda_=0, threshold=0.5):
        self.iter, self.lr = num_iter, lr
        self.bias, self.verbose = bias, verbose
        self.lambda_, self.threshold = lambda_, threshold
        self.coef_ = None
        # One slot per iteration for each loss curve.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

    def _sigmoid(self, z):
        """Logistic function applied elementwise."""
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def _add_bias(self, X):
        """Return X with a leading column of ones."""
        intercept = np.ones(X.shape[0])
        return np.column_stack((intercept, X))

    def _compute_loss(self, X, y, coef):
        """Mean cross-entropy of `coef` on (X, y) plus (lambda / 2m) * sum of squared penalized weights."""
        n_rows = len(y)
        tiny = 1e-15
        # Clip so that neither logarithm below can see an exact 0.
        prob = np.clip(self._sigmoid(np.dot(X, coef)), tiny, 1 - tiny)
        log_likelihood = y * np.log(prob) + (1 - y) * np.log(1 - prob)
        penalized = coef[1:] if self.bias else coef
        penalty = (self.lambda_ / (2 * n_rows)) * np.sum(penalized**2)
        return -np.mean(log_likelihood) + penalty

    def fit(self, X, y, X_val=None, y_val=None):
        """Run self.iter gradient-descent steps; track validation loss when both X_val and y_val are given."""
        if self.bias:
            X = self._add_bias(X)
            X_val = X_val if X_val is None else self._add_bias(X_val)
        track_val = not (X_val is None or y_val is None)

        n_rows, n_cols = X.shape
        self.coef_ = np.zeros(n_cols)

        for epoch in range(self.iter):
            residual = self._sigmoid(np.dot(X, self.coef_)) - y
            step = np.dot(X.T, residual) / n_rows
            if self.lambda_ > 0:
                shrink = (self.lambda_ / n_rows) * self.coef_
                if self.bias:
                    shrink[0] = 0  # the intercept is not penalized
                step += shrink
            self.coef_ -= self.lr * step

            # Losses are evaluated on the freshly updated weights.
            self.loss[epoch] = self._compute_loss(X, y, self.coef_)
            if track_val:
                self.val_loss[epoch] = self._compute_loss(X_val, y_val, self.coef_)

            if not self.verbose or epoch % 100 != 0:
                continue
            parts = [f"Iteration {epoch}: Loss = {self.loss[epoch]:.4f}"]
            if track_val:
                parts.append(f", Val Loss = {self.val_loss[epoch]:.4f}")
            print("".join(parts))

    def predict_proba(self, X):
        """Return P(class 1) per row of X; raises ValueError before fit()."""
        features = self._add_bias(X) if self.bias else X
        if self.coef_ is None:
            raise ValueError("Model not trained yet")
        scores = np.dot(features, self.coef_)
        return self._sigmoid(scores)

    def predict(self, X):
        """Return 1 where the predicted probability reaches self.threshold, else 0."""
        is_positive = self.predict_proba(X) >= self.threshold
        return is_positive.astype(int)

# Iris data restricted to versicolor (target 1) and virginica (target 2)
iris = load_iris()
mask = np.isin(iris.target, (1, 2))
X = iris.data[mask]

# Binary labels: 1 for virginica, 0 for versicolor
y = (iris.target[mask] == 2).astype(int)

# Hold out 20% of the samples
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize with statistics estimated on the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Scratch model; the held-out split doubles as validation data for the loss curve
our_lr = ScratchLogisticRegression(num_iter=2000, lr=0.1, lambda_=0.1, verbose=True)
our_lr.fit(X_train, y_train, X_val=X_test, y_val=y_test)

# scikit-learn reference model (C is the inverse of the regularization strength)
sk_lr = LogisticRegression(penalty='l2', C=1/0.1, max_iter=2000, random_state=42)
sk_lr.fit(X_train, y_train)

# Predictions of both models on the held-out split
our_preds, sk_preds = our_lr.predict(X_test), sk_lr.predict(X_test)

# Metric report helper
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision and recall of y_pred against y_true under a model_name heading."""
    print(f"\n{model_name} Performance:")
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_true, y_pred):.4f}")

for preds, title in (
    (our_preds, "Our Scratch Implementation"),
    (sk_preds, "Scikit-learn Implementation"),
):
    print_metrics(y_test, preds, title)

# Coefficients side by side (intercept first for both models)
print("\nCoefficient Comparison:")
print("Our coefficients:", our_lr.coef_)
print("Sklearn coefficients:", np.hstack((sk_lr.intercept_, sk_lr.coef_[0])))



#Problem 6

import matplotlib.pyplot as plt

#

# Learning curves, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(our_lr.loss, label='Training Loss', color='blue')
ax.plot(our_lr.val_loss, label='Validation Loss', color='orange', linestyle='--')
ax.set_title('Learning Curve of Scratch Logistic Regression')
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss')
ax.grid(True)

# y-range: from the overall minimum up to the largest loss of the first 10 iterations
min_loss = min(our_lr.loss.min(), our_lr.val_loss.min())
max_loss = max(our_lr.loss[:10].max(), our_lr.val_loss[:10].max())
ax.set_ylim(min_loss - 0.1, max_loss + 0.1)

# Vertical marker at the iteration with the lowest validation loss
stable_iter = np.argmin(our_lr.val_loss)
ax.axvline(x=stable_iter, color='red', linestyle=':',
           label=f'Stabilization at iter {stable_iter}')
ax.legend()  # built once, after every labelled artist exists

plt.show()

# Final loss values
final_train, final_val = our_lr.loss[-1], our_lr.val_loss[-1]
print(f"\nFinal Training Loss: {final_train:.4f}")
print(f"Final Validation Loss: {final_val:.4f}")
print(f"Minimum Validation Loss at iteration: {stable_iter}")

# scikit-learn's log loss on the held-out split, for comparison
from sklearn.metrics import log_loss
sk_probs = sk_lr.predict_proba(X_test)[:, 1]
sk_loss = log_loss(y_test, sk_probs)
print(f"\nScikit-learn's Log Loss: {sk_loss:.4f}")


#Problem 7

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

class ScratchLogisticRegression():
    """
    Compact logistic regression used for the decision-region plot.

    fit() takes training data only: self.val_loss is allocated but never
    written, and self.loss holds the plain (unpenalized) cross-entropy of the
    predictions made before each update. The gradient still includes the L2
    term when lambda_ > 0.
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, lambda_=0, threshold=0.5):
        self.iter, self.lr = num_iter, lr
        self.bias, self.verbose = bias, verbose
        self.lambda_, self.threshold = lambda_, threshold
        self.coef_ = None
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

    def _sigmoid(self, z):
        """Logistic function applied elementwise."""
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def _add_bias(self, X):
        """Return X with a leading column of ones."""
        intercept = np.ones(X.shape[0])
        return np.column_stack((intercept, X))

    def fit(self, X, y):
        """Run self.iter gradient-descent steps from zero weights on (X, y)."""
        design = self._add_bias(X) if self.bias else X
        n_rows, n_cols = design.shape
        self.coef_ = np.zeros(n_cols)
        tiny = 1e-15  # keeps both logarithms away from log(0)

        for epoch in range(self.iter):
            prob = self._sigmoid(np.dot(design, self.coef_))
            step = np.dot(design.T, prob - y) / n_rows
            if self.lambda_ > 0:
                shrink = (self.lambda_ / n_rows) * self.coef_
                if self.bias:
                    shrink[0] = 0  # the intercept is not penalized
                step += shrink
            self.coef_ -= self.lr * step

            # Cross-entropy of the probabilities computed before this update.
            positive_part = y * np.log(prob + tiny)
            negative_part = (1 - y) * np.log(1 - prob + tiny)
            self.loss[epoch] = -np.mean(positive_part + negative_part)

    def predict_proba(self, X):
        """Return P(class 1) for each row of X."""
        features = self._add_bias(X) if self.bias else X
        scores = np.dot(features, self.coef_)
        return self._sigmoid(scores)

    def predict(self, X):
        """Return 1 where the predicted probability reaches self.threshold, else 0."""
        is_positive = self.predict_proba(X) >= self.threshold
        return is_positive.astype(int)

# Iris data: sepal width (column 1) and petal length (column 2) only
iris = load_iris()
y = iris.target

# Keep versicolor (1) and virginica (2)
mask = np.isin(y, (1, 2))
X = iris.data[:, [1, 2]][mask]

# Binary labels: 1 for virginica, 0 for versicolor
y = (y[mask] == 2).astype(int)

# Standardize both features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train model
model = ScratchLogisticRegression(num_iter=1000, lr=0.1, lambda_=0.1)
model.fit(X, y)

# Mesh grid covering the data with a margin of 1 on every side
x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))

# Class-1 probability at each grid point, shaped back onto the grid
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = model.predict_proba(grid_points).reshape(xx.shape)

# Decision regions (filled) and the 0.5 boundary (red line)
fig, ax = plt.subplots(figsize=(10, 6))
ax.contourf(xx, yy, Z, levels=[0, 0.5, 1], colors=['blue', 'orange'], alpha=0.3)
ax.contour(xx, yy, Z, levels=[0.5], colors='red', linewidths=2)

# Data points, one scatter call per class
for cls, colour, species in ((0, 'blue', 'versicolor'), (1, 'orange', 'virginica')):
    members = X[y == cls]
    ax.scatter(members[:, 0], members[:, 1], color=colour, label=species, edgecolor='k')

# Labels and cosmetics
ax.set_xlabel('Sepal Width (standardized)')
ax.set_ylabel('Petal Length (standardized)')
ax.set_title('Decision Region for Logistic Regression (virginica vs versicolor)')
ax.legend()
ax.grid(True)
plt.show()



#Problem 8

import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler

class ScratchLogisticRegression():
    """
    Logistic Regression implementation from scratch with:
    - L2 regularization
    - Learning curve tracking
    - Decision boundary visualization
    - Weight saving/loading (pickle and npz)

    Parameters
    ----------
    num_iter : int
      Number of iterations
    lr : float
      Learning rate
    bias : bool
      True if bias term is included
    verbose : bool
      True to output the learning process (every 100 iterations)
    lambda_ : float
      Regularization strength (default: 0, no regularization)
    threshold : float
      Decision threshold for classification (default: 0.5)

    Attributes
    ----------
    self.coef_ : ndarray, shape (n_features,), or (n_features + 1,) when bias=True
      Learned weights; with bias=True index 0 is the (unregularized) intercept.
      None until fit() has run or weights have been loaded.
    self.loss : ndarray
      Training loss per iteration of the most recent fit()
    self.val_loss : ndarray
      Validation loss per iteration of the most recent fit()
      (all zeros without validation data)
    """

    def __init__(self, num_iter=1000, lr=0.01, bias=True, verbose=False, lambda_=0, threshold=0.5):
        self.iter = num_iter
        self.lr = lr
        self.bias = bias
        self.verbose = verbose
        self.lambda_ = lambda_
        self.threshold = threshold
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)
        self.coef_ = None

    def _sigmoid(self, z):
        """Sigmoid function, evaluated so that exp() can never overflow"""
        z = np.asarray(z, dtype=float)
        # exp is only ever applied to -|z| <= 0; the two branches are the
        # algebraically equivalent forms for z >= 0 and z < 0.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _add_bias(self, X):
        """Add bias term (a leading column of ones) to features"""
        return np.c_[np.ones(X.shape[0]), X]

    def _compute_loss(self, X, y, coef):
        """Compute mean cross-entropy of `coef` on (X, y) plus the L2 penalty"""
        m = len(y)
        h = self._sigmoid(np.dot(X, coef))
        epsilon = 1e-15  # to prevent log(0)
        h = np.clip(h, epsilon, 1 - epsilon)
        cross_entropy = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

        # The bias weight at index 0 is not penalized.
        penalized = coef[1:] if self.bias else coef
        reg_term = (self.lambda_ / (2 * m)) * np.sum(penalized ** 2)

        return cross_entropy + reg_term

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Train the model with batch gradient descent, starting from zero weights

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Training features
        y : ndarray, shape (n_samples,)
            Training labels (0 or 1)
        X_val : ndarray, shape (n_val_samples, n_features), optional
            Validation features
        y_val : ndarray, shape (n_val_samples,), optional
            Validation labels
        """
        has_val = X_val is not None and y_val is not None

        if self.bias:
            X = self._add_bias(X)
            if X_val is not None:
                X_val = self._add_bias(X_val)

        m, n = X.shape
        self.coef_ = np.zeros(n)
        # self.iter may have been changed since construction (for example by
        # load_weights_*), so size the loss buffers for this run.
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

        for i in range(self.iter):
            # Forward pass
            h = self._sigmoid(np.dot(X, self.coef_))

            # Backward pass
            gradient = np.dot(X.T, h - y) / m

            # Add regularization
            if self.lambda_ > 0:
                reg_term = (self.lambda_ / m) * self.coef_
                if self.bias:
                    reg_term[0] = 0  # Don't regularize bias term
                gradient += reg_term

            # weights
            self.coef_ -= self.lr * gradient

            # losses of the freshly updated weights
            self.loss[i] = self._compute_loss(X, y, self.coef_)
            if has_val:
                self.val_loss[i] = self._compute_loss(X_val, y_val, self.coef_)

            if self.verbose and i % 100 == 0:
                msg = f"Iteration {i}: Loss = {self.loss[i]:.4f}"
                if has_val:
                    msg += f", Val Loss = {self.val_loss[i]:.4f}"
                print(msg)

    def predict_proba(self, X):
        """Predict probabilities of class 1; raises ValueError when no weights are available"""
        if self.coef_ is None:
            raise ValueError("Model not trained yet")
        if self.bias:
            X = self._add_bias(X)
        return self._sigmoid(np.dot(X, self.coef_))

    def predict(self, X):
        """Predict class labels (1 where probability >= self.threshold, else 0)"""
        return (self.predict_proba(X) >= self.threshold).astype(int)

    def plot_decision_boundary(self, X, y):
        """
        Plot decision boundary for 2D feature space
        Parameters:
        X : ndarray, shape (n_samples, 2)
            Features (should be exactly 2 dimensions)
        y : ndarray, shape (n_samples,)
            Target labels (0 or 1)
        Raises:
        ValueError if X does not have exactly two feature columns
        """
        if X.shape[1] != 2:
            raise ValueError("Decision boundary plotting only works for 2D features")

        # mesh grid covering the data with a margin of 1 on every side
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                             np.linspace(y_min, y_max, 200))

        # Predict probabilities on the grid
        Z = self.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        # filled regions, the 0.5 contour, then the samples of each class
        plt.figure(figsize=(10, 6))
        plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], colors=['blue', 'orange'], alpha=0.3)
        plt.contour(xx, yy, Z, levels=[0.5], colors='red', linewidths=2)
        plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color='blue', label='Class 0', edgecolor='k')
        plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color='orange', label='Class 1', edgecolor='k')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.title('Decision Boundary')
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_weights_pickle(self, filename):
        """Save weights and parameters using pickle"""
        with open(filename, 'wb') as f:
            pickle.dump({
                'coef': self.coef_,
                'params': {
                    'num_iter': self.iter,
                    'lr': self.lr,
                    'bias': self.bias,
                    'lambda_': self.lambda_,
                    'threshold': self.threshold
                }
            }, f)

    def load_weights_pickle(self, filename):
        """Load weights and parameters using pickle"""
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # files written by save_weights_pickle from a trusted source.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        self.coef_ = data['coef']
        params = data['params']
        self.iter = params['num_iter']
        self.lr = params['lr']
        self.bias = params['bias']
        self.lambda_ = params['lambda_']
        self.threshold = params['threshold']

    def save_weights_npz(self, filename):
        """
        Save weights and parameters using numpy's savez

        numpy appends ".npz" to a path that does not already end with it;
        load_weights_npz accepts the same `filename` either way.
        """
        np.savez(
            filename,
            coef=self.coef_,
            num_iter=self.iter,
            lr=self.lr,
            bias=self.bias,
            lambda_=self.lambda_,
            threshold=self.threshold
        )

    def load_weights_npz(self, filename):
        """
        Load weights and parameters using numpy's load

        `filename` may be given exactly as it was passed to save_weights_npz,
        with or without the ".npz" suffix.
        """
        import os

        path = filename
        if isinstance(path, os.PathLike):
            path = os.fspath(path)
        # np.savez silently appended ".npz" when it was missing; mirror that,
        # but leave a path that already exists untouched.
        if isinstance(path, str) and not path.endswith('.npz') and not os.path.exists(path):
            path += '.npz'

        # allow_pickle=False: numeric weights never need unpickling, so a
        # tampered file cannot run code. The context manager closes the archive.
        with np.load(path, allow_pickle=False) as data:
            try:
                coef = data['coef']
            except ValueError:
                # An unfitted model was saved: coef_=None is stored as an
                # object array, which cannot be read without pickle.
                coef = None
            self.iter = int(data['num_iter'])
            self.lr = float(data['lr'])
            self.bias = bool(data['bias'])
            self.lambda_ = float(data['lambda_'])
            self.threshold = float(data['threshold'])
        self.coef_ = coef