In [9]:
import numpy as np

# Linear Regression Model
* __init__ is initializing the class and setting the parameters [m,b] b  - bias term
* _prepare_featues is modifying the input array for the 2D multiplication. Adding the column of 1s in front of the column of Xs for the multiplication with self.theta [m,b]
* hypothesis is returning the our predicted values.
* initialize parameters is setting the self.theta i.e. our parameters [m,b]
* in generate_synthetic_data we are creating our own data with some noise in it.

__for now both parameters [m,b] are set to [0,0]__

In [10]:
class LinearRegression:
    def __init__(self):
        # theta will store our parameters: [theta_0 (bias), theta_1 (weight for x1)] [m,b]
        self.theta = None

    def _prepare_features(self, X):
        """
        Prepares the feature matrix X for calculations.
        Adds a column of ones for the bias term (theta_0).
        For a single feature X (1D array), it converts it to a 2D array:
        [[1, x1],
         [1, x2],
         ...]

        for many rows of data we have to make the X array into 2d matrix for multiplying it with [m,b]
        """
        if X.ndim == 1: # If X is a 1D array (e.g., np.array([1, 2, 3]))
            X = X.reshape(-1, 1) # Reshape to a column vector: [[1], [2], [3]]

        # Add a column of ones to the left of X for the bias term (theta_0)
        # np.c_ concatenates arrays column-wise
        # This transforms X into a design matrix where the first column is all ones.
        '''this thing makes the X_b matrix as :
        [[1,x1],
        [1,x2],
        ....]'''
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        return X_b

    def hypothesis(self, X):
        """
        Calculates the predicted output y_hat using the current theta values.
        h_theta(x) = theta_0 * 1 + theta_1 * x_1

        Args:
            X (np.array): Feature array (can be 1D for single feature).

        Returns:
            np.array: Predicted y values.
        """
        if self.theta is None:
            raise ValueError("Model parameters (theta) not initialized. Call initialize_parameters() first.")

        # Prepare X by adding a bias term (column of ones)
        X_prepared = self._prepare_features(X)

        # Perform dot product: (Number of samples x Number of features) . (Number of features x 1)
        # This gives (Number of samples x 1) -> predicted y for each sample
        # Remember: dot product for matrix A (m,n) and vector B (n,) is A.dot(B) which gives (m,)
        return X_prepared.dot(self.theta)

    def initialize_parameters(self, num_features):
        """
        num_features tells how many variables is there in your eqn
        Initializes theta with zeros.
        num_features is the number of 'x' variables (excluding the bias term).
        So, theta will have size (num_features + 1) because it includes theta_0.
        +1 is adding the y-intersect to the self.theta
        """
        # theta will be [theta_0, theta_1] for single variable LR
        self.theta = np.zeros(num_features + 1)

In [19]:
def generate_synthetic_data(num_samples=100, bias=2.0, weight=3.0, noise_std=1.0):
    """
    Generates synthetic linear data: y = bias + weight * x + noise.

    Args:
        num_samples (int): Number of data points to generate.
        bias (float): The true intercept (theta_0).
        weight (float): The true slope (theta_1).
        noise_std (float): Standard deviation of the random noise.

    Returns:
        tuple: (X, y) where X is a NumPy array of features and y is a NumPy array of target values.
    """
    # Generate random x values between 0 and 10
    X = np.random.rand(num_samples) * 10

    # Generate noise from a normal distribution with specified standard deviation
    noise = np.random.randn(num_samples) * noise_std

    # Generate y values based on the linear equation and add noise
    y = bias + weight * X + noise

    return X, y

def MSE(predicted, target):
    total_squared_error = 0
    for i in range(len(predicted)):
        error = (predicted[i] - target[i])**2
        total_squared_error += error
    MSE = total_squared_error/len(predicted)
    return MSE

In [20]:
# --- Generate some synthetic data ---
features, targets = generate_synthetic_data(num_samples=50, bias=1.5, weight=2.5, noise_std=0.5)

print("--- Generated Data ---")
print("First 5 features (X):", features[:5])
print("First 5 targets (y):", targets[:5])
print("Shape of X:", features.shape) # Should be (50,)
print("Shape of y:", targets.shape) # Should be (50,)
print("-" * 20)

# --- Instantiate the Linear Regression model ---
model = LinearRegression()

# --- Initialize parameters (theta) ---
# For single variable linear regression, we have 1 feature (the 'x' variable).
# So, we need 1 parameter for 'x' (theta_1) + 1 parameter for the bias term (theta_0).
# Total parameters = 1 + 1 = 2.
model.initialize_parameters(num_features=1)

print("\n--- Model Initialization ---")
print("Initial theta (model parameters):", model.theta) # Should be [0. 0.]
print("-" * 20)

# --- Get predictions with initial (zero) theta ---
print("\n--- Initial Predictions ---")
initial_predictions = model.hypothesis(features)
print("First 5 initial predictions (should be close to 0):", initial_predictions[:5])
print("Shape of predictions:", initial_predictions.shape) # Should be (50,)
print("-" * 20)

# You'll notice the initial predictions are all close to zero because theta is initialized to zeros.
# This is expected before the model has been trained!
print(MSE(targets, initial_predictions))

--- Generated Data ---
First 5 features (X): [7.72528568 0.64461405 0.58932068 4.08915626 2.14461122]
First 5 targets (y): [20.9564502   2.90734252  3.22758642 11.24924477  7.27746434]
Shape of X: (50,)
Shape of y: (50,)
--------------------

--- Model Initialization ---
Initial theta (model parameters): [0. 0.]
--------------------

--- Initial Predictions ---
First 5 initial predictions (should be close to 0): [0. 0. 0. 0. 0.]
Shape of predictions: (50,)
--------------------
236.4174012936884
