In this notebook, you will:

1. Implement a Linear Regression model from scratch using the closed-form solution (Normal Equation).
2. Solve the same problem using Scikit-learn's Linear Regression module.
3. Extend this to a Multiple Linear Regression scenario.
4. Apply linear and logistic regression to real-world datasets.

Let's get started!

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

We will first generate a synthetic dataset for a simple linear regression problem.

In [None]:
# Fix the global NumPy seed so every run draws the same sample
np.random.seed(42)

# One feature drawn uniformly from [0, 10); the target follows y = 3x plus
# Gaussian noise with standard deviation 2
X_simple = 10 * np.random.rand(100, 1)
y_simple = 3 * X_simple + 2 * np.random.randn(100, 1)

# Tabular view of the same sample, one row per observation
df_simple = pd.DataFrame(np.hstack([X_simple, y_simple]), columns=['X', 'y'])
df_simple.head()


In [None]:
# Scatter plot of the raw synthetic sample
fig, ax = plt.subplots()
ax.scatter(X_simple, y_simple, color='blue', label='Data')
ax.set_title("Simple Linear Regression Data")
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.legend()
plt.show()

## Implement Closed-Form Solution

In [None]:
# Add bias (intercept term): a leading column of ones makes w[0] the intercept
X_simple_bias = np.c_[np.ones((X_simple.shape[0], 1)), X_simple]

# Normal equation: (X^T X) w = X^T y.
# Solving the linear system is numerically safer than forming the explicit
# inverse (X^T X)^{-1} and gives the same weights.
w_simple = np.linalg.solve(X_simple_bias.T @ X_simple_bias, X_simple_bias.T @ y_simple)

# Predicted values on the training inputs; shape (n_samples, 1) like y_simple
y_pred_simple_closed = X_simple_bias @ w_simple


# Print weights (w_simple has shape (2, 1): row 0 = intercept, row 1 = slope)
print(f"Weights (Closed-Form Solution):\nIntercept: {w_simple[0][0]}, Slope: {w_simple[1][0]}")

In [None]:
def mean_squared_error(y_true, y_pred):
  """Return the mean of the squared differences between targets and predictions.

  Args:
    y_true: array-like of observed values.
    y_pred: array-like of predicted values with the same number of elements.

  Returns:
    The mean squared error as a Python float.

  Raises:
    ValueError: if the two inputs do not contain the same number of elements.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  if y_true.size != y_pred.size:
    raise ValueError(
        f"y_true and y_pred must have the same number of elements, "
        f"got {y_true.size} and {y_pred.size}"
    )
  if y_true.shape != y_pred.shape:
    # e.g. (n, 1) vs (n,): subtracting directly would broadcast to (n, n) and
    # silently produce a wrong error, so compare element by element instead.
    y_true, y_pred = y_true.ravel(), y_pred.ravel()
  return float(np.mean((y_true - y_pred) ** 2))

# Training error of the closed-form fit
mse_simple_closed = mean_squared_error(y_true=y_simple, y_pred=y_pred_simple_closed)
print(f"Mean Squared Error (Closed-Form): {mse_simple_closed}")


In [None]:
# Overlay the closed-form fit (red line) on the observed points (blue)
fig, ax = plt.subplots()
ax.scatter(X_simple, y_simple, color='blue', label='Actual')
ax.plot(X_simple, y_pred_simple_closed, color='red', label='Prediction (Closed-Form)')
ax.set_title("Simple Linear Regression - Closed Form")
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.legend()
plt.show()

## Solving with scikit-learn

In [None]:
# Scikit-learn solution for simple linear regression: ordinary least squares
# with a fitted intercept (the default), on the same data as the closed form
model_simple = LinearRegression()
model_simple.fit(X_simple, y_simple)

# Predictions on the training inputs
y_pred_simple_sklearn = model_simple.predict(X_simple)
# y_simple is 2-D with one column, so intercept_ has shape (1,) and coef_ has
# shape (1, 1); pull out the scalars for printing
intercept = model_simple.intercept_[0]
coeff = model_simple.coef_[0][0]

# Print weights
print(f"Weights (Scikit-learn):\nIntercept: {intercept}, Slope: {coeff}")

# Evaluate
mse_simple_sklearn = mean_squared_error(y_simple, y_pred_simple_sklearn)
print(f"Mean Squared Error (Scikit-learn): {mse_simple_sklearn}")

## Multiple regression problem

Let's try this on a multiple linear regression problem now.

In [None]:
# Two features drawn uniformly from [0, 10)
X_multi = 10 * np.random.rand(100, 2)

# Target: y = 3*x1 + 2*x2 plus Gaussian noise with standard deviation 2.
# Slices keep the columns 2-D so y_multi has shape (100, 1).
signal = 3 * X_multi[:, 0:1] + 2 * X_multi[:, 1:2]
y_multi = signal + 2 * np.random.randn(100, 1)

# Tabular view: one row per sample, features first, target last
df_multi = pd.DataFrame(np.hstack([X_multi, y_multi]), columns=['X1', 'X2', 'y'])

df_multi.head()

Implement closed-form solution

In [None]:
# Add bias (intercept term): a leading column of ones makes w[0] the intercept
X_multi_bias = np.c_[np.ones((X_multi.shape[0], 1)), X_multi]

# Normal equation: (X^T X) w = X^T y, solved as a linear system rather than
# through an explicit matrix inverse
w_multi = np.linalg.solve(X_multi_bias.T @ X_multi_bias, X_multi_bias.T @ y_multi)

# Predicted values on the training inputs; shape (n_samples, 1) like y_multi
y_pred_multi_closed = X_multi_bias @ w_multi

# Print weights in the order [intercept, coefficient for X1, coefficient for X2]
print(f"Weights (Closed-Form Solution): {w_multi.flatten()}")

In [None]:
# Training error of the closed-form multiple-regression fit
mse_multi_closed = mean_squared_error(y_true=y_multi, y_pred=y_pred_multi_closed)
print(f"Mean Squared Error (Closed-Form): {mse_multi_closed}")

In [None]:
# Fit scikit-learn's ordinary least squares model on both features
model_multi = LinearRegression()
model_multi.fit(X_multi, y_multi)

# Predictions on the training inputs
y_pred_multi_sklearn = model_multi.predict(X_multi)
# y_multi is 2-D with one column, so intercept_ has shape (1,) and coef_ has
# shape (1, 2); reduce them to a scalar and a flat vector for printing
intercept = model_multi.intercept_[0]
coeff = model_multi.coef_[0]

# Print weights
print(f"Weights (Scikit-learn):\nIntercept: {intercept}, Coefficients: {coeff}")

# Evaluate
mse_multi_sklearn = mean_squared_error(y_multi, y_pred_multi_sklearn)
print(f"Mean Squared Error (Scikit-learn): {mse_multi_sklearn}")

In [None]:
# Observed vs. predicted targets; the dashed red diagonal marks perfect predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_multi, y_pred_multi_sklearn, alpha=0.6, edgecolors='k')
diagonal = [y_multi.min(), y_multi.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set_xlabel("Observed Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Observed vs. Predicted Values")
ax.grid(True)
plt.show()


## Now let's work with a real-world dataset: the New York Stock Exchange dataset.

While we focus on this specific dataset, you can find several new datasets to apply and play around with [here](https://www.kaggle.com/datasets?tags=13405-Linear+Regression).


In [None]:
! pip install kagglehub

In [None]:
# Download the "dgawlik/nyse" dataset through kagglehub; the value returned by
# dataset_download is kept in `path` and printed in the next cell
import kagglehub
path = kagglehub.dataset_download("dgawlik/nyse")

In [None]:
# Show the value returned by kagglehub.dataset_download
print("Path to dataset files:", path)

In [None]:
%cd /root/.cache/kagglehub/datasets/dgawlik/nyse/versions/3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Preparing the data


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the split-adjusted prices CSV from the current working directory
prices_split = pd.read_csv("prices-split-adjusted.csv")

# Peek at the first rows
print("Prices-split-adjusted dataset:")
print(prices_split.head(), "\n")

# Count missing values per column
missing_per_column = prices_split.isna().sum()
print("\nMissing values in prices-split-adjusted:\n", missing_per_column)


In [None]:
# Display the first rows of the prices table
prices_split.head()

In [None]:
# Mean closing price across all rows sharing a date, drawn as a line over time
avg_close_by_date = prices_split.groupby('date')['close'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
avg_close_by_date.plot(ax=ax, title="Average Closing Prices Over Time")
ax.set_ylabel("Average Closing Price")
ax.set_xlabel("Date")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Filter a single stock for simplicity: keep only the Apple (AAPL) rows.
# NOTE(review): assumes the ticker column is named 'symbol' — confirm with
# prices_split.columns.
# .copy() gives an independent frame, so the column assignments below neither
# raise SettingWithCopyWarning nor write into a view of prices_split.
stock_data = prices_split[prices_split['symbol'] == 'AAPL'].copy()
print("\nApple (AAPL) stock data:")
print(stock_data.head())

# Convert dates to datetime format and sort chronologically
stock_data['date'] = pd.to_datetime(stock_data['date'])
stock_data = stock_data.sort_values(by='date')

# Lag/lead columns: the close of the previous row and of the next row
# (rows are in date order after the sort above)
stock_data['previous_close'] = stock_data['close'].shift(1)
stock_data['next_close'] = stock_data['close'].shift(-1)

# Drop rows with NaN values (first and last row, from lagging/leading)
stock_data = stock_data.dropna()

In [None]:

# NOTE(review): the feature is the close one row back and the target is the
# close one row ahead, so the current row's close is unused — confirm intended.
X = stock_data[['previous_close']]  # Feature (2-D frame, as scikit-learn expects)
y = stock_data['next_close']       # Target variable

# Split the data into training and testing sets: hold out 20% for testing.
# random_state fixes the shuffle so results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Train Linear Regression Model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


In [None]:
# Test-set error and goodness of fit
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    "\nModel Evaluation:",
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)

In [None]:
# Predicted vs. actual test prices; points on the red diagonal are perfect predictions
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color='b')
price_min, price_max = min(y_test), max(y_test)
ax.plot([price_min, price_max], [price_min, price_max], color='red', lw=2)
ax.set_title("Actual vs Predicted Stock Prices")
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
plt.show()

# Fitted parameters of the linear model
print("\nModel Coefficients:")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Coefficient for 'previous_close': {model.coef_[0]:.2f}")


In [None]:
plt.figure(figsize=(14, 8))

# Sort test data by its original row index for smoother plotting
test_data = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=X_test.index).sort_index()

# Plot actual prices
plt.plot(test_data.index, test_data['Actual'], label='Actual Prices', color='blue', linewidth=2)

# Plot predicted prices
plt.plot(test_data.index, test_data['Predicted'], label='Predicted Prices', color='orange', linestyle='--', linewidth=2)

plt.title("Stock Price Prediction: Actual vs Predicted", fontsize=16)
plt.xlabel("Index (sorted test set)", fontsize=12)
plt.ylabel("Stock Price", fontsize=12)
plt.legend(fontsize=12)
plt.grid(alpha=0.3)

# Highlight differences: one vertical segment per test point from the actual to
# the predicted value. A single vlines call draws them all as one collection
# instead of creating a separate line artist per point in a Python loop.
plt.vlines(test_data.index, test_data['Actual'], test_data['Predicted'],
           colors='gray', alpha=0.4, linestyles='--')

plt.show()

# Logistic regression using a real-world dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Source CSV for the diabetes data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Column names are supplied here and the file is read with header=None,
# so the first line of the file is treated as data
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
data = pd.read_csv(url, names=columns, header=None)

In [None]:
# First rows of the table
print("Dataset preview:", data.head(), sep="\n")

# NaN count per column
print("\nMissing values count:", data.isna().sum(), sep="\n")


In [None]:
# Replace zero values in specific columns with NaN (indicates missing values).
# Assign the result back to the frame: calling replace(..., inplace=True) on
# data[col] acts on an intermediate object, which raises a FutureWarning in
# pandas 2.x and leaves `data` unchanged when Copy-on-Write is enabled.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[columns_to_replace] = data[columns_to_replace].replace(0, np.nan)

# Fill missing values with the column mean
data = data.fillna(data.mean())

# Split data into features and target
X = data.drop(columns='Outcome')
y = data['Outcome']

In [None]:
from sklearn.preprocessing import StandardScaler

# Split into training and testing sets (20% held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features using a StandardScaler. The scaler is fitted on the
# training split only and then applied to both splits, so no statistics from
# the test set leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Make predictions (class labels) on the held-out test set
y_pred = log_reg.predict(X_test_scaled)

In [None]:
# Compute the three test-set summaries, then report them
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", report, sep="\n")

# Implement SGD

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity sigmoid(z) = exp(-log(1 + exp(-z))) via np.logaddexp, so
    large negative inputs return 0.0 instead of overflowing inside exp().

    Args:
        z: scalar or array-like of real values.

    Returns:
        The sigmoid of ``z`` (values in [0, 1]), with the same shape as ``z``.
    """
    return np.exp(-np.logaddexp(0, -np.asarray(z, dtype=float)))


def sgd_logistic_regression(X, y, learning_rate=0.1, epochs=10, batch_size=32, seed=42):
    """Fit a binary logistic regression model with mini-batch SGD.

    Each epoch shuffles the samples, walks over them in mini-batches, and takes
    one gradient step on the mean binary cross-entropy of each batch. The loss
    over the full dataset is printed at the end of every epoch.

    Args:
        X: array-like of shape (n_samples, n_features).
        y: array-like of n_samples binary labels (0 or 1).
        learning_rate: step size of each gradient update.
        epochs: number of passes over the data.
        batch_size: number of samples per mini-batch (the last batch of an
            epoch may be smaller).
        seed: seed for the local random generator used for weight
            initialization and shuffling. A local generator is used so the
            global NumPy random state is left untouched.

    Returns:
        Tuple ``(weights, bias)``: a 1-D array of n_features weights and a
        scalar bias.

    Raises:
        ValueError: if X is not 2-D, is empty, y does not match X in length,
            or batch_size is smaller than 1.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (n_samples, n_features)")
    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    rng = np.random.default_rng(seed)

    # Randomly initialize weights with small values; the bias starts at zero
    weights = rng.normal(scale=0.01, size=n_features)
    bias = 0.0

    # Probabilities are clipped away from 0 and 1 so log() in the loss stays finite
    eps = 1e-12

    for epoch in range(epochs):
        # Shuffle the data (fancy indexing copies, so the caller's arrays are untouched)
        indices = rng.permutation(n_samples)
        X, y = X[indices], y[indices]

        for start in range(0, n_samples, batch_size):
            end = start + batch_size
            X_batch, y_batch = X[start:end], y[start:end]

            # Linear model followed by the sigmoid gives P(y = 1 | x)
            linear_model = X_batch @ weights + bias
            y_pred = sigmoid(linear_model)

            # Gradients of the mean cross-entropy over the batch
            error = y_pred - y_batch
            dw = X_batch.T @ error / len(y_batch)
            db = error.mean()

            # Gradient-descent update
            weights -= learning_rate * dw
            bias -= learning_rate * db

        # Predictions and mean binary cross-entropy on the full dataset
        y_preds = np.clip(sigmoid(X @ weights + bias), eps, 1 - eps)
        loss = -np.mean(y * np.log(y_preds) + (1 - y) * np.log(1 - y_preds))
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}')

    return weights, bias


In [None]:
X = data.drop(columns='Outcome')
y = data['Outcome']

# Standardize features (zero mean, unit variance per column).
# NOTE(review): the statistics are computed on the full dataset before the
# split, so the test rows influence the scaling — confirm this is acceptable.
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Split data into train and test sets (20% held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model using SGD (NumPy arrays, not DataFrames, are passed in)
weights, bias = sgd_logistic_regression(X_train.values, y_train.values, learning_rate=0.2, epochs=10)


In [None]:

# Make predictions on the test set: predicted probability of class 1, then a
# hard label using a 0.5 threshold
y_test_pred = sigmoid(X_test.values @ weights + bias)
y_test_pred_class = (y_test_pred >= 0.5).astype(int)

# Evaluate model
accuracy = accuracy_score(y_test, y_test_pred_class)
print(f'Accuracy: {accuracy:.4f}')
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_class))
print("Classification Report:")
print(classification_report(y_test, y_test_pred_class))