# Campaign Sales Model Walkthrough
This notebook rebuilds the linear regression model used in the project, step by step, using the advertising dataset.

## Workflow
1. Load and inspect the dataset
2. Prepare features and the target variable
3. Split into training and validation sets (80/20; the held-out validation split is named `test` in the code)
4. Fit a closed-form linear regression (no external dependencies beyond NumPy/Pandas)
5. Evaluate R² / RMSE / MAE
6. Visualize predictions vs. actual values

In [None]:
# Imports
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's 'darkgrid' style once, up front, for the figures drawn later in the notebook.
sns.set_theme(style='darkgrid')

In [None]:
# Read the advertising CSV. The path is relative to the current working
# directory; if the file is absent, fail immediately and report the
# absolute location that was checked.
DATA_PATH = Path('notebooks/data/Advertising And Sales.csv')
if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f'Dataset not found at {DATA_PATH.resolve()}')

# Quick sanity check: (rows, columns) followed by the first few records.
print(f'Dataset shape: {df.shape}')
df.head()

### Feature preparation
We'll use TV, Radio, and Newspaper spend as predictors and Sales (in thousands of units) as the target. Rows with a missing value in any of these four columns are dropped so that the fit only sees complete observations.

In [None]:
# Column roles: three ad-spend predictors and the sales target.
FEATURE_COLS = ['TV', 'Radio', 'Newspaper']
TARGET_COL = 'Sales'

# Keep only rows in which every modelling column is populated, then
# renumber the index from 0 so positional and label indexing agree.
modelling_cols = [*FEATURE_COLS, TARGET_COL]
df = df.dropna(subset=modelling_cols).reset_index(drop=True)

# Full-dataset predictor matrix and response vector as NumPy arrays.
X = df[FEATURE_COLS].to_numpy()
y = df[TARGET_COL].to_numpy()

# Summary statistics of the cleaned frame (displayed as the cell output).
df.describe(include='all')

In [None]:
# Deterministic 80/20 partition: draw the training rows with a fixed seed,
# then hold out every row whose index label was not drawn.
RANDOM_STATE = 42
train_df = df.sample(frac=0.8, random_state=RANDOM_STATE)
held_out_mask = ~df.index.isin(train_df.index)
test_df = df[held_out_mask]

# Convert each partition to NumPy arrays for the closed-form solver.
X_train = train_df[FEATURE_COLS].to_numpy()
y_train = train_df[TARGET_COL].to_numpy()
X_test = test_df[FEATURE_COLS].to_numpy()
y_test = test_df[TARGET_COL].to_numpy()
print(f'Train size: {len(train_df)} rows | Test size: {len(test_df)} rows')

### Closed-form linear regression
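We compute the ordinary least-squares solution $\theta = (X^\top X)^{-1} X^\top y$, where $X$ carries a leading column of ones for the intercept, to match the original project pipeline without relying on scikit-learn.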

In [None]:
def add_intercept(X):
    """Return ``X`` with a leading column of ones (the bias/intercept term).

    Parameters
    ----------
    X : np.ndarray
        2-D array of shape (n_samples, n_features).

    Returns
    -------
    np.ndarray
        Array of shape (n_samples, n_features + 1) whose first column is 1.0.
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    # Join along the feature axis so the bias becomes column 0.
    return np.concatenate((bias_column, X), axis=1)

def fit_linear_regression(X, y):
    """Fit ordinary least squares with an intercept.

    Solves ``min_theta ||X_design @ theta - y||^2`` where ``X_design`` is
    ``X`` with a leading column of ones. The solution equals the
    normal-equation formula ``(XᵀX)⁻¹Xᵀy`` when ``XᵀX`` is invertible, and
    is the minimum-norm least-squares solution otherwise.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictor matrix (without an intercept column).
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    intercept : float
        Fitted bias term.
    coefs : np.ndarray of shape (n_features,)
        Fitted feature weights, in the column order of ``X``.
    """
    X_design = add_intercept(np.asarray(X, dtype=float))
    targets = np.asarray(y, dtype=float)
    # Solve on the design matrix directly instead of forming XᵀX: the
    # explicit normal equations square the condition number and lose
    # precision when features are nearly collinear.
    theta, *_ = np.linalg.lstsq(X_design, targets, rcond=None)
    intercept = float(theta[0])
    coefs = theta[1:].astype(float)
    return intercept, coefs

# Fit on the training split, then report each feature weight followed by the bias term.
intercept, coefs = fit_linear_regression(X_train, y_train)
coef_by_feature = dict(zip(FEATURE_COLS, coefs))
for name, coef in coef_by_feature.items():
    print(f'{name} coefficient: {coef:.4f}')
print(f'Intercept: {intercept:.4f}')

### Evaluation helpers

In [None]:
def predict(X, intercept, coefs):
    """Evaluate the fitted linear model on ``X``.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Predictor matrix (no intercept column).
    intercept : float
        Bias term added to every prediction.
    coefs : np.ndarray of shape (n_features,)
        Feature weights, in the column order of ``X``.

    Returns
    -------
    np.ndarray of shape (n_samples,)
        ``X @ coefs + intercept``.
    """
    weighted_sum = X @ coefs
    return weighted_sum + intercept

def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, broadcastable against ``y_true``.

    Returns
    -------
    np.float64
        R² score. For a constant ``y_true`` (``SS_tot == 0``) the ratio is
        undefined; this returns 1.0 when the predictions are exact and 0.0
        otherwise instead of nan/-inf. Empty input returns nan.
    """
    # Work in float so list inputs are accepted and unsigned-integer
    # arrays cannot wrap around on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return np.float64('nan')
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Zero-variance target: avoid dividing by zero.
        return np.float64(1.0 if ss_res == 0 else 0.0)
    return 1 - ss_res / ss_tot

def rmse(y_true, y_pred):
    """Root-mean-squared error between observations and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, broadcastable against ``y_true``.

    Returns
    -------
    np.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``, in the units of the target.
    """
    # Work in float so list inputs are accepted and unsigned-integer
    # arrays cannot wrap around on subtraction or overflow when squared.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def mae(y_true, y_pred):
    """Mean absolute error between observations and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, broadcastable against ``y_true``.

    Returns
    -------
    np.float64
        ``mean(|y_true - y_pred|)``, in the units of the target.
    """
    # Work in float so list inputs are accepted and unsigned-integer
    # arrays cannot wrap around on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

# Score the fitted model on both partitions.
train_pred = predict(X_train, intercept, coefs)
test_pred = predict(X_test, intercept, coefs)

# Metric label -> scoring function; insertion order fixes the display order.
_metric_fns = {'R2': r2_score, 'RMSE': rmse, 'MAE': mae}
# Split label -> (observed, predicted) pair.
_splits = {
    'train': (y_train, train_pred),
    'test': (y_test, test_pred),
}

# Nested mapping: split -> metric -> value.
metrics = {
    split: {label: fn(observed, fitted) for label, fn in _metric_fns.items()}
    for split, (observed, fitted) in _splits.items()
}
metrics

### Predicted vs. actual (validation set)

In [None]:
# Scatter held-out actuals against predictions; points lying on the dashed
# diagonal are predicted exactly.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=test_pred, color='#00E0FF', edgecolor='black', ax=ax)

# The reference line spans the observed range of the actual values.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', label='Ideal fit')

ax.set_xlabel('Actual Sales (k units)')
ax.set_ylabel('Predicted Sales (k units)')
ax.set_title('Predicted vs Actual (Validation)')
ax.legend()
fig.tight_layout()

### Residual distribution

In [None]:
# Residuals on the held-out split: positive means the model under-predicted.
residuals = y_test - test_pred

# Histogram with a KDE overlay to show the shape of the error distribution.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(residuals, bins=15, kde=True, color='#14F195', ax=ax)
ax.set_xlabel('Residual (Actual - Predicted)')
ax.set_title('Validation Residuals Distribution')
fig.tight_layout()