In [5]:
# Imports and installation
# You may need to install the packages first:
# %pip install numpy pandas matplotlib scikit-learn statsmodels
# Then restart the kernel: in the Jupyter menu bar, open "Kernel" (next to "File") and choose "Restart Kernel".
import os
import pandas as pd
import numpy as np

In [6]:
# Load the dataset
# Put Advertising.csv in the same folder as this notebook (recommended),
# or change csv_path below to the file's location on your machine.
# Use forward slashes even on Windows, e.g. "C:/Users/username/Box/...".
csv_path = "Advertising.csv"

df = pd.read_csv(csv_path)

# Quick look at what was loaded: dimensions, column names, first rows
print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head()

Shape: (200, 4)
Columns: ['TV', 'radio', 'newspaper', 'sales']


Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [7]:
# Basic sanity checks
# 1) number of missing entries in each column
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# 2) descriptive statistics across every column, whatever its dtype
print("\nSummary statistics:", df.describe(include="all"), sep="\n")


Missing values per column:
TV           0
radio        0
newspaper    0
sales        0
dtype: int64

Summary statistics:
               TV       radio   newspaper       sales
count  200.000000  200.000000  200.000000  200.000000
mean   147.042500   23.264000   30.554000   14.022500
std     85.854236   14.846809   21.778621    5.217457
min      0.700000    0.000000    0.300000    1.600000
25%     74.375000    9.975000   12.750000   10.375000
50%    149.750000   22.900000   25.750000   12.900000
75%    218.825000   36.525000   45.100000   17.400000
max    296.400000   49.600000  114.000000   27.000000


### STEP ONE: PARTITION AND PREPROCESS OUR DATA

In [8]:
from sklearn.model_selection import train_test_split

# Define predictors (X) and response (Y).
# get_dummies one-hot encodes any categorical predictors, dropping the first
# level of each; numeric columns pass through unchanged.
# dtype=float keeps the generated dummy columns numeric: pandas >= 2.0 creates
# bool dummies by default, and a frame mixing bool and float columns is cast to
# object when converted to NumPy, which statsmodels' OLS rejects.
X = pd.get_dummies(df.drop(columns=["sales"]), drop_first=True, dtype=float)
Y = df["sales"]

# Train/test split (80/20); the fixed random_state makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=99
)

### STEP TWO: TRAIN MODEL

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# The pipeline is the model. It holds a single step, an ordinary
# least-squares regressor registered under the name "model".
linreg_pipe = Pipeline(steps=[("model", LinearRegression())])

# Estimate the regression using the training partition only
linreg_pipe.fit(X_train, Y_train)

In [10]:
# Pull the fitted estimator out of the pipeline and tabulate its parameters.
# No standardization step was applied, so every coefficient is expressed in
# the ORIGINAL units of its feature.
model = linreg_pipe.named_steps["model"]
intercept = model.intercept_

# Label each slope with the predictor column it belongs to
coef = pd.Series(data=model.coef_, index=X.columns)

display(coef.to_frame(name="coef"))
print("Intercept:", intercept)

Unnamed: 0,coef
TV,0.045506
radio,0.188205
newspaper,0.000752


Intercept: 2.90448591407692


In [11]:
import statsmodels.api as sm

# Refit the same regression with statsmodels to obtain its full summary table.
# add_constant appends the intercept column to the training predictors.
ols = sm.OLS(endog=Y_train, exog=sm.add_constant(X_train)).fit()
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.891
Model:                            OLS   Adj. R-squared:                  0.889
Method:                 Least Squares   F-statistic:                     423.4
Date:                Tue, 13 Jan 2026   Prob (F-statistic):           1.02e-74
Time:                        20:13:18   Log-Likelihood:                -314.14
No. Observations:                 160   AIC:                             636.3
Df Residuals:                     156   BIC:                             648.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9045      0.369      7.877      0.0

### Predictions for test set 

In [12]:
# Prediction on the test set: run the fitted pipeline on the held-out predictors.
# Y_predictions is a global reused by the test-MSE cell further down.
Y_predictions = linreg_pipe.predict(X_test)
print(Y_predictions)

[13.95446559  6.56096843  8.37730268  8.96942642  6.81536408 21.0724867
 14.61616898 23.29712852 10.61869678 17.01582341 15.20861013  8.17851346
  5.27853174  7.88197539 12.60600345  5.34954895  8.12465182 13.92041312
 12.501818   18.19974255  8.91614106  8.87644672 14.70561647 19.95309416
 12.38660642  7.79536974 12.33553347 16.34015654 19.16036183 16.93591248
 10.0968486  24.0871139  18.4187788  11.56426295  9.65330308 14.35933948
 13.15512731 16.42132385 13.67471724 10.34934211]


### MSE for the test set

In [13]:
# Test MSE: the mean of the squared differences between predicted and
# observed sales on the held-out rows
errors = Y_predictions - Y_test
mse = (errors ** 2).mean()

print(mse)

2.048557611640516
