In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [5]:
import statsmodels.api as sm


In [3]:
df = pd.read_csv('advertising.csv')

In [4]:
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [7]:
x1 = df[['TV']]
y = df['Sales']

In [8]:
model = LinearRegression()
model.fit(x1,y)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [13]:
r2_score(y, (model.predict(x1)))

0.8121757029987414

In [19]:
xsm = sm.add_constant(x1)
ols_model = sm.OLS(y, xsm).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.812
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,856.2
Date:,"Thu, 05 Feb 2026",Prob (F-statistic):,7.93e-74
Time:,15:07:53,Log-Likelihood:,-448.99
No. Observations:,200,AIC:,902.0
Df Residuals:,198,BIC:,908.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.9748,0.323,21.624,0.000,6.339,7.611
TV,0.0555,0.002,29.260,0.000,0.052,0.059

0,1,2,3
Omnibus:,0.013,Durbin-Watson:,2.029
Prob(Omnibus):,0.993,Jarque-Bera (JB):,0.043
Skew:,-0.018,Prob(JB):,0.979
Kurtosis:,2.938,Cond. No.,338.0


In [20]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Create Polynomial Features (Degree 2 means we add TV^2)
poly = PolynomialFeatures(degree=2, include_bias=False)

# Transform the original X data
X_poly = poly.fit_transform(df[['TV']]) # This creates [TV, TV^2]

# Split the new polynomial data
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# 2. Train Linear Regression on the polynomial features
poly_model = LinearRegression()
poly_model.fit(X_train_p, y_train_p)

# 3. Predict
y_pred_p = poly_model.predict(X_test_p)

# 4. Evaluate
r2_poly = r2_score(y_test_p, y_pred_p)

print(f"\n--- Polynomial Regression (Degree 2) Results ---")
print(f"Standard Linear R²: ~0.83")
print(f"Polynomial Linear R²: {r2_poly:.4f}")
print(f"Improvement: {r2_poly - 0.83:.4f}")


--- Polynomial Regression (Degree 2) Results ---
Standard Linear R²: ~0.83
Polynomial Linear R²: 0.8161
Improvement: -0.0139


In [24]:
plt.figure(figsize=(10,6))

# Sort for cleaner plotting lines
sort_idx = np.argsort(x1['TV'].values)
X_test_sorted = x1.iloc[sort_idx]
y_pred_sorted = y_pred[sort_idx]

# Plot 1: Simple Line
plt.scatter(x1, y, color='blue', alpha=0.5, label='Actual Data')
plt.plot(X_test_sorted, y_pred_sorted, color='red', linewidth=2, label='Linear Fit (SLR)')

# Plot 2: Polynomial Curve (if you ran the polynomial code above)
sort_idx_p = np.argsort(X_test_p[:, 0]) # Sort by original TV value
plt.plot(X_test_p[sort_idx_p, 0], y_pred_p[sort_idx_p], color='green', linewidth=2, linestyle='--', label='Polynomial Fit (Degree 2)')

plt.title('TV Budget vs Sales: Linear vs Polynomial')
plt.xlabel('TV Budget')
plt.ylabel('Sales')
plt.legend()
plt.show()

NameError: name 'y_pred' is not defined

<Figure size 1000x600 with 0 Axes>