In [19]:
# Step 0
# Regression predicts a continuous number, not a class.

In [20]:
# GOAL
#Predict a continuous value (disease progression score) using multiple features and evaluate how close our predictions are.

In [21]:
# STEP 1 - Load the dataset
import pandas as pd
from sklearn.datasets import load_diabetes

# as_frame=True makes the loader return pandas objects; `.frame` is a single
# DataFrame holding the ten feature columns together with the `target` column.
data = load_diabetes(as_frame=True)
df = data.frame

# Preview the first five rows.
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [22]:
# Step 2 - separate features and target
X = df.drop("target", axis=1)
y = df["target"]

# What this means:
#   X → all input features (every column except "target")
#   y → the number we want to predict (disease progression)
#
# In regression, y is a continuous value, not a 0/1 class label.
#
# These notes are comments rather than a trailing string literal, so the cell
# no longer echoes them back as its output.

In [23]:
# Preview the first rows of the feature frame (df without the "target" column).
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [24]:
# Preview the first rows of the target column.
y.head()

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64

In [25]:
# Step 3 - train/test split (regression)
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# shuffle, and therefore the split, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# (rows, columns) of the training features produced by the split.
X_train.shape

(353, 10)

In [27]:
# (rows, columns) of the held-out test features produced by the split.
X_test.shape

(89, 10)

In [28]:
"""STEP 4 â€” Feature Scaling (Why & How)
ðŸ§  Why Scaling is Needed Here

We are going to use Linear Regression next.

Linear Regression:
Uses distances and gradients
Gets affected if features are on very different scales

Example:

age â†’ small values
bmi / bp â†’ different ranges

ðŸ‘‰ Scaling puts all features on similar scale, so learning is stable
"""
# Step 4.1 - import Scaler and create scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

#step 4.2 fit on training data and transform
X_train_scaled = scaler.fit_transform(X_train)

#STEP 4.4 â€” Transform Test Data
X_test_scaled = scaler.transform(X_test)

In [29]:
# Step 5 - train a linear regression model
# The model learns how the input features relate to a numeric target value.

from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

# Step 5.1 - train the model
#
# The model looks at:
#   X_train_scaled → features
#   y_train        → true numeric values
#
# It learns:
#   coefficients (weights), available afterwards as lr_model.coef_
#   an intercept, available afterwards as lr_model.intercept_
#
# by minimising the squared error between its predictions and the actual
# values. After this call the model is trained.
#
# The notes sit above the call as comments, so the cell's output is the
# fitted estimator instead of an echoed string.
lr_model.fit(X_train_scaled, y_train)

In [30]:
# Step 6.1 - predict on the test data
#
# What this does:
#   * uses the trained linear regression model (lr_model)
#   * takes the unseen, scaled test features (X_test_scaled)
#   * outputs one predicted numeric value per test row
#
# These are continuous numbers, not 0/1 class labels.
y_pred = lr_model.predict(X_test_scaled)

# Show the first five predictions.
y_pred[:5]

array([139.5475584 , 179.51720835, 134.03875572, 291.41702925,
       123.78965872])

In [31]:
# First five true target values as a plain NumPy array, for a side-by-side
# comparison with the first five predictions.
y_test.iloc[:5].to_numpy()

array([219.,  70., 202., 230., 111.])

In [None]:
"""Key Concept (Very Important)

In regression:

We don't expect exact matches

We measure how far predictions are from actual values

That's why we need error-based metrics, not accuracy."""

"""Memory Rule

Regression predicts numbers; evaluation checks how wrong they are."""

In [None]:
"""We use error-based metrics:

MSE → average squared error

RMSE → error in original units

R² → how much variance is explained"""


In [32]:
"""STEP 7.1 â€” Mean Squared Error (MSE)
ðŸ“Œ What is MSE? (Easy)

Average of squared differences between actual and predicted values

It:

Penalizes big errors more

Is always positive"""

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
mse

"""Interpretation

Smaller MSE â†’ better model

Value is in squared units, so not very intuitive"""

2900.193628493483

In [33]:
"""STEP 7.2 â€” Root Mean Squared Error (RMSE)
ðŸ“Œ What is RMSE? (Very Easy)

Square root of MSE, so error is in original units

This is the most intuitive regression metric.

Interpretation

On average, the modelâ€™s prediction is off by about RMSE units.

Example:

RMSE = 50
â†’ Model is wrong by ~50 points on average"""

import numpy as np

rmse = np.sqrt(mse)
rmse


53.85344583676594

In [34]:
"""STEP 7.3 â€” RÂ² Score (Most Important)
ðŸ“Œ What is RÂ²? (Plain English)

How much of the dataâ€™s variation the model explains

RÂ² = 1 â†’ perfect

RÂ² = 0 â†’ useless

RÂ² < 0 â†’ worse than guessing mean

Interpretation Example

RÂ² = 0.45 â†’ model explains 45% of variation

RÂ² = 0.7 â†’ strong model"""

from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
r2


0.45260276297191915

In [None]:
# Summary Table
"""Metric	Meaning
MSE	Average squared error
RMSE	Typical error size (original units)
R²	% of variance explained

🧠 Memory Rules (Very Important)

MSE → mathematical error
RMSE → human-friendly error
R² → overall model strength"""