In [None]:
"""
step 0
Goal of This Project

The goal of this project is to predict a continuous numeric value (disease progression score) using a Random Forest Regressor and compare its performance
with Linear Regression.

Dataset Used

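Dataset: Diabetes dataset (sklearn.datasets.load_diabetes)
Features: 10 medical measurements per patient (age, sex, bmi, bp and the six serum values s1-s6); scikit-learn ships them already mean-centred and scaled
Target: Disease progression score (the "target" column)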

Model Used
Random Forest Regressor

Random Forest Regressor is an ensemble model that uses multiple decision trees and predicts a value by averaging the predictions of all trees.

Why Random Forest Regressor?

Handles non-linear relationships
Works well without feature scaling
Captures feature interactions
Often performs better than linear models on complex data
"""


In [1]:
# Step 1: load the diabetes dataset
from sklearn.datasets import load_diabetes
import pandas as pd

# as_frame=True makes scikit-learn return pandas objects; the combined table
# (feature columns plus the "target" column) is stored under the "frame" key.
data = load_diabetes(as_frame=True)
df = data["frame"]

# Peek at the first five rows
df.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [2]:
# Step 2: separate the target vector y from the feature matrix X
y = df["target"]
# drop() returns a new frame, so df itself keeps its "target" column
X = df.drop(columns="target")


In [3]:
# Preview the first five rows of the feature matrix (no "target" column here)
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [4]:
# Preview the first five target values (disease progression scores)
y.head()

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64

In [5]:
# Step 3: hold out part of the data for testing
from sklearn.model_selection import train_test_split

# 40% of the rows go to the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)


In [6]:
# (rows, columns) of the training features
X_train.shape

(265, 10)

In [7]:
# (rows, columns) of the held-out test features
X_test.shape

(177, 10)

In [8]:
# Step 4: build the random forest regressor and fit it on the training data
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    random_state=42,   # fixed seed so the fitted forest is reproducible
)
# fit() is the last expression, so the notebook displays the fitted estimator
rf_reg.fit(X=X_train, y=y_train)


In [9]:
# Step 5.1: make predictions on the test data
y_pred_rf = rf_reg.predict(X_test)
# Show the first five predictions as a quick sanity check
y_pred_rf[:5]

array([143.31, 183.72, 156.21, 248.45, 117.97])

In [10]:
# Step 5.2: evaluate with MSE and RMSE
from sklearn.metrics import mean_squared_error
import numpy as np

# MSE is in squared target units; its square root (RMSE) is in the same
# units as the target and is therefore easier to interpret.
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Display both metrics as a tuple
(mse_rf, rmse_rf)

(2990.8375022598866, 54.688550010581615)

In [11]:
# Step 5.3: evaluate using the R^2 score
from sklearn.metrics import r2_score

# R^2 compares the model's squared error with that of always predicting the mean of y_test
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
r2_rf

0.4887631379036501

In [None]:
"""
| Model                   | RMSE  | R²   |
| ----------------------- | ----- | ---- |
| Linear Regression       | 53.85 | 0.45 |
| Random Forest Regressor | 54.69 | 0.49 |

1️⃣ RMSE

Random Forest RMSE is slightly worse (54.69 vs 53.85)

Error increased by ~0.8 units

Difference is very small

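2️⃣ R²

Random Forest explains more variation

49% vs 45%

So it follows data ups & downs a bit better

Caveat: the Linear Regression row is not computed anywhere in this notebook.
On a single shared test set, R² = 1 - MSE / Var(y_test), so the model with the
lower RMSE must also have the higher R². The table shows the opposite (Linear
Regression has the lower RMSE and the lower R²), so the two rows were most
likely scored on different train/test splits. Re-fit Linear Regression on the
same split used here (test_size=0.4, random_state=42) before drawing a conclusion.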
"""