In [14]:
#day 3 : underfitting and overfitting practice
#mastering model complexity and validation
print ("day 3 : finding the model complexity sweet spot")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

day 3 : finding the model complexity sweet spot


In [15]:
#create more realistic housing dataset
np.random.seed(42)
size = np.random.randint(800,3000,200)
bedrooms = np.random.randint(1,6,200)
distance_to_city = np.random.uniform(1,30,200)

#let us create realistic price formulas
base_price = 50000
price = base_price+size*120 + bedrooms*15000 - distance_to_city * 3000 + np.random.randint(-20000,20000,200)
housing_data = pd.DataFrame({'size_sqft': size,'bedrooms': bedrooms ,'distance_km': distance_to_city,'price': price})

print("Realistic housing Dataset: ")
print(housing_data.head())
print(f"\nDataset shape: {housing_data.shape}")
print(f"\nAverage price : ${housing_data} ['price'].mean():,0f")

Realistic housing Dataset: 
   size_sqft  bedrooms  distance_km          price
0       1660         4    17.627644  263015.067375
1       2094         5     4.726510  357881.469269
2       1930         5    24.524921  276076.236624
3       1895         3    24.798545  267653.365610
4       2438         1    19.152250  287052.248701

Dataset shape: (200, 4)

Average price : $     size_sqft  bedrooms  distance_km          price
0         1660         4    17.627644  263015.067375
1         2094         5     4.726510  357881.469269
2         1930         5    24.524921  276076.236624
3         1895         3    24.798545  267653.365610
4         2438         1    19.152250  287052.248701
..         ...       ...          ...            ...
195       1423         4    18.108721  211689.836949
196       1816         3    14.921333  267175.000869
197       1680         4    13.195538  260175.387256
198       2849         4    23.755380  366028.861037
199       2127         2    19.541479  2

In [23]:
# Prepare features and target
X = housing_data[['size_sqft', 'bedrooms', 'distance_km']]
y = housing_data['price'] #targeted

# Splitting the  data
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data Split:")
print(f"Training houses:{X_train.shape[0]}")
print(f"Testing houses: {X_test.shape[0]}")

Data Split:
Training houses:160
Testing houses: 40


In [28]:
# test 3 Models - Simple, Medium, Complex
print(" Testing 3 Smartness Levels")

# We will test just 3 models:
models_to_test = [1, 3, 20]  # Just 3 depths: Simple, Medium, Complex
model_names = ["Too Simple", "Just Right", "Too Complex"]

print("Model Type  | Train Error | Test Error")
print("-" * 40)

for i in range(3): 
    depth = models_to_test[i]
    name = model_names[i]
    
    # Create model
    model = DecisionTreeRegressor(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    
    # Get predictions
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    
    # Calculate errors
    train_error = mean_absolute_error(y_train, train_predictions)
    test_error = mean_absolute_error(y_test, test_predictions)
    
    print(f"{name:^11} | ${train_error:>9,.0f} | ${test_error:>9,.0f}")

 Testing 3 Smartness Levels
Model Type  | Train Error | Test Error
----------------------------------------
Too Simple  | $   38,286 | $   40,836
Just Right  | $   21,682 | $   28,419
Too Complex | $        0 | $   24,042


In [27]:
print("ANALYSIS:")
print("-" * 50)
print("Too Simple:   High errors on both : UNDERFITTING")
print("Just Right:   Balanced errors : PERFECT BALANCE")  
print("Too Complex:  $0 train error but high test error : OVERFITTING!")
print(f"\nWINNER: 'Just Right' model (depth 3)")
print(f"Because it generalizes best to new houses!")

ANALYSIS:
--------------------------------------------------
Too Simple:   High errors on both : UNDERFITTING
Just Right:   Balanced errors : PERFECT BALANCE
Too Complex:  $0 train error but high test error : OVERFITTING!

WINNER: 'Just Right' model (depth 3)
   Because it generalizes best to new houses!


In [30]:
print("Key Learnings of the day 3")

print("""
1. Too Simple Model(1 question):
     Makes lots of mistakes on everything
     Like using only size to guess house price

2. Too Complex Model (20 questions):
     Memorizes all training houses perfectly  
     But fails on new houses it hasn't seen
     Like memorizing answers instead of understanding

3. Just Right Model (3 questions):
       Learns the patterns properly
       Works well on both old and new houses
       The sweet spot!

Lesson of the day:  The best model balances learning vs memorizing.
""")

print("Day 3 done! Time to rest")

Key Learnings of the day 3

1. Too Simple Model(1 question):
     Makes lots of mistakes on everything
     Like using only size to guess house price

2. Too Complex Model (20 questions):
     Memorizes all training houses perfectly  
     But fails on new houses it hasn't seen
     Like memorizing answers instead of understanding

3. Just Right Model (3 questions):
       Learns the patterns properly
       Works well on both old and new houses
       The sweet spot!

Lesson of the day:  The best model balances learning vs memorizing.

Day 3 done! Time to rest
