In [1]:
"""
BIM453 - INTRODUCTION TO MACHINE LEARNING
HOMEWORK 1
Group Members:
- [14128544032] [Duran Özçelik]
- Student 2: [ID NUMBER] [FULL NAME]

Date: [19.10.2025]
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries loaded successfully!")

✓ All libraries loaded successfully!


In [2]:
import os
import tarfile
from urllib import request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    print(f"✓ Data downloaded: {housing_path}")

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

print(f"\n📊 Dataset Information:")
print(f"   - Number of rows: {housing.shape[0]}")
print(f"   - Number of columns: {housing.shape[1]}")
print(f"\n   First 5 rows:")
housing.head()

✓ Data downloaded: datasets\housing

📊 Dataset Information:
   - Number of rows: 20640
   - Number of columns: 10

   First 5 rows:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
print("📋 Dataset Structure:")
housing.info()

print("\n📊 Numerical Summary:")
housing.describe()

print("\n🗂️ Categorical Variable (ocean_proximity):")
housing['ocean_proximity'].value_counts()

📋 Dataset Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

📊 Numerical Summary:

🗂️ Categorical Variable (ocean_proximity):


ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [4]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

print("📊 Income Category Distribution:")
print(housing["income_cat"].value_counts().sort_index())

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

print(f"\n✓ Train set: {len(strat_train_set)} samples")
print(f"✓ Test set: {len(strat_test_set)} samples")

📊 Income Category Distribution:
income_cat
1     822
2    6581
3    7236
4    3639
5    2362
Name: count, dtype: int64

✓ Train set: 16512 samples
✓ Test set: 4128 samples


In [5]:
housing_labels = strat_train_set["median_house_value"].copy()
housing_features = strat_train_set.drop("median_house_value", axis=1)

print(f"✓ Features shape: {housing_features.shape}")
print(f"✓ Labels shape: {housing_labels.shape}")
print(f"✓ Label range: ${housing_labels.min():,.0f} - ${housing_labels.max():,.0f}")

✓ Features shape: (16512, 9)
✓ Labels shape: (16512,)
✓ Label range: $14,999 - $500,001


In [6]:
housing_num = housing_features.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

print(f"📊 Numerical attributes: {num_attribs}")
print(f"📊 Categorical attributes: {cat_attribs}")

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing_features)

print(f"\n✓ Transformed data size: {housing_prepared.shape}")
print(f"✓ Number of features: {housing_prepared.shape[1]}")
print("   (8 numerical + 5 categorical one-hot encoded = 13 features)")

📊 Numerical attributes: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
📊 Categorical attributes: ['ocean_proximity']

✓ Transformed data size: (16512, 13)
✓ Number of features: 13
   (8 numerical + 5 categorical one-hot encoded = 13 features)


In [7]:
print("="*80)
print("QUESTION 1: GridSearchCV for SVR Hyperparameter Optimization")
print("="*80)

param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100, 1000]
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    }
]

print("\n🔍 Grid Search Parameters:")
print(f"   Linear kernel: C = [0.1, 1, 10, 100, 1000]")
print(f"   RBF kernel: C = [0.1, 1, 10, 100, 1000]")
print(f"   RBF kernel: gamma = ['scale', 'auto', 0.001, 0.01, 0.1, 1]")
print(f"\n   Total combinations: 5 + (5×6) = 35")
print(f"   Total training with 3-fold CV: 35×3 = 105 models")

svr = SVR()

grid_search = GridSearchCV(
    svr,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

print("\n⏳ Grid Search starting... (may take 5-15 minutes)")
grid_search.fit(housing_prepared, housing_labels)

best_rmse_grid = np.sqrt(-grid_search.best_score_)
best_params_grid = grid_search.best_params_

print("\n" + "="*80)
print("📊 GRID SEARCH RESULTS")
print("="*80)
print(f"✅ Best RMSE: ${best_rmse_grid:,.2f}")
print(f"✅ Best Hyperparameters:")
for param, value in best_params_grid.items():
    print(f"   • {param}: {value}")

QUESTION 1: GridSearchCV for SVR Hyperparameter Optimization

🔍 Grid Search Parameters:
   Linear kernel: C = [0.1, 1, 10, 100, 1000]
   RBF kernel: C = [0.1, 1, 10, 100, 1000]
   RBF kernel: gamma = ['scale', 'auto', 0.001, 0.01, 0.1, 1]

   Total combinations: 5 + (5×6) = 35
   Total training with 3-fold CV: 35×3 = 105 models

⏳ Grid Search starting... (may take 5-15 minutes)
Fitting 3 folds for each of 35 candidates, totalling 105 fits

📊 GRID SEARCH RESULTS
✅ Best RMSE: $70,602.93
✅ Best Hyperparameters:
   • C: 1000
   • kernel: linear


In [8]:
print("\n" + "="*80)
print("QUESTION 2: RandomizedSearchCV for SVR Hyperparameter Optimization")
print("="*80)

param_distributions = {
    'kernel': ['linear', 'rbf'],
    'C': loguniform(0.1, 1000),
    'gamma': loguniform(0.001, 1)
}

print("\n🎲 Randomized Search Parameters:")
print(f"   kernel: ['linear', 'rbf']")
print(f"   C: log-uniform(0.1, 1000)")
print(f"   gamma: log-uniform(0.001, 1)")
print(f"   n_iter: 50 (50 random combinations)")

random_search = RandomizedSearchCV(
    svr,
    param_distributions,
    n_iter=50,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

print("\n⏳ Randomized Search starting... (may take 3-10 minutes)")
random_search.fit(housing_prepared, housing_labels)

best_rmse_random = np.sqrt(-random_search.best_score_)
best_params_random = random_search.best_params_

print("\n" + "="*80)
print("📊 RANDOMIZED SEARCH RESULTS")
print("="*80)
print(f"✅ Best RMSE: ${best_rmse_random:,.2f}")
print(f"✅ Best Hyperparameters:")
for param, value in best_params_random.items():
    if isinstance(value, float):
        print(f"   • {param}: {value:.6f}")
    else:
        print(f"   • {param}: {value}")


QUESTION 2: RandomizedSearchCV for SVR Hyperparameter Optimization

🎲 Randomized Search Parameters:
   kernel: ['linear', 'rbf']
   C: log-uniform(0.1, 1000)
   gamma: log-uniform(0.001, 1)
   n_iter: 50 (50 random combinations)

⏳ Randomized Search starting... (may take 3-10 minutes)
Fitting 3 folds for each of 50 candidates, totalling 150 fits

📊 RANDOMIZED SEARCH RESULTS
✅ Best RMSE: $70,627.34
✅ Best Hyperparameters:
   • C: 856.886979
   • gamma: 0.025136
   • kernel: linear


In [9]:
print("\n" + "="*80)
print("📊 COMPARISON: GridSearchCV vs RandomizedSearchCV")
print("="*80)

print("\n" + "─"*80)
print(f"{'Method':<25} {'Best RMSE':<15} {'Kernel':<10} {'C':<12} {'Gamma':<12} {'Combinations':<10}")
print("─"*80)

grid_gamma = best_params_grid.get('gamma', 'N/A')
random_gamma = best_params_random.get('gamma', 0)

print(f"{'GridSearchCV':<25} ${best_rmse_grid:>13,.2f} {best_params_grid['kernel']:<10} {best_params_grid['C']:<12} {str(grid_gamma):<12} {len(grid_search.cv_results_['params']):<10}")
print(f"{'RandomizedSearchCV':<25} ${best_rmse_random:>13,.2f} {best_params_random['kernel']:<10} {best_params_random['C']:<12.4f} {random_gamma:<12.6f} {len(random_search.cv_results_['params']):<10}")
print("─"*80)

rmse_diff = abs(best_rmse_grid - best_rmse_random)
percentage_diff = (rmse_diff / min(best_rmse_grid, best_rmse_random)) * 100

print("\n📈 Performance Analysis:")
print(f"  • RMSE Difference: ${rmse_diff:,.2f} ({percentage_diff:.2f}%)")

if best_rmse_grid < best_rmse_random:
    improvement = ((best_rmse_random - best_rmse_grid) / best_rmse_random * 100)
    print(f"  • The Best Search: GridSearchCV is {improvement:.2f}% better")
else:
    improvement = ((best_rmse_grid - best_rmse_random) / best_rmse_grid * 100)
    print(f"  • The Best Search: RandomizedSearchCV is {improvement:.2f}% better")




📊 COMPARISON: GridSearchCV vs RandomizedSearchCV

────────────────────────────────────────────────────────────────────────────────
Method                    Best RMSE       Kernel     C            Gamma        Combinations
────────────────────────────────────────────────────────────────────────────────
GridSearchCV              $    70,602.93 linear     1000         N/A          35        
RandomizedSearchCV        $    70,627.34 linear     856.8870     0.025136     50        
────────────────────────────────────────────────────────────────────────────────

📈 Performance Analysis:
  • RMSE Difference: $24.41 (0.03%)
  • The Best Search: GridSearchCV is 0.03% better
