In [235]:
# For data manipulation and loading datasets
import pandas as pd

# For numerical operations, especially with arrays and matrices
import numpy as np

# The main machine learning library in Python
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

# To handle potential warnings and keep the output clean
import warnings
warnings.filterwarnings('ignore')

In [236]:
# Section banner marking the start of the Question 1 cells.
print("--- Solving Question 1 ---")

--- Solving Question 1 ---


In [237]:
# Load the housing dataset from a CSV file in the current working directory.
df_house = pd.read_csv("USA_Housing.csv")

In [238]:
# Preview the first five rows to check the column names and value scales.
df_house.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [239]:
# Inspect dtypes and non-null counts (used to confirm there are no missing values).
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
dtypes: float64(6)
memory usage: 234.5 KB


a) part

In [240]:
# Target vector: the 'Price' column, kept as a pandas Series.
y = df_house['Price']

In [241]:
# Feature matrix: every column except the target. drop() returns a new frame,
# so df_house itself is left unchanged.
X = df_house.drop('Price', axis=1)

In [242]:
# Sanity check: (rows, feature columns) of the feature matrix.
X.shape 

(5000, 5)

In [243]:
# Sanity check: the target must have one entry per row of X.
y.shape

(5000,)

In [244]:
# Preview the first few target values.
y.head()

0    1.059034e+06
1    1.505891e+06
2    1.058988e+06
3    1.260617e+06
4    6.309435e+05
Name: Price, dtype: float64

In [245]:
# Preview the first few feature rows (the 'Price' column should no longer appear).
X.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,79545.45857,5.682861,7.009188,4.09,23086.8005
1,79248.64245,6.0029,6.730821,3.09,40173.07217
2,61287.06718,5.86589,8.512727,5.13,36882.1594
3,63345.24005,7.188236,5.586729,3.26,34310.24283
4,59982.19723,5.040555,7.839388,4.23,26354.10947


b) part

In [246]:
# Standardizer that rescales each feature column to zero mean and unit variance.
scaler = StandardScaler()

In [247]:
# Learn per-column mean/std from X and return the standardized values as a NumPy array.
# NOTE(review): the scaler is fitted on all rows before any train/test split below,
# so held-out rows contribute to the scaling statistics. Consider fitting the scaler
# on the training rows only if a leakage-free evaluation is required.
X_scaled = scaler.fit_transform(X)

In [248]:
# Show the standardized feature matrix (NumPy abbreviates large arrays with '...').
print(X_scaled)

[[ 1.02865969 -0.29692705  0.02127433  0.08806222 -1.31759867]
 [ 1.00080775  0.02590164 -0.25550611 -0.72230146  0.40399945]
 [-0.68462915 -0.11230283  1.5162435   0.93084045  0.07240989]
 ...
 [-0.48723454  1.28447022 -2.17026949 -1.50025059 -0.29193658]
 [-0.05459152 -0.44669439  0.14154061  1.18205319  0.65111608]
 [-0.28831272  0.01521477 -0.19434166  0.07185495  1.04162464]]


In [249]:
# Show the first row of the unscaled features for comparison with the scaled values.
print(X.values[0])

[7.95454586e+04 5.68286132e+00 7.00918814e+00 4.09000000e+00
 2.30868005e+04]


c) part and d) part

In [250]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# fold assignment is reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [251]:
# Per-fold results, filled by the cross-validation loop in the next cell:
# the fitted coefficient vector and the matching test-fold R2 score.
beta_list = []
r2_scores = []

In [252]:
# Start from empty accumulators so that re-running this cell cannot append a
# second set of folds (which would make the "best fold" index meaningless).
beta_list, r2_scores = [], []

for i, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    # KFold yields positional indices, so select the target rows by position
    # (.iloc) instead of by index label.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Prepend a column of ones so that beta[0] acts as the intercept term.
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Least-squares solution of X_train_b @ beta ~= y_train. This is the same
    # beta as the normal equation (X^T X)^-1 X^T y, but it avoids forming an
    # explicit matrix inverse, which is less accurate and fails outright when
    # X^T X is singular.
    beta, *_ = np.linalg.lstsq(X_train_b, y_train.to_numpy(), rcond=None)

    # Score the fitted coefficients on the held-out fold.
    y_pred = X_test_b @ beta
    score = r2_score(y_test, y_pred)
    beta_list.append(beta)
    r2_scores.append(score)
    print(f"Fold {i+1}: R2 Score = {score:.4f}")

print(f"All R2 scores: {r2_scores}")


Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244
All R2 scores: [0.9179971706985147, 0.9145677884802818, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]


e) part

We have successfully trained and evaluated the model five times, each time on a different train/test fold of the data.

In [253]:
# Position of the highest R2 score; this is a 0-based fold index.
best_beta_index = np.argmax(r2_scores)
best_beta_index


np.int64(4)

In [254]:
# Coefficient vector of the best-scoring fold; element 0 is the intercept because
# a column of ones was prepended to the features when fitting.
best_beta = beta_list[best_beta_index]
best_beta

array([1.23161736e+06, 2.30225051e+05, 1.63956839e+05, 1.21115120e+05,
       7.83467170e+02, 1.50662447e+05])

In [255]:
# R2 score achieved by the best-scoring fold.
best_r2 = r2_scores[best_beta_index]
best_r2

0.9243869413350316

In [256]:
# Report the winning fold using 1-based numbering (hence the + 1).
print(f"The best model was from Fold {best_beta_index + 1}, with an R2 score of {best_r2:.4f}.")
print("We will now use the beta coefficients from this model for our final test.")

The best model was from Fold 5, with an R2 score of 0.9244.
We will now use the beta coefficients from this model for our final test.


Split the data into 70% training and 30% testing sets.

In [257]:
# Hold out 30% of the scaled rows as the final test set (fixed seed for reproducibility).
# NOTE(review): X_train_final and y_train_final are not used by any cell visible in
# this notebook; confirm whether the model was meant to be refitted on them.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [258]:
# Prepend the intercept column and predict with the best cross-validation coefficients.
# NOTE(review): best_beta was fitted on a KFold training fold drawn from the full
# dataset, so it has very likely already seen some of these test rows; the score
# computed from these predictions may therefore be optimistic.
X_test_final_b = np.c_[np.ones((X_test_final.shape[0], 1)), X_test_final]
y_pred_final = X_test_final_b @ best_beta

y_pred_final

array([1309472.34209342, 1238698.51255206, 1247440.13320666, ...,
       1458529.22587541, 1482180.096334  , 1050564.30627078],
      shape=(1500,))

In [259]:
# R2 of the best-fold coefficients on the 30% hold-out split.
final_score = r2_score(y_test_final, y_pred_final)

In [260]:
# Display the hold-out R2 score.
final_score

0.9147458156636434

Q2)

In [261]:
# --- Step 1: Divide data into training (56%), validation (14%), and test (30%) sets ---
print("--- Splitting the data ---")

# First split: a combined training/validation part (70%) and a test set (30%).

--- Splitting the data ---


In [262]:
# 70% / 30% split of the scaled data. This rebinds X_test and y_test, which were
# previously assigned inside the Question 1 cross-validation loop.
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

# Next, the 70% part is split into training (80% of 70% = 56%) and validation (20% of 70% = 14%).
# test_size=0.20 in that split means 20% of the 70% part, i.e. 14% of the full dataset.

In [263]:
# Split the 70% part into training (56% of all rows) and validation (14% of all rows).
# This rebinds X_train and y_train from the Question 1 loop.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=42)

In [264]:
# --- Step 2: Gradient Descent Implementation ---
def gradient_descent(X, y, learning_rate, iterations):
    """Fit linear-regression coefficients with batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or (n_samples,)
        Feature values. A 1-D input is treated as a single feature column.
    y : array-like with n_samples values
        Target values. Any shape holding exactly n_samples values is accepted
        (e.g. a pandas Series or an (n_samples, 1) column); it is flattened to 1-D.
    learning_rate : float
        Step size applied to the gradient on every update.
    iterations : int
        Number of gradient updates to perform.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1,)
        Fitted coefficients; element 0 is the intercept.

    Raises
    ------
    ValueError
        If X is empty or X and y do not contain the same number of samples.
    """
    features = np.asarray(X, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    # Flatten the target so a column vector (n, 1) cannot broadcast against the
    # 1-D predictions into an (n, n) error matrix, and so pandas index labels
    # play no part in the arithmetic.
    targets = np.asarray(y, dtype=float).ravel()

    n_rows = features.shape[0]
    if n_rows == 0:
        raise ValueError("gradient_descent requires at least one sample")
    if targets.shape[0] != n_rows:
        raise ValueError(
            f"X and y have inconsistent lengths: {n_rows} != {targets.shape[0]}"
        )

    # Prepend a column of ones so the first coefficient is the intercept.
    X_b = np.c_[np.ones((n_rows, 1)), features]
    n_samples, n_features = X_b.shape

    # Start from the all-zero coefficient vector.
    beta = np.zeros(n_features)

    for _ in range(iterations):
        # Residuals of the current predictions.
        error = X_b @ beta - targets
        # Gradient of the mean squared error with respect to beta.
        gradient = (2 / n_samples) * (X_b.T @ error)
        # Step against the gradient.
        beta = beta - learning_rate * gradient

    return beta


In [265]:
# --- Step 3: Find the best learning rate using the validation set ---
# Candidate step sizes to compare on the validation set.
learning_rates = [0.001, 0.01, 0.1, 1]
# NOTE: this rebinds best_beta, replacing the Question 1 value of the same name.
best_beta = None
best_val_score = -np.inf # Initialize with a very low score
best_lr = None


In [266]:
# Number of gradient-descent updates used for every candidate learning rate.
n_iterations = 1000

# The validation design matrix does not depend on the learning rate, so it is
# built once here rather than on every pass through the loop.
X_val_b = np.c_[np.ones((len(X_val), 1)), X_val]

for lr in learning_rates:
    # Train the model on the training set
    beta_coeffs = gradient_descent(X_train, y_train, learning_rate=lr, iterations=n_iterations)

    # Evaluate on the validation set. A learning rate that is too large makes the
    # updates diverge; such a run scores extremely poorly and is never selected.
    y_val_pred = X_val_b @ beta_coeffs
    val_score = r2_score(y_val, y_val_pred)

    print(f"Learning Rate: {lr}, Validation R2 Score: {val_score:.4f}")

    # Keep the candidate only if it strictly beats the best score so far, so the
    # earlier learning rate wins in case of an exact tie.
    if val_score > best_val_score:
        best_val_score = val_score
        best_beta = beta_coeffs
        best_lr = lr

Learning Rate: 0.001, Validation R2 Score: 0.6820
Learning Rate: 0.01, Validation R2 Score: 0.9098
Learning Rate: 0.1, Validation R2 Score: 0.9098
Learning Rate: 1, Validation R2 Score: -inf


In [267]:
# Report the learning rate selected on the validation set.
print(f"\nBest learning rate is {best_lr} with a validation R2 score of {best_val_score:.4f}.")


Best learning rate is 0.01 with a validation R2 score of 0.9098.


In [268]:
# --- Step 4: Score the selected coefficients on the held-out test set ---
print("\n--- Evaluating the best model on the unseen test set ---")

# Build the test design matrix: an intercept column of ones followed by the features.
intercept_column = np.ones((len(X_test), 1))
X_test_b = np.hstack((intercept_column, X_test))

# Predict with the coefficients chosen on the validation set and compute R2.
y_test_pred = X_test_b @ best_beta
test_score = r2_score(y_test, y_test_pred)

print(f"Final Test R2 Score: {test_score:.4f}")


--- Evaluating the best model on the unseen test set ---
Final Test R2 Score: 0.9148
