In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory,
# one per line, so the dataset location can be checked before loading.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

In [None]:
# Load the housing CSV and take a first look at it: dimensions, summary
# statistics, and the leading rows.
data_path = "/kaggle/input/usa-housing/USA_Housing (1).csv"
df = pd.read_csv(data_path)
print("Dataset shape:", df.shape)
for preview in (df.describe(), df.head()):
    print(preview)

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import warnings
# Ignore every warning raised from here on (no category, message or module
# restriction is given, so the filter matches all of them).
# NOTE(review): this presumably also hides NumPy numerical warnings such as
# overflow during gradient descent - confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
print("QUESTION 1: 5-Fold Cross Validation with Least Squares")
print("="*60)

# Every column except the target is a candidate input, but only numeric (and
# boolean) columns can be standardised and used in a least-squares fit. Any
# other column (e.g. free text) is left out and reported instead of making
# the scaling step fail later. When all columns are already numeric this
# selects exactly the same frame as dropping 'Price' alone.
candidate_features = df.drop(columns='Price')
X = candidate_features.select_dtypes(include=['number', 'bool'])
y = df['Price'].values

skipped_columns = [col for col in candidate_features.columns if col not in X.columns]
if skipped_columns:
    print(f"Skipping non-numeric columns: {skipped_columns}")

print(f"Input features shape: {X.shape}")
print(f"Output variable shape: {y.shape}")
print(f"Feature names: {list(X.columns)}")

In [None]:
# Standardise the feature columns, with fitting and transforming written as
# two explicit steps on the same data.
# NOTE(review): the scaler is fitted on all rows before any of the splits
# performed in later cells, so held-out rows contribute to the scaling
# statistics - confirm this ordering is what the exercise asks for.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("Features scaled using StandardScaler")

In [None]:
# 5-fold cross-validation of ordinary least squares on the scaled features.
# Each iteration fits beta on the training folds and scores R2 on the
# held-out fold; per-fold coefficients and scores are kept for later cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
beta_matrices = []
r2_scores = []
fold_results = []

print("5-Fold Cross Validation Results:")
print("-" * 50)

for fold, (train_index, test_index) in enumerate(kf.split(X_scaled), 1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Prepend a column of ones so that beta[0] plays the role of the intercept.
    X_train_intercept = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    X_test_intercept = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

    # Solve the least-squares problem directly rather than computing
    # inv(X^T X) @ X^T @ y: the solution is the same when X^T X is
    # invertible, but no explicit inverse is formed, which is more accurate
    # for ill-conditioned designs and does not raise on a singular one.
    beta = np.linalg.lstsq(X_train_intercept, y_train, rcond=None)[0]
    beta_matrices.append(beta)

    y_pred = X_test_intercept @ beta
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    fold_results.append({
        'fold': fold,
        'beta': beta,
        'r2_score': r2,
        'train_size': len(X_train),
        'test_size': len(X_test)
    })

    print(f"Fold {fold}: R2 Score = {r2:.6f}, Train size = {len(X_train)}, Test size = {len(X_test)}")

In [None]:
# Summarise the cross-validation run: the fold with the highest R2 (and the
# coefficients fitted on it), followed by the mean and spread over all folds.
r2_by_fold = np.asarray(r2_scores)
best_fold_idx = r2_by_fold.argmax()
best_beta, best_r2 = beta_matrices[best_fold_idx], r2_scores[best_fold_idx]

print(f"Best fold: {best_fold_idx + 1} with R2 score: {best_r2:.6f}")
print(f"Average R2 score across all folds: {np.mean(r2_scores):.6f}")
print(f"Standard deviation of R2 scores: {np.std(r2_scores):.6f}")

In [None]:
print("Final Model Training (70% train, 30% test)")
print("-"*50)

# Hold out 30% of the scaled rows for the final evaluation.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Prepend a column of ones so that final_beta[0] plays the role of the intercept.
X_train_final_intercept = np.hstack([np.ones((X_train_final.shape[0], 1)), X_train_final])
X_test_final_intercept = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])

# Solve the least-squares problem directly rather than computing
# inv(X^T X) @ X^T @ y: the solution is the same when X^T X is invertible,
# but no explicit inverse is formed, which is more accurate for
# ill-conditioned designs and does not raise on a singular one.
final_beta = np.linalg.lstsq(X_train_final_intercept, y_train_final, rcond=None)[0]
final_predictions = X_test_final_intercept @ final_beta
final_r2 = r2_score(y_test_final, final_predictions)

print(f"Final model R2 score on 30% test data: {final_r2:.6f}")
print(f"Training set size: {len(X_train_final)} (70%)")
print(f"Test set size: {len(X_test_final)} (30%)")

In [None]:
print("\nQUESTION 2: Validation Set Approach with Gradient Descent")
print("="*60)

# Two-stage split. Stage one sets aside 30% of all rows for testing; stage
# two takes 20% of the remaining rows for validation, which leaves roughly
# 56% / 14% / 30% for train / validation / test.
holdout_split = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_temp, X_test_q2, y_temp, y_test_q2 = holdout_split

validation_split = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)
X_train_q2, X_val_q2, y_train_q2, y_val_q2 = validation_split

print(f"Training set size: {len(X_train_q2)} ({len(X_train_q2)/len(X_scaled)*100:.0f}%)")
print(f"Validation set size: {len(X_val_q2)} ({len(X_val_q2)/len(X_scaled)*100:.0f}%)")
print(f"Test set size: {len(X_test_q2)} ({len(X_test_q2)/len(X_scaled)*100:.0f}%)")

In [None]:
def gradient_descent_regression(X, y, learning_rate, iterations):
    """Fit a linear model ``y ~ X @ beta + intercept`` by batch gradient descent.

    Both parameters start at zero and are updated ``iterations`` times using
    the gradient of the halved mean squared error over all samples.

    Parameters
    ----------
    X : array-like of shape (m, n_features)
        Input features. Anything ``np.asarray`` can turn into a 2-D float
        array is accepted (ndarray, DataFrame, nested lists).
    y : array-like with m elements
        Targets. A column vector of shape (m, 1) is flattened, so it gives
        the same result as the equivalent 1-D array.
    learning_rate : float
        Step size applied to each gradient update.
    iterations : int
        Number of full-batch updates to perform.

    Returns
    -------
    beta : numpy.ndarray of shape (n_features,)
        Fitted coefficients.
    intercept : float
        Fitted intercept.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, there are no samples, or ``X`` and ``y`` do not
        contain the same number of samples.
    """
    X = np.asarray(X, dtype=float)
    # Flatten so an (m, 1) target cannot broadcast against the (m,)
    # predictions into an (m, m) residual matrix.
    y = np.asarray(y, dtype=float).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    m, n_features = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError(f"X has {m} samples but y has {y.shape[0]}")

    beta = np.zeros(n_features)
    intercept = 0.0

    for _ in range(iterations):
        # Both gradients come from the same residual, i.e. from the
        # parameters as they were before this step.
        residual = X.dot(beta) + intercept - y
        dw = (1/m) * X.T.dot(residual)
        db = (1/m) * np.sum(residual)
        beta -= learning_rate * dw
        intercept -= learning_rate * db

    return beta, intercept

In [None]:
# Candidate step sizes. Each one is trained from scratch on the training
# split with the same budget of 1000 iterations, then scored on the
# validation and test splits.
learning_rates = [0.001, 0.01, 0.1, 1]
results_q2 = []

print("Testing different learning rates:")
print("-" * 40)

for lr in learning_rates:
    beta, intercept = gradient_descent_regression(X_train_q2, y_train_q2, lr, 1000)

    # Predictions of the fitted linear model on the two held-out splits.
    val_pred = X_val_q2 @ beta + intercept
    test_pred = X_test_q2 @ beta + intercept

    val_r2 = r2_score(y_val_q2, val_pred)
    test_r2 = r2_score(y_test_q2, test_pred)

    results_q2.append(dict(
        learning_rate=lr,
        beta=beta,
        intercept=intercept,
        val_r2=val_r2,
        test_r2=test_r2,
    ))

    print(f"LR = {lr:>5}: Val R2 = {val_r2:.4f}, Test R2 = {test_r2:.4f}")

In [None]:
# Choose the run with the highest validation R2 (the first one on ties) and
# report its learning rate together with its validation and test scores.
val_scores = [result['val_r2'] for result in results_q2]
best_lr_result = results_q2[val_scores.index(max(val_scores))]

print(f"Best learning rate: {best_lr_result['learning_rate']}")
print(f"Best validation R2: {best_lr_result['val_r2']:.4f}")
print(f"Test R2 with best model: {best_lr_result['test_r2']:.4f}")

print("\nANALYSIS COMPLETE")
print("="*50)