In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


url = "C:/Users/predator/Downloads/USA_Housing.csv"
df = pd.read_csv(url)


df.head()


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [3]:

X = df.drop(columns='Price')  
y = df['Price']


In [4]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [5]:

def compute_beta(X_train, y_train):
    X_train_T = X_train.T
    beta = np.linalg.inv(X_train_T @ X_train) @ X_train_T @ y_train
    return beta


def predict(X_test, beta):
    return X_test @ beta


kf = KFold(n_splits=5)


r2_scores = []
beta_matrices = []


for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]


    beta = compute_beta(X_train, y_train)
    beta_matrices.append(beta)
    

    y_pred = predict(X_test, beta)


    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)


print(f"R2 Scores: {r2_scores}")


best_beta_index = np.argmax(r2_scores)
best_beta = beta_matrices[best_beta_index]
print(f"Best Beta Matrix (R² = {max(r2_scores)}): \n{best_beta}")


R2 Scores: [-11.566262890130576, -10.327833022754156, -12.218871237545434, -10.951296474620897, -11.59351061899871]
Best Beta Matrix (R² = -10.327833022754156): 
[231008.84527364 174703.35249181 121834.54293259  -2872.53491108
 152806.89864888]


In [6]:

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


y_pred_full = predict(X_test_full, best_beta)


final_r2_score = r2_score(y_test_full, y_pred_full)
print(f"Final R² score on the test set: {final_r2_score}")


Final R² score on the test set: -11.983806002911246
