In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [21]:
df=pd.read_csv('USA_housing.csv')

In [22]:
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [23]:
x=df.drop('Price',axis=1).values
y=df['Price'].values

In [24]:
scaler=StandardScaler()
y=np.array(y)
y=y.reshape(-1,1)
x_scaled=scaler.fit_transform(x)

In [25]:
scaler_y=StandardScaler()
y_scaled=scaler_y.fit_transform(y)

In [26]:
x_scaled

array([[ 1.02865969, -0.29692705,  0.02127433,  0.08806222, -1.31759867],
       [ 1.00080775,  0.02590164, -0.25550611, -0.72230146,  0.40399945],
       [-0.68462915, -0.11230283,  1.5162435 ,  0.93084045,  0.07240989],
       ...,
       [-0.48723454,  1.28447022, -2.17026949, -1.50025059, -0.29193658],
       [-0.05459152, -0.44669439,  0.14154061,  1.18205319,  0.65111608],
       [-0.28831272,  0.01521477, -0.19434166,  0.07185495,  1.04162464]])

In [27]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y_scaled,test_size=0.3,random_state=42)

In [28]:
x_train_with1=np.c_[np.ones(x_train.shape[0]), x_train]
x_test_with1=np.c_[np.ones(x_test.shape[0]),x_test]

In [29]:
# beta = np.linalg.inv(x_train_with1.T @ x_train_with1) @ x_train_with1.T @ y_train
A=x_train_with1.T.dot(x_train_with1)
B=np.linalg.inv(A)
C=B.dot(x_train_with1.T)
beta=C.dot(y_train)
print(beta)

[[-0.00224882]
 [ 0.65272181]
 [ 0.46493181]
 [ 0.34132186]
 [ 0.00825197]
 [ 0.42771714]]


In [30]:
y_predict=np.dot(x_test_with1,beta)
print(y_predict)

[[ 0.21655993]
 [ 0.01430282]
 [ 0.0333151 ]
 ...
 [ 0.63737868]
 [ 0.7118915 ]
 [-0.5227168 ]]


In [31]:
error=y_test-y_predict
square_error=np.power(error,2)
sum_square_error=np.sum(square_error)
mean_square_error=sum_square_error/len(y_predict)
print("mse:",mean_square_error)
rms_error=np.sqrt(mean_square_error)
print("rms:",rms_error)
y_mean=np.mean(y_test)
total_variance=np.sum((y_test-y_mean)**2)
print("r2_scratch",1-sum_square_error/total_variance)
r2 = r2_score(y_test, y_predict)
print("r2:",r2)

mse: 0.08076245417602514
rms: 0.2841873575232106
r2_scratch 0.9146818498916266
r2: 0.9146818498916266


In [32]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [33]:
best_r2=-np.inf
best_beta=None

In [34]:
for train_idx, test_idx in kf.split(x_scaled):
    X_train, X_test = x_scaled[train_idx], x_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    beta = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)
    
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    y_pred = X_test_b.dot(beta)
    
    r2 = r2_score(y_test, y_pred)
    
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

In [35]:
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(x_scaled, y_scaled, test_size=0.3, random_state=42)

X_train_final_b = np.c_[np.ones((X_train_final.shape[0], 1)), X_train_final]
X_test_final_b = np.c_[np.ones((X_test_final.shape[0], 1)), X_test_final]

y_pred_final = X_test_final_b.dot(best_beta)

final_r2 = r2_score(y_test_final, y_pred_final)
print("r2:",r2)

r2: 0.9243869413350316


In [36]:
best_beta

array([[1.23161736e+06],
       [2.30225051e+05],
       [1.63956839e+05],
       [1.21115120e+05],
       [7.83467170e+02],
       [1.50662447e+05]])