In [328]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pre-processing the Data

house_df = pd.read_csv("house_price_dataset_handled.csv")
X = house_df.drop('price', axis=1)  # Features
y = house_df['price']  # Target variable

# Splitting the data for testing and training

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Scaling the Features

scaler = StandardScaler()

X_train.iloc[:, 1:] = scaler.fit_transform(X_train.iloc[:, 1:])  # Only scale the features, exclude X0
X_test.iloc[:, 1:] = scaler.transform(X_test.iloc[:, 1:])  # Same for test data

# Converting to Numpy Array
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.to_numpy()
if isinstance(X_test, pd.DataFrame):
    X_test = X_test.to_numpy()  

# Initializing theta

theta = np.random.randn(X.shape[1])*0.01 


In [None]:
# Funtions

def cost_function(y_pred, y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.to_numpy()
    for i in range(m):
        sum += ( y_pred[i] - y[i] )**2
    return ( 1 / (2 * m) ) * sum

def batch_gradient_descent(theta, X_train, y_train, lr):
    m = len(y_train)
    y_train = y_train.to_numpy()  # Ensure y_train is a NumPy array
    y_pred = np.dot(X_train, theta)  # Predictions

    for j in range(len(theta)):  # Iterate over all parameters (features)
        sum_grad = 0
        for i in range(m):  # Iterate over all data points
            # Correct the gradient: (y_pred - y_train) instead of (y_train - y_pred)
            sum_grad += (y_pred[i] - y_train[i]) * X_train[i, j]
        theta[j] = theta[j] - (lr * sum_grad) / m  # Update theta[j]
    
    return theta

In [330]:
# Training the model

tolerance = 1e-6 # Convergence thershold
previous_cost = float('inf')
alpha = 0.5
iteration = 0
cost_history = []
theta_history = []

while True:

    if iteration >= 10000:
        break
    
    y_pred = np.dot(X_train , theta)  
    cost = cost_function(y_pred, y_train)
    print(cost)
    
    if abs(previous_cost - cost) < tolerance:  
        break  

    theta = batch_gradient_descent(theta, X_train, y_train, alpha)  
    previous_cost = cost  
    iteration += 1

    if iteration % 100 == 0:  
       cost_history.append(cost)
       theta_history.append(theta.copy())
print("Converged in ",iteration," iterations")



335969792730.36957
204459111898.08197
173904761202.6095
166178621306.08563
164119120809.2783
163523706656.13248
163326391001.57852
163248009567.69974
163211003032.0743
163191317159.01645
163180142632.92325
163173601653.45963
163169720688.70227
163167404626.972
163166019084.89713
163165189362.1089
163164692275.86478
163164394418.81503
163164215927.81378
163164108963.6633
163164044862.51828
163164006447.97562
163163983426.85034
163163969630.7036
163163961362.91666
163163956408.17758
163163953438.89066
163163951659.44727
163163950593.05994
163163949953.99152
163163949571.00925
163163949341.49484
163163949203.95074
163163949121.52338
163163949072.12573
163163949042.523
163163949024.78207
163163949014.1495
163163949007.7785
163163949003.9605
163163949001.67307
163163949000.30142
163163948999.47913
163163948998.9877
163163948998.6919
163163948998.5153
163163948998.4093
163163948998.34567
163163948998.307
163163948998.28445
163163948998.27103
163163948998.26282
163163948998.258
163163948998.2

In [331]:
cost_history

[]

In [332]:
theta_history

[]

In [333]:
print("theta = ",theta)

theta =  [558258.51343596  18981.87475867 173187.8959434 ]


In [334]:
y_predictions = np.dot(X_test , theta)

In [335]:
# Evaluation Functions

def mean_absolute_error(y_pred , y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.values
    for i in range(m):
        sum += np.abs( y_pred[i] - y[i] )
    return (1/m) * sum

def mean_squared_error(y_pred, y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.values
    for i in range(m):
        sum += ( y_pred[i] - y[i] )**2
    return ( 1 / m ) * sum

def r2_score(y_pred , y_test_or_train):
    numerator = 0
    denominator = 0
    sum = 0
    m = len(y_pred)
    y = y_test_or_train.values

    for i in range(m):
        sum += y[i]
        numerator += (y_pred[i] - y[i])**2
    mean = sum/m

    for i in range(m):
        denominator += (y[i] - mean)**2
    
    return 1 - (numerator/denominator)

def root_mean_squared_error(mse):
    return mse**0.5

In [336]:
# Testing the model

print(f"Cost : ", cost_function(y_predictions,y_test))
print(f"Mean squared error : ", mean_squared_error(y_predictions,y_test))
print(f"Root mean squared error : ", root_mean_squared_error(mean_squared_error(y_predictions,y_test)))
print(f"Mean absolute error : ", mean_absolute_error(y_predictions,y_test))
print(f"R2 score : ", r2_score(y_predictions,y_train))
print(f"These are the model predictions : ", y_predictions)

Cost :  49305935065.62082
Mean squared error :  98611870131.24164
Root mean squared error :  314025.26989279327
Mean absolute error :  205104.1018342525
R2 score :  -0.032208530063242424
These are the model predictions :  [1103269.98507538  269582.01139489  650832.72975431  290543.3998292
  481168.7590089   629871.34132     579646.19279265  290543.3998292
  629871.34132     629871.34132     629871.34132     537723.41592404
  707387.38666944  686425.99823514  650832.72975431  650832.72975431
  460207.3705746   608909.9528857   650832.72975431  481168.7590089
  460207.3705746   403652.71365946  898012.74584915  269582.01139489
  537723.41592404  877051.35741484  820496.70049971  460207.3705746
  650832.72975431  671794.11818861  629871.34132     481168.7590089
  629871.34132     290543.3998292   516762.02748973  650832.72975431
  537723.41592404  460207.3705746   629871.34132     650832.72975431
  629871.34132     594278.07283917  516762.02748973  537723.41592404
  707387.38666944  51676