In [50]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Pre-processing the Data

house_df = pd.read_csv('california_house_dataset_handled.csv')
# X = house_df.drop('median_house_value', axis=1)  #Feature
X = house_df[['X0','median_income']]
y = house_df['median_house_value'] #Target

X

Unnamed: 0,X0,median_income
0,1,8.3252
1,1,8.3014
2,1,7.2574
3,1,5.6431
4,1,3.8462
...,...,...
20635,1,1.5603
20636,1,2.5568
20637,1,1.7000
20638,1,1.8672


In [51]:

# Splitting the data for testing and training

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Scaling the Features

scaler = StandardScaler()

X_train.iloc[:, 1:] = scaler.fit_transform(X_train.iloc[:, 1:])  # Only scale the features, exclude X0 and other hot coded data
X_test.iloc[:, 1:] = scaler.transform(X_test.iloc[:, 1:])  # Same for test data

# Converting to Numpy Array
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.to_numpy()
if isinstance(X_test, pd.DataFrame):
    X_test = X_test.to_numpy()  

# Initializing theta

theta = np.random.randn(X.shape[1])*0.01 


In [52]:
# Funtions

def cost_function(y_pred, y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.to_numpy()
    for i in range(m):
        sum += ( y_pred[i] - y[i] )**2
    return ( 1 / (2 * m) ) * sum

def batch_gradient_descent(theta, X_train, y_train, lr):
    m = len(y_train)
    y_train = y_train.to_numpy()  # Ensure y_train is a NumPy array
    y_pred = np.dot(X_train, theta)  # Predictions

    for j in range(len(theta)):  # Iterate over all parameters (features)
        sum_grad = 0
        for i in range(m):  # Iterate over all data points
            # Correct the gradient: (y_pred - y_train) instead of (y_train - y_pred)
            sum_grad += (y_pred[i] - y_train[i]) * X_train[i, j]
        theta[j] = theta[j] - (lr * sum_grad) / m  # Update theta[j]
    
    return theta

In [53]:
# Training the model

tolerance = 1e-6 # Convergence thershold
previous_cost = float('inf')
alpha = 0.2
iteration = 0
cost_history = []
theta_history = []

while True:

    if iteration >= 10000:
        break
    
    y_pred = np.dot(X_train , theta)  
    cost = cost_function(y_pred, y_train)
    print(cost)
    
    if abs(previous_cost - cost) < tolerance:  
        break  

    theta = batch_gradient_descent(theta, X_train, y_train, alpha)  
    previous_cost = cost  
    iteration += 1

    if iteration % 100 == 0:  
       cost_history.append(cost)
       theta_history.append(theta.copy())
print("Converged in ",iteration," iterations")

28148709549.946438
19273634602.598534
13593586636.295979
9958355937.862362
7631808290.864765
6142817796.786337
5189863880.576167
4579973374.20167
4189643450.1219783
3939832298.7109895
3779953161.8079343
3677630514.189991
3612144019.714505
3570232663.2502065
3543409395.1130357
3526242503.505245
3515255692.8762627
3508224134.073726
3503723936.4400926
3500843809.9545608
3499000529.0038457
3497820829.195357
3497065821.31794
3496582616.276399
3496273365.049815
3496075444.264796
3495948774.9623613
3495867706.6088233
3495815822.8625627
3495782617.264964
3495761365.682471
3495747764.669689
3495739060.0215015
3495733489.046681
3495729923.6227813
3495727641.751497
3495726181.353876
3495725246.6993876
3495724648.520498
3495724265.6860375
3495724020.6719847
3495723863.862975
3495723763.505218
3495723699.2762556
3495723658.169708
3495723631.861524
3495723615.024257
3495723604.2484503
3495723597.3519263
3495723592.9381094
3495723590.113328
3495723588.3054376
3495723587.148373
3495723586.4079027
3495

In [54]:
# Evaluation Functions

def mean_absolute_error(y_pred , y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.values
    for i in range(m):
        sum += np.abs( y_pred[i] - y[i] )
    return (1/m) * sum

def mean_squared_error(y_pred, y_test_or_train):
    m = len(y_pred)
    sum = 0
    y = y_test_or_train.values
    for i in range(m):
        sum += ( y_pred[i] - y[i] )**2
    return ( 1 / m ) * sum

def r2_score(y_pred , y_test_or_train):
    numerator = 0
    denominator = 0
    sum = 0
    m = len(y_pred)
    y = y_test_or_train.values

    for i in range(m):
        sum += y[i]
        numerator += (y_pred[i] - y[i])**2
    mean = sum/m

    for i in range(m):
        denominator += (y[i] - mean)**2
    
    return 1 - (numerator/denominator)

def root_mean_squared_error(mse):
    return mse**0.5

y_predictions = np.dot(X_test , theta)

In [55]:
# Testing the model

print(f"Cost : ", cost_function(y_predictions,y_test))
print(f"Mean squared error : ", mean_squared_error(y_predictions,y_test))
print(f"Root mean squared error : ", root_mean_squared_error(mean_squared_error(y_predictions,y_test)))
print(f"Mean absolute error : ", mean_absolute_error(y_predictions,y_test))
print(f"R2 score : ", r2_score(y_predictions,y_train))
print(f"These are the model predictions : ", y_predictions)

Cost :  3545578885.8817515
Mean squared error :  7091157771.763503
Root mean squared error :  84209.01241413239
Mean absolute error :  62990.865298953504
R2 score :  -0.4454217559103968
These are the model predictions :  [114958.91676382 150606.88213159 190393.71843432 ... 431500.77228105
 161245.49972224 193412.95559051]
