In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
#Loading data
data = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
m =  len(data)
m

1460

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming 'data' is your original DataFrame containing features and target
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
y = data['SalePrice']
X = data[features]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the features and transform them
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=features)

# Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_scaled_df, y, train_size=0.8, random_state=42)

# Optionally, you can also scale the target variable separately if needed
# y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))  # Example for scaling the target

# Print shapes to verify
print(f"X_val shape: {X_scaled.shape}, y_val shape: {y.shape}")

X_val shape: (1460, 7), y_val shape: (1460,)


In [5]:
X_scaled.shape, y.shape

((1460, 7), (1460,))

In [6]:
#Making all the necessary functions
def pick_weights_and_biases():
    n = len(features)
    w = np.random.rand(n)
    b = np.random.rand()
    return w, b

def calculate_absolute_cost(X_data, y_data, w, b):
    m = len(X_data)
    f_wb = np.dot(X_data, w) + b
    cost = np.sum(np.abs(f_wb - y_data)) / m  # MAE
    return cost

def calculate_squared_cost(X_data, y_data, w, b):
    m = len(X_data)
    f_wb = np.dot(X_data, w) + b
    cost = np.sum((f_wb - y_data) ** 2) / (2 * m) #mse
    return cost

def calculate_gradient(X_data, y_data, w, b):
    m = len(X_data)
    f_wb = np.dot(X_data, w) + b
    dj_dw = np.dot(X_data.T, (f_wb - y_data)) / m
    dj_db = np.sum(f_wb - y_data) / m  # Gradient with respect to bias
    return dj_dw, dj_db

def gradient_descent(X_data, y_data, w, b, alpha, iterations):
    for i in range(iterations):
        dj_dw, dj_db = calculate_gradient(X_data, y_data, w, b)
        w = w - (alpha * dj_dw)
        b = b - (alpha * dj_db)
        if i%(iterations/10) == 0:
            print(f" i = {i} ---> Absolute cost = {calculate_absolute_cost(X_data, y_data, w, b)}")
            print(f" i = {i} ---> Squared cost = {calculate_squared_cost(X_data, y_data, w, b)}")
    return w, b

In [7]:
# Adjusted learning rate and iterations
alpha = 0.01  # Reduce learning rate further
iterations = 25000  # Increase number of iterations

# Re-run gradient descent
w_initial, b_initial = pick_weights_and_biases()
print(f"Initial weights: {w_initial}")
print(f"Initial bias: {b_initial}")

print(f"Shape of X_train: {X_scaled.shape}")
print(f"Shape of y_train: {y.shape}")
print(f"Shape of w_initial: {w_initial.shape}")

initial_absolute_cost = calculate_absolute_cost(X_scaled, y, w_initial, b_initial)
initial_squared_cost = calculate_squared_cost(X_scaled, y, w_initial, b_initial)
print(f"Initial absolute Cost: {initial_absolute_cost}")
print(f"Initial squared Cost: {initial_absolute_cost}")

w_final, b_final = gradient_descent(X_scaled, y, w_initial, b_initial, alpha, iterations)

print(f"Final weights: {w_final}")
print(f"Final bias: {b_final}")

final_absolute_cost = calculate_absolute_cost(X_scaled, y, w_final, b_final)
final_squared_cost = calculate_squared_cost(X_scaled, y, w_final, b_final)
print(f"Final Absolute Cost: {final_absolute_cost}")
print(f"Final Squared Cost: {final_absolute_cost}")



Initial weights: [0.10574807 0.0088219  0.65603864 0.4605826  0.36700135 0.6542491
 0.40055195]
Initial bias: 0.890850989141894
Shape of X_train: (1460, 7)
Shape of y_train: (1460,)
Shape of w_initial: (7,)
Initial absolute Cost: 180920.3050394218
Initial squared Cost: 180920.3050394218
 i = 0 ---> Absolute cost = 179111.1019890276
 i = 0 ---> Squared cost = 19104035298.5418
 i = 2500 ---> Absolute cost = 27690.844684872205
 i = 2500 ---> Squared cost = 929241210.7285581
 i = 5000 ---> Absolute cost = 27668.15018377992
 i = 5000 ---> Squared cost = 929140039.9227828
 i = 7500 ---> Absolute cost = 27667.28949337488
 i = 7500 ---> Squared cost = 929139882.1224777
 i = 10000 ---> Absolute cost = 27667.255497894836
 i = 10000 ---> Squared cost = 929139881.8763037
 i = 12500 ---> Absolute cost = 27667.25415516522
 i = 12500 ---> Squared cost = 929139881.8759193
 i = 15000 ---> Absolute cost = 27667.25410213103
 i = 15000 ---> Squared cost = 929139881.8759187
 i = 17500 ---> Absolute cost = 

In [8]:
# Initialize an array to store predictions
predicted = np.zeros(len(X_scaled))

# Iterate over each instance in X_train
for i in range(len(X_scaled)):
    # Calculate the prediction using the trained weights and bias
    predicted[i] = np.dot(w_final, X_scaled[i]) + b_final

# Print or use the predictions as needed
print(predicted)


[219164.64359536 161876.1612803  217670.6719971  ... 215189.55784088
 131337.00066566 152147.85672781]


In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y, predicted)
mae = mean_absolute_error(y, predicted)

mse, mae

(1858279763.7518375, 27667.25409995024)

In [10]:
# path to file you will use for predictions
test_data_path = '/kaggle/input/home-data-for-ml-course/test.csv'

# read test data file using pandas
test_data = pd.read_csv(test_data_path)

# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called features
test_X = test_data[features]

# # Initialize an array to store predictions
# predicted_f = np.zeros(len(test_X))

# # Iterate over each instance in test_X
# for i in range(len(test_X)):
#     # Calculate the prediction using the trained weights and bias
#     predicted_f[i] = np.dot(w_final, test_X.iloc[i]) + b_final

# # Print or use the predictions as needed
# print(predicted_f)

# # You can now proceed to use these predictions for submission or further evaluation.


from sklearn.preprocessing import StandardScaler

# Assuming 'scaler' is the StandardScaler object fit on your training data
# Scale the test data using the same scaler
test_X_scaled = scaler.transform(test_X[features])
# Initialize an array to store predictions
predicted_f = np.zeros(len(test_X_scaled))

# Iterate over each instance in test_X_scaled
for i in range(len(test_X_scaled)):
    # Calculate the prediction using the trained weights and bias
    predicted_f[i] = np.dot(w_final, test_X_scaled[i]) + b_final

# Print or use the predictions as needed
print(predicted_f)


[120848.01748845 156876.0066946  202633.62101218 ... 136194.11768111
 142873.20620377 244806.2290663 ]


In [11]:
# Run the code to save predictions in the format used for competition scoring

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': predicted_f})
output.to_csv('submission.csv', index=False)