# Linear regression from scratch

## First pass: ignoring categorical data (dropping those columns)

In [1]:
import numpy as np
import pandas as pd

Preprocess train set

In [2]:
# Load the raw training data
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

# Keep only the numeric features: every object/category column is discarded
categorical_columns = train.select_dtypes(include=['object', 'category']).columns
train = train.drop(columns=categorical_columns)

In [3]:
# Count the NaN entries per column and keep only the columns that have any
missing_values = train.isna().sum().loc[lambda counts: counts > 0]

print("Columns with missing values:")
print(missing_values)

Columns with missing values:
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64


In this case, we impute every feature (column) that has missing values with 0.

In [4]:
# Turn any literal 'nan' strings into real NaN, impute every missing value
# with 0, then cast the whole frame to 64-bit integers
train = (
    train
    .replace('nan', np.nan)
    .fillna(0)
    .astype('int64')
)

# Separate the target from the features; Id is removed from the features too
y_train = train["SalePrice"]
X_train = train.drop(columns=["SalePrice", "Id"])

In [7]:
# Check that all data types match
#print(X_train.dtypes)

Preprocess test set

In [5]:
# Load the raw test data
X_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Discard every object/category column, then the Id column
categorical_columns = X_test.select_dtypes(include=['object', 'category']).columns
X_test = X_test.drop(columns=categorical_columns).drop(columns=["Id"])

In [6]:
# Count the NaN entries per column and keep only the columns that have any
missing_values = X_test.isna().sum().loc[lambda counts: counts > 0]

print("Columns with missing values:")
print(missing_values)

Columns with missing values:
LotFrontage     227
MasVnrArea       15
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
GarageYrBlt      78
GarageCars        1
GarageArea        1
dtype: int64


In [7]:
# Turn any literal 'nan' strings into real NaN, impute every missing value
# with 0, then cast the whole frame to 64-bit integers
X_test = (
    X_test
    .replace('nan', np.nan)
    .fillna(0)
    .astype('int64')
)

In [14]:
# Check that all data types match
#print(X_test.dtypes)

Feature scaling

In [8]:
# Preview the first five rows of the training features before scaling
X_train.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65,8450,7,5,2003,2003,196,706,0,...,548,0,61,0,0,0,0,0,2,2008
1,20,80,9600,6,8,1976,1976,0,978,0,...,460,298,0,0,0,0,0,0,5,2007
2,60,68,11250,7,5,2001,2002,162,486,0,...,608,0,42,0,0,0,0,0,9,2008
3,70,60,9550,7,5,1915,1970,0,216,0,...,642,0,35,272,0,0,0,0,2,2006
4,60,84,14260,8,5,2000,2000,350,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data only
scaler = StandardScaler().fit(X_train)

# Apply the same (train-derived) scaling to both the training and test features
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Show the first target value before it is scaled
y_train.head(n=1)

0    208500
Name: SalePrice, dtype: int64

In [11]:
# Standardise the target with its own scaler so predictions can be mapped back later
y_scaler = StandardScaler()

# The scaler works on 2-D input, so view the target as a single column first
y_train_2d = np.array(y_train).reshape((-1, 1))

# Scale, then return to a flat 1-D array
y_train = y_scaler.fit_transform(y_train_2d).flatten()

In [12]:
# First target value after standardisation
y_train[0]

0.34727321973650555

In [13]:
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent on the MSE.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    n_iterations : int, default 1000
        Number of gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Target values; a column vector is flattened to 1-D.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not hold one value per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so that an (n, 1) column cannot broadcast against the (n,)
        # predictions into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} values"
            )

        # Initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent
        for i in range(self.n_iterations):
            y_predicted = np.dot(X, self.weights) + self.bias
            residuals = y_predicted - y

            # Gradients of the (halved) mean squared error
            dw = np.dot(X.T, residuals) / n_samples
            db = residuals.mean()

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Report the loss of the predictions made at the start of every
            # 100th iteration (i.e. before this iteration's update).
            if i % 100 == 0:
                loss = np.mean(residuals ** 2)
                print(f"Iteration {i}: Loss = {loss}")

    def predict(self, X):
        """Return ``X @ weights + bias`` for each row of ``X``."""
        return np.dot(X, self.weights) + self.bias

In [14]:
# Train the from-scratch model on the standardized features and target
model = LinearRegression(n_iterations=1000, learning_rate=0.1)
model.fit(X=X_train, y=y_train)

# Predict the test-set targets; these are still on the standardized scale
y_pred_scaled = model.predict(X=X_test)

Iteration 0: Loss = 1.000000000000002
Iteration 100: Loss = 0.18394708801130707
Iteration 200: Loss = 0.18363631820679843
Iteration 300: Loss = 0.18360703657299574
Iteration 400: Loss = 0.18360328781236682
Iteration 500: Loss = 0.1836027815305142
Iteration 600: Loss = 0.18360271184218835
Iteration 700: Loss = 0.18360270217370214
Iteration 800: Loss = 0.1836027008277
Iteration 900: Loss = 0.18360270064002768


In [15]:
print(f'y[0] pre-scaling: {y_pred_scaled[0]}')

# The target scaler works on 2-D input, so view the predictions as one column
y_pred_2d = y_pred_scaled.reshape(-1, 1)

# Undo the target standardisation and flatten straight back to a 1-D array
y_pred_original = y_scaler.inverse_transform(y_pred_2d).flatten()
print(f'y[0] post-scaling: {y_pred_original[0]}')

y[0] pre-scaling: -0.7732380604357663
y[0] post-scaling: 119514.26962348787


In [16]:
# Inspect the predictions after the inverse transform of the target scaler
y_pred_original

array([119514.26962349, 152368.50461324, 172896.62593489, ...,
       179828.18462177, 118988.40558454, 258064.6392571 ])

Submit predictions!

In [17]:
# Take the Ids from the test file itself instead of hard-coding the range, so
# each prediction stays paired with the row it was computed from (no rows are
# dropped from the test data during preprocessing, so the order is unchanged).
test_ids = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    usecols=["Id"],
)["Id"]
assert len(test_ids) == len(y_pred_original), (
    f"{len(y_pred_original)} predictions for {len(test_ids)} test rows"
)

# Create a DataFrame for the submission
submission = pd.DataFrame({
    'Id': test_ids.to_numpy(),
    'SalePrice': y_pred_original
})

# Ensure 'Id' is integer type
submission['Id'] = submission['Id'].astype(int)

# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

# Print the first few rows to verify
print(submission.head())

# Print some information about the submission file
print("\nSubmission file info:")
print(f"Number of predictions: {len(submission)}")
print(f"ID range: {submission['Id'].min()} to {submission['Id'].max()}")
print(f"SalePrice range: ${submission['SalePrice'].min():.2f} to ${submission['SalePrice'].max():.2f}")

     Id      SalePrice
0  1461  119514.269623
1  1462  152368.504613
2  1463  172896.625935
3  1464  200219.921394
4  1465  194974.933609

Submission file info:
Number of predictions: 1459
ID range: 1461 to 2919
SalePrice range: $340.96 to $633650.24
