In [None]:
''' # AM

This notebook illustrates the following commonly used steps in data science, machine learning, and deep learning:

- Load real estate data into a pandas dataframe
- load a subset of the data 
- get basic statistics on data
- drop columns that you are not using as features in your prediction model
- switch between numpy arrays and pytorch tensors (datatypes)
- Train a model with and without rescaling data and calculate effect on root MSE
- compare R^2 and MSE loss as conceptually meaningful

- Train a small pytorch model to determine the market price of a house 

- data is available at https://naclai.xyz/MachineLearning_HackerDojo_2024.html

'''

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
import numpy as np

In [18]:
# DATA I/O
# Read the full listings file, then narrow it down in two steps
# (state -> city), saving each intermediate subset to its own CSV.
df = pd.read_csv('realtor-data.csv')

# Rows whose 'state' column equals 'California'
df_CA = df.loc[df['state'] == 'California']
df_CA.to_csv('Mini_Batch_Real_Estate_CA.csv')

# Rows of the California subset whose 'city' column equals 'San Jose'
df_CA_SanJose = df_CA.loc[df_CA['city'] == 'San Jose']
df_CA_SanJose.to_csv('Mini_Batch_Real_Estate_CA_SanJose.csv')

# Name of the column the model will predict
target_column = 'price'

# Data Pre-Processing
# Columns that are not numeric and therefore cannot be fed to the model as-is.
non_numerical_columns = ['status','city','state','prev_sold_date']

# Numeric identifier-like columns that are excluded for now.
# Open question: could they be meaningful? Can we identify which features
# are the best predictors of price?
non_meaningful_numerical_columns = ['brokered_by','street']

# Drop both column groups in one pass, then discard rows with missing values.
# The shape is printed before and after dropna() to show how many rows are lost.
df = df_CA_SanJose.drop(columns=non_numerical_columns + non_meaningful_numerical_columns)
print(df.shape)
df = df.dropna()

print(df.shape)
df.head()

(4943, 6)
(3742, 6)


Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size
1322526,1699888.0,3.0,2.0,0.14,95130.0,1467.0
1322552,2380888.0,4.0,2.0,0.17,95129.0,1973.0
1322777,1199000.0,3.0,3.0,0.03,95129.0,1348.0
1322922,4750000.0,4.0,2.0,0.63,95117.0,3917.0
1323034,1349000.0,2.0,2.0,0.04,95129.0,1149.0


In [None]:
from sklearn.preprocessing import StandardScaler

# 'df' is the cleaned DataFrame; 'target_column' names the single column to predict.

# Features: every column except the target, as a NumPy array.
X = df.drop(columns=target_column).to_numpy()
# Target: the column to predict, as a 1-D NumPy array.
y = df[target_column].to_numpy()

print(X.shape, 'X shape')

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Optional: standardize the features (helps with training stability).
# Left disabled here so the model is first trained on unscaled data; enable it
# to compare the two runs and to discuss the different feature distributions.
# fit_transform() learns the scaling statistics from the training set and applies
# them; transform() re-uses those same statistics on the test set, which is why
# the two calls are deliberately not symmetrical.
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [10]:
# Obtain a NumPy view of the training features for the summary statistics.
# On a fresh top-to-bottom run X_train is still the ndarray returned by
# train_test_split (the tensor conversion happens in a later cell), so calling
# .detach() unconditionally raises AttributeError. Accept both types so the cell
# works regardless of whether the tensor-conversion cell has already run.
print(type(X_train))
if isinstance(X_train, torch.Tensor):
    # detach() drops autograd tracking, cpu() moves the data to host memory
    X_train_array = X_train.detach().cpu().numpy()
else:
    X_train_array = np.asarray(X_train)
print(type(X_train_array))

<class 'torch.Tensor'>
<class 'numpy.ndarray'>


In [17]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size
1322526,1699888.0,3.0,2.0,0.14,95130.0,1467.0
1322552,2380888.0,4.0,2.0,0.17,95129.0,1973.0
1322777,1199000.0,3.0,3.0,0.03,95129.0,1348.0
1322922,4750000.0,4.0,2.0,0.63,95117.0,3917.0
1323034,1349000.0,2.0,2.0,0.04,95129.0,1149.0


In [25]:
# Remember we are predicting price!  So it's not in the X_train, it's only in the y
# Per-feature summary statistics: axis=0 reduces over the rows, giving one value per column.
# NOTE: the results are deliberately NOT stored in variables called `range`, `max`
# or `min`. Those names would shadow the Python builtins for the rest of the
# kernel session and break later code such as `for epoch in range(epochs)`.
average = np.mean(X_train_array, axis=0)
median = np.median(X_train_array, axis=0)
feature_range = np.ptp(X_train_array, axis=0)  # Peak-to-peak range (max - min)
std_dev = np.std(X_train_array, axis=0)  # Standard deviation per feature
# Redundant but insightful
feature_max = np.max(X_train_array, axis=0)  # Largest value per feature
feature_min = np.min(X_train_array, axis=0)  # Smallest value per feature


print('average',  average)
print('median',  median )
print('range', feature_range )
print('standard deviation', std_dev )
print('')
print('Redundant but insightful')
print('max ', feature_max )
print('min', feature_min)

# WHAT ARE THE SAME STATS AFTER RESCALING (OR ANY KIND OF TRANSFORMATION)

average [3.3294353e+00 2.4520547e+00 2.3548795e-01 9.5125180e+04 1.6781594e+03]
median [3.0000e+00 2.0000e+00 1.4000e-01 9.5125e+04 1.5600e+03]
range [9.000e+00 7.000e+00 1.020e+02 3.800e+01 7.246e+03]
standard deviation [  0.9486017   0.8642177   2.570666    8.01002   675.4175   ]

Redundant but insightful
max  [1.0000e+01 8.0000e+00 1.0200e+02 9.5148e+04 7.7020e+03]
min [1.000e+00 1.000e+00 0.000e+00 9.511e+04 4.560e+02]


In [3]:




# Wrap the train/test splits as float32 PyTorch tensors.
# Feature matrices keep their shape; targets are reshaped into a single column
# so they line up with the model's one-value-per-row output.
X_train, X_test = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(split, dtype=torch.float32).reshape(-1, 1)
    for split in (y_train, y_test)
)

# Define a simple neural network model
class SimpleNN(nn.Module):
    """Stack of four fully connected layers: input_dim -> 64 -> 64 -> 5 -> 1.

    No activation function is applied between the layers, so the network as a
    whole is a composition of affine maps (y = Ax + b applied four times).
    """

    def __init__(self, input_dim):
        """Create the layers; `input_dim` is the number of input features."""
        super().__init__()
        hidden_width, narrow_width = 64, 5
        self.fc1 = nn.Linear(input_dim, hidden_width)   # input_dim features in, 64 out
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, narrow_width)
        # If you add a layer here, also add it to the forward pass.
        self.fc4 = nn.Linear(narrow_width, 1)

    def forward(self, x):
        """Run `x` through fc1..fc4 in order; invoked by `model(x)`."""
        # To use a non-linear fit instead, wrap a layer's output in an
        # activation, e.g. torch.relu(self.fc1(x)); without one it is simply a
        # different (purely linear) fitting function.
        return self.fc4(self.fc3(self.fc2(self.fc1(x))))

# Build the network, pick the loss, and set up the optimizer
input_dim = X_train.size(1)  # number of columns of the feature tensor
print(input_dim, 'is the number of input dimensions, and its the number of columns/fields, number of features')
model = SimpleNN(input_dim=input_dim)
# Loss function: mean squared error between predictions and targets
criterion = nn.MSELoss(reduction='mean')
# Parameter search: Adam updates every trainable parameter of the model
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Fit the model: every epoch is one full pass over the whole training set
epochs = 200
model.train()  # training mode; nothing inside the loop switches it off
for epoch in range(epochs):
    # Clear gradients accumulated by the previous step before computing new ones
    optimizer.zero_grad()

    # Forward pass: predictions and their loss against the true targets
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass, then one optimizer update
    loss.backward()
    optimizer.step()

    # Report every 10th epoch: the raw loss and the same loss divided by 10**9
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}', f'Loss: {loss.item()/10**9}')

# Evaluate the model on the held-out test set
model.eval()  # switch to evaluation mode
with torch.no_grad():  # no gradients are needed for inference
    predictions = model(X_test)
    mse = mean_squared_error(y_test, predictions)
    # Root MSE is in the same units as the target. Take the root of the
    # untruncated value: int(mse) would floor any MSE below 1 to 0 and
    # report an RMSE of 0.0.
    rmse = np.sqrt(mse)
    print(f"Mean Squared Error on Test Set: {mse:.4f}", rmse)

(3742, 5) X shape
5 is the number of input dimensions, and its the number of columns/fields, number of features


  from .autonotebook import tqdm as notebook_tqdm


Epoch [10/200], Loss: 2166592765952.0000 Loss: 2166.592765952
Epoch [20/200], Loss: 1958599458816.0000 Loss: 1958.599458816
Epoch [30/200], Loss: 1638311526400.0000 Loss: 1638.3115264
Epoch [40/200], Loss: 1199004450816.0000 Loss: 1199.004450816
Epoch [50/200], Loss: 724472102912.0000 Loss: 724.472102912
Epoch [60/200], Loss: 418765864960.0000 Loss: 418.76586496
Epoch [70/200], Loss: 393036267520.0000 Loss: 393.03626752
Epoch [80/200], Loss: 396524683264.0000 Loss: 396.524683264
Epoch [90/200], Loss: 381144858624.0000 Loss: 381.144858624
Epoch [100/200], Loss: 382284005376.0000 Loss: 382.284005376
Epoch [110/200], Loss: 380950642688.0000 Loss: 380.950642688
Epoch [120/200], Loss: 380312322048.0000 Loss: 380.312322048
Epoch [130/200], Loss: 380103491584.0000 Loss: 380.103491584
Epoch [140/200], Loss: 379751858176.0000 Loss: 379.751858176
Epoch [150/200], Loss: 379495841792.0000 Loss: 379.495841792
Epoch [160/200], Loss: 379215249408.0000 Loss: 379.215249408
Epoch [170/200], Loss: 378941