In [1]:
from datasets import Dataset as LDataset
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

#### Loading the dataset

In [2]:
dataset = LDataset.from_csv("https://www.kaggle.com/api/v1/datasets/download/camnugent/california-housing-prices", cache_dir="/scratch/singh/hf/datasets/")

In [3]:
dataset

Dataset({
    features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'],
    num_rows: 20640
})

In [4]:
dataset[1]

{'longitude': -122.22,
 'latitude': 37.86,
 'housing_median_age': 21.0,
 'total_rooms': 7099.0,
 'total_bedrooms': 1106.0,
 'population': 2401.0,
 'households': 1138.0,
 'median_income': 8.3014,
 'median_house_value': 358500.0,
 'ocean_proximity': 'NEAR BAY'}

In [5]:
dataset[0].keys()

dict_keys(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'])

#### Separating Numerical Featuress and Categorical Features

In [6]:
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

In [7]:
categorical_features = ['ocean_proximity']

In [8]:
target_feature = 'median_house_value'

In [9]:
# for idx, row in enumerate(dataset):
#     if any(elem is None for elem in list(row.values())):
#         print(row)
#         print(idx)

In [10]:
features = list()
targets = list()

##### Total Bedrooms has None values, dealing with them

In [11]:
sum_total_bedrooms = 0
count_valid_rows = 0
for row in dataset:
    if row["total_bedrooms"]:
        sum_total_bedrooms += row["total_bedrooms"]
        count_valid_rows += 1
mean_total_bedrooms = sum_total_bedrooms/count_valid_rows

In [12]:
mean_total_bedrooms

537.8705525375618

Now, let's collect all the numerical features into a single list

In [13]:
for row in dataset:
    feature_values = list()
    for feat in numerical_features:
        if feat == "total_bedrooms":
            if row[feat]:
                feature_values.append(row[feat])
            else:
                feature_values.append(mean_total_bedrooms)
        else:
            feature_values.append(row[feat])
    features.append(feature_values)
    targets.append(row[target_feature])

In [14]:
features[0]

[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252]

In [15]:
targets[0]

452600.0

#### Now we need to handle the categorical feature

In [16]:
ocean_proximity = [row['ocean_proximity'] for row in dataset]
ocean_proximity[:5]

['NEAR BAY', 'NEAR BAY', 'NEAR BAY', 'NEAR BAY', 'NEAR BAY']

##### Let's look into what are the unique categories

In [17]:
unique_categories = list(set(ocean_proximity))
unique_categories

['ISLAND', 'NEAR OCEAN', '<1H OCEAN', 'NEAR BAY', 'INLAND']

##### Let's create a mapping from the category to the index for the category

In [18]:
category_to_onehot = dict()
for idx, category in enumerate(unique_categories):
    category_to_onehot[category] = idx

In [19]:
category_to_onehot

{'ISLAND': 0, 'NEAR OCEAN': 1, '<1H OCEAN': 2, 'NEAR BAY': 3, 'INLAND': 4}

##### Creating One hot encoded vectors

In [20]:
onehot_vectors = torch.eye(len(unique_categories))
onehot_vectors

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

##### Now, for all the values of ocean_proximity, we can add the respective one hot encoded vector for it in the features 

In [21]:
for i, category in enumerate(ocean_proximity):
    features[i].extend(onehot_vectors[category_to_onehot[category]].tolist())

In [22]:
features[290]

[-122.16,
 37.77,
 47.0,
 1256.0,
 537.8705525375618,
 570.0,
 218.0,
 4.375,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0]

##### Finally, we will convert these into tensors

In [23]:
X = torch.tensor(features, dtype=torch.float32)

In [24]:
X

tensor([[-122.2300,   37.8800,   41.0000,  ...,    0.0000,    1.0000,
            0.0000],
        [-122.2200,   37.8600,   21.0000,  ...,    0.0000,    1.0000,
            0.0000],
        [-122.2400,   37.8500,   52.0000,  ...,    0.0000,    1.0000,
            0.0000],
        ...,
        [-121.2200,   39.4300,   17.0000,  ...,    0.0000,    0.0000,
            1.0000],
        [-121.3200,   39.4300,   18.0000,  ...,    0.0000,    0.0000,
            1.0000],
        [-121.2400,   39.3700,   16.0000,  ...,    0.0000,    0.0000,
            1.0000]])

In [25]:
y = torch.tensor(targets, dtype=torch.float32).view(-1, 1)
y

tensor([[452600.],
        [358500.],
        [352100.],
        ...,
        [ 92300.],
        [ 84700.],
        [ 89400.]])

#### The final part for the preprocessing is the normalization of numerical features

In [26]:
# first 8 columns are the numerical features, 
# last 5 are the one hot encoded features for ocean_proximity
numeric_features = X[:, :8] 
# We are calculating mean and the standard deviation 
# of the numerical features so that we can normalize them
numeric_means = numeric_features.mean(dim=0, keepdim=True)
numeric_stds = numeric_features.std(dim=0, keepdim=True)

In [27]:
X[:, :8] = (numeric_features - numeric_means) / numeric_stds

In [28]:
X

tensor([[-1.3278,  1.0525,  0.9821,  ...,  0.0000,  1.0000,  0.0000],
        [-1.3228,  1.0432, -0.6070,  ...,  0.0000,  1.0000,  0.0000],
        [-1.3328,  1.0385,  1.8561,  ...,  0.0000,  1.0000,  0.0000],
        ...,
        [-0.8237,  1.7782, -0.9248,  ...,  0.0000,  0.0000,  1.0000],
        [-0.8736,  1.7782, -0.8454,  ...,  0.0000,  0.0000,  1.0000],
        [-0.8337,  1.7501, -1.0043,  ...,  0.0000,  0.0000,  1.0000]])

#### Preparing dataset and dataloaders

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=9)

In [31]:
# Define custom PyTorch dataset
class RegressionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [32]:
# Create datasets and dataloaders
train_dataset = RegressionDataset(torch.tensor(X_train, dtype=torch.float32),
                                   torch.tensor(y_train, dtype=torch.float32))
val_dataset = RegressionDataset(torch.tensor(X_val, dtype=torch.float32),
                                 torch.tensor(y_val, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

  train_dataset = RegressionDataset(torch.tensor(X_train, dtype=torch.float32),
  torch.tensor(y_train, dtype=torch.float32))
  val_dataset = RegressionDataset(torch.tensor(X_val, dtype=torch.float32),
  torch.tensor(y_val, dtype=torch.float32))


#### Defining the Model

In [33]:
class SimpleModel(nn.Module):
    def __init__(self, input_dim):
        super(SimpleModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 256)  # Single layer
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(256, 1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [34]:
X.shape[1]

13

In [35]:
model = SimpleModel(input_dim=X.shape[1])

#### Training with GPU and Model Checkpoint Handling

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


SimpleModel(
  (layer1): Linear(in_features=13, out_features=256, bias=True)
  (relu): ReLU()
  (layer2): Linear(in_features=256, out_features=1, bias=True)
)

In [37]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

best_val_loss = float('inf')
checkpoint_path = "best_model.pth"

In [38]:
for epoch in range(5):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        predictions = model(X_batch)
        # Check for NaN in predictions
        if torch.isnan(predictions).any():
            print(f"NaN detected in predictions at epoch {epoch}")
            break
        loss = criterion(predictions, y_batch)
        loss.backward()
        ## was getting exploding gradients problem with this model, needed to do gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()
        # print(train_loss)

    train_loss /= len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    # Save the best model checkpoint
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss
        }, checkpoint_path)

    print(f"Epoch {epoch+1}/{5}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Epoch 1/5, Train Loss: 55966081889.2403, Val Loss: 56631219096.8062
Epoch 2/5, Train Loss: 55951132691.8450, Val Loss: 56611042089.6744
Epoch 3/5, Train Loss: 55925605364.0930, Val Loss: 56579500428.8992
Epoch 4/5, Train Loss: 55888397236.5891, Val Loss: 56536265116.7752
Epoch 5/5, Train Loss: 55839717816.5581, Val Loss: 56481397148.7752


In [39]:
# Load the best checkpoint
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
print(f"Loaded model from epoch {checkpoint['epoch']+1} with validation loss {checkpoint['val_loss']:.4f}")

Loaded model from epoch 5 with validation loss 56481397148.7752


  checkpoint = torch.load(checkpoint_path)


#### Now, let's do the device comparision

In [40]:
import time

In [41]:
def train_model_on_device(model, data_loader, intended_device, epochs=5):
    """
    Trains the model on the specified device and returns the training time.
    """
    device = torch.device(intended_device)
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()

    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    end_time = time.time()
    training_time = end_time - start_time

    return training_time

In [42]:
input_dim = X.shape[1]
model_cpu = SimpleModel(input_dim)
model_gpu = SimpleModel(input_dim)

# Train on CPU
print("Training on CPU...")
cpu_time = train_model_on_device(model_cpu, train_loader, intended_device="cpu")
print(f"CPU Training Time: {cpu_time:.2f} seconds")

# Train on GPU (if available)
if torch.cuda.is_available():
    print("\nTraining on GPU...")
    gpu_time = train_model_on_device(model_gpu, train_loader, intended_device="cuda")
    print(f"GPU Training Time: {gpu_time:.2f} seconds")

    # Compare CPU and GPU times
    speedup = cpu_time / gpu_time
    print(f"\nGPU is approximately {speedup:.2f}x faster than CPU.")
else:
    print("\nGPU not available. Skipping GPU comparison.")

Training on CPU...
Epoch 1/5, Loss: 55966046720.0000
Epoch 2/5, Loss: 55951157505.9845
Epoch 3/5, Loss: 55925250532.2171
Epoch 4/5, Loss: 55887620846.1395
Epoch 5/5, Loss: 55838235429.7054
CPU Training Time: 2.25 seconds

Training on GPU...
Epoch 1/5, Loss: 55966105639.6899
Epoch 2/5, Loss: 55951226864.1240
Epoch 3/5, Loss: 55925403425.7364
Epoch 4/5, Loss: 55888340845.1473
Epoch 5/5, Loss: 55839482848.2481
GPU Training Time: 7.15 seconds

GPU is approximately 0.31x faster than CPU.
