In [5]:
from datasets import Dataset as LDataset
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

#### Loading the dataset

In [6]:
dataset = LDataset.from_csv("https://www.kaggle.com/api/v1/datasets/download/camnugent/california-housing-prices", cache_dir="/scratch/singh/hf/datasets/")

In [7]:
dataset

Dataset({
    features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'],
    num_rows: 20640
})

In [8]:
dataset[1]

{'longitude': -122.22,
 'latitude': 37.86,
 'housing_median_age': 21.0,
 'total_rooms': 7099.0,
 'total_bedrooms': 1106.0,
 'population': 2401.0,
 'households': 1138.0,
 'median_income': 8.3014,
 'median_house_value': 358500.0,
 'ocean_proximity': 'NEAR BAY'}

In [9]:
dataset[0].keys()

dict_keys(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'])

#### Separating Numerical Featuress and Categorical Features

In [10]:
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

In [11]:
categorical_features = ['ocean_proximity']

In [12]:
target_feature = 'median_house_value'

In [13]:
# for idx, row in enumerate(dataset):
#     if any(elem is None for elem in list(row.values())):
#         print(row)
#         print(idx)

In [14]:
features = list()
targets = list()

##### Total Bedrooms has None values, dealing with them

In [15]:
sum_total_bedrooms = 0
count_valid_rows = 0
for row in dataset:
    if row["total_bedrooms"]:
        sum_total_bedrooms += row["total_bedrooms"]
        count_valid_rows += 1
mean_total_bedrooms = sum_total_bedrooms/count_valid_rows

In [16]:
mean_total_bedrooms

537.8705525375618

Now, let's collect all the numerical features into a single list

In [17]:
for row in dataset:
    feature_values = list()
    for feat in numerical_features:
        if feat == "total_bedrooms":
            if row[feat]:
                feature_values.append(row[feat])
            else:
                feature_values.append(mean_total_bedrooms)
        else:
            feature_values.append(row[feat])
    features.append(feature_values)
    targets.append(row[target_feature])

In [18]:
features[0]

[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252]

In [19]:
targets[0]

452600.0

#### Now we need to handle the categorical feature

In [20]:
ocean_proximity = [row['ocean_proximity'] for row in dataset]
ocean_proximity[:5]

['NEAR BAY', 'NEAR BAY', 'NEAR BAY', 'NEAR BAY', 'NEAR BAY']

##### Let's look into what are the unique categories

In [21]:
unique_categories = list(set(ocean_proximity))
unique_categories

['<1H OCEAN', 'ISLAND', 'NEAR BAY', 'INLAND', 'NEAR OCEAN']

##### Let's create a mapping from the category to the index for the category

In [22]:
category_to_onehot = dict()
for idx, category in enumerate(unique_categories):
    category_to_onehot[category] = idx

In [23]:
category_to_onehot

{'<1H OCEAN': 0, 'ISLAND': 1, 'NEAR BAY': 2, 'INLAND': 3, 'NEAR OCEAN': 4}

##### Creating One hot encoded vectors

In [24]:
onehot_vectors = torch.eye(len(unique_categories))
onehot_vectors

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

##### Now, for all the values of ocean_proximity, we can add the respective one hot encoded vector for it in the features 

In [25]:
for i, category in enumerate(ocean_proximity):
    #category_to_onehot[category] 获取当前类别对应的独热编码索引。
    #onehot_vectors[...] 根据这个索引获取对应的独热向量。
    #.tolist() 将独热向量转换为列表。
    #features[i].extend(...) 将独热向量添加到对应行的特征列表末尾。
    features[i].extend(onehot_vectors[category_to_onehot[category]].tolist())

In [26]:
features[290]

[-122.16,
 37.77,
 47.0,
 1256.0,
 537.8705525375618,
 570.0,
 218.0,
 4.375,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0]

##### Finally, we will convert these into tensors

In [27]:
X = torch.tensor(features, dtype=torch.float32)

In [28]:
X

tensor([[-122.2300,   37.8800,   41.0000,  ...,    1.0000,    0.0000,
            0.0000],
        [-122.2200,   37.8600,   21.0000,  ...,    1.0000,    0.0000,
            0.0000],
        [-122.2400,   37.8500,   52.0000,  ...,    1.0000,    0.0000,
            0.0000],
        ...,
        [-121.2200,   39.4300,   17.0000,  ...,    0.0000,    1.0000,
            0.0000],
        [-121.3200,   39.4300,   18.0000,  ...,    0.0000,    1.0000,
            0.0000],
        [-121.2400,   39.3700,   16.0000,  ...,    0.0000,    1.0000,
            0.0000]])

In [29]:
#.view(-1, 1):重塑（reshape）张量的维度。-1 表示自动计算这个维度的大小，以匹配总元素数。
#1 表示将张量转换为列向量。
y = torch.tensor(targets, dtype=torch.float32).view(-1, 1)
y

tensor([[452600.],
        [358500.],
        [352100.],
        ...,
        [ 92300.],
        [ 84700.],
        [ 89400.]])

#### The final part for the preprocessing is the normalization of numerical features

In [30]:
# first 8 columns are the numerical features, 
# last 5 are the one hot encoded features for ocean_proximity
# X[:, :8] 表示选择所有行（:）和前8列（:8）。
numeric_features = X[:, :8] 
# We are calculating mean and the standard deviation 
# of the numerical features so that we can normalize them
# dim=0 表示沿着第0维（列方向）计算均值。
# keepdim=True 保持结果的维度，使结果为 (1, 8) 而不是 (8,)。
numeric_means = numeric_features.mean(dim=0, keepdim=True)
numeric_stds = numeric_features.std(dim=0, keepdim=True)

In [31]:
X[:, :8] = (numeric_features - numeric_means) / numeric_stds

In [32]:
X

tensor([[-1.3278,  1.0525,  0.9821,  ...,  1.0000,  0.0000,  0.0000],
        [-1.3228,  1.0432, -0.6070,  ...,  1.0000,  0.0000,  0.0000],
        [-1.3328,  1.0385,  1.8561,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [-0.8237,  1.7782, -0.9248,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8736,  1.7782, -0.8454,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8337,  1.7501, -1.0043,  ...,  0.0000,  1.0000,  0.0000]])

#### Preparing dataset and dataloaders

In [40]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl (41.2 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.5.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
# 这个函数将数据集随机分割成训练集和测试集（在这里用作验证集）。
# test_size=0.2：指定测试集（这里是验证集）的大小为总数据集的 20%。
"""X_train：训练集特征，包含 80% 的数据。
X_val：验证集特征，包含 20% 的数据。
y_train：训练集标签，对应 X_train。
y_val：验证集标签，对应 X_val。"""
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=9)

In [42]:
# Define custom PyTorch dataset
class RegressionDataset(Dataset):
    def __init__(self, X, y):
        # X 通常是特征数据（输入）。
        # y 通常是目标变量（输出）
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [43]:
# Create datasets and dataloaders
train_dataset = RegressionDataset(torch.tensor(X_train, dtype=torch.float32),
                                   torch.tensor(y_train, dtype=torch.float32))
val_dataset = RegressionDataset(torch.tensor(X_val, dtype=torch.float32),
                                 torch.tensor(y_val, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

  train_dataset = RegressionDataset(torch.tensor(X_train, dtype=torch.float32),
  torch.tensor(y_train, dtype=torch.float32))
  val_dataset = RegressionDataset(torch.tensor(X_val, dtype=torch.float32),
  torch.tensor(y_val, dtype=torch.float32))


#### Defining the Model

In [44]:
class SimpleModel(nn.Module):
    def __init__(self, input_dim):
        super(SimpleModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 256)  # Single layer
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(256, 1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [45]:
# features的个数
X.shape[1]

13

In [46]:
model = SimpleModel(input_dim=X.shape[1])

#### Training with GPU and Model Checkpoint Handling

In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


SimpleModel(
  (layer1): Linear(in_features=13, out_features=256, bias=True)
  (relu): ReLU()
  (layer2): Linear(in_features=256, out_features=1, bias=True)
)

In [48]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
# 初始化最佳验证损失为正无穷大。这是为了在训练过程中跟踪和保存性能最好的模型。
best_val_loss = float('inf')
# 定义了保存最佳模型的文件路径。
#.pth 是 PyTorch 模型常用的文件扩展名。
checkpoint_path = "best_model.pth"

In [50]:
for epoch in range(5):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        predictions = model(X_batch)
        # Check for NaN in predictions
        # 检查模型的预测结果中是否有 NaN（Not a Number）值。
        # 如果检测到 NaN，打印警告信息并中断当前 epoch。
        # 这是一个重要的安全检查，因为 NaN 可能表示模型训练出现了问题（如梯度爆炸）。
        if torch.isnan(predictions).any():
            print(f"NaN detected in predictions at epoch {epoch}")
            break
        loss = criterion(predictions, y_batch)
        loss.backward()
        ## was getting exploding gradients problem with this model, needed to do gradient clipping
        # 使用梯度裁剪来防止梯度爆炸问题。
        # max_norm=1.0 设置梯度的最大范数为 1.0。
        # 如果梯度的范数超过这个值，它们会被缩放到这个范数。
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()
        # print(train_loss)
    # 如果您的数据集有 1000 个样本，批次大小为 32，那么 len(train_loader) 通常会是 ceil(1000/32) = 32（向上取整）
   # a = a / b
    train_loss /= len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    # Save the best model checkpoint
    # 当发现更好的模型时，将其保存为检查点。
    '''保存的内容包括：
    当前轮次（epoch）
    模型的状态字典（包含所有层的权重和偏置）
    优化器的状态字典（包含优化器的状态）
    最佳验证损失
    checkpoint_path 是之前定义的保存路径。
    '''
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss
        }, checkpoint_path)

    print(f"Epoch {epoch+1}/{5}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Epoch 1/5, Train Loss: 55966105024.4961, Val Loss: 56631327339.1628
Epoch 2/5, Train Loss: 55951323016.9302, Val Loss: 56611359275.6589
Epoch 3/5, Train Loss: 55925986296.0620, Val Loss: 56580129625.3023
Epoch 4/5, Train Loss: 55889219790.3876, Val Loss: 56537293339.7829
Epoch 5/5, Train Loss: 55840925422.1395, Val Loss: 56482729221.9535


In [None]:
# Load the best checkpoint
checkpoint = torch.load(checkpoint_path)
# 将保存的模型状态加载到当前模型中。这会恢复模型的所有参数（权重和偏置）。
model.load_state_dict(checkpoint['model_state_dict'])
# 将保存的优化器状态加载到当前优化器中。
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
print(f"Loaded model from epoch {checkpoint['epoch']+1} with validation loss {checkpoint['val_loss']:.4f}")

Loaded model from epoch 5 with validation loss 56482729221.9535


  checkpoint = torch.load(checkpoint_path)


#### Now, let's do the device comparision

In [52]:
import time

In [53]:
def train_model_on_device(model, data_loader, intended_device, epochs=5):
    """
    Trains the model on the specified device and returns the training time.
    """
    device = torch.device(intended_device)
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()
    # 它返回当前时间的时间戳，表示为从 Unix 纪元（1970年1月1日 00:00:00 UTC）开始的秒数。
    # 标记了某个操作或过程的开始时间点。
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    end_time = time.time()
    training_time = end_time - start_time

    return training_time

In [54]:
input_dim = X.shape[1]
model_cpu = SimpleModel(input_dim)
model_gpu = SimpleModel(input_dim)

# Train on CPU
print("Training on CPU...")
cpu_time = train_model_on_device(model_cpu, train_loader, intended_device="cpu")
print(f"CPU Training Time: {cpu_time:.2f} seconds")

# Train on GPU (if available)
if torch.cuda.is_available():
    print("\nTraining on GPU...")
    gpu_time = train_model_on_device(model_gpu, train_loader, intended_device="cuda")
    print(f"GPU Training Time: {gpu_time:.2f} seconds")

    # Compare CPU and GPU times
    speedup = cpu_time / gpu_time
    print(f"\nGPU is approximately {speedup:.2f}x faster than CPU.")
else:
    print("\nGPU not available. Skipping GPU comparison.")

Training on CPU...
Epoch 1/5, Loss: 55965907646.5116
Epoch 2/5, Loss: 55950732811.9070
Epoch 3/5, Loss: 55924951794.1085
Epoch 4/5, Loss: 55887743785.6744
Epoch 5/5, Loss: 55838962898.3566
CPU Training Time: 2.13 seconds

Training on GPU...
Epoch 1/5, Loss: 55966073947.2868
Epoch 2/5, Loss: 55951081856.9922
Epoch 3/5, Loss: 55925628769.2403
Epoch 4/5, Loss: 55888597734.2016
Epoch 5/5, Loss: 55839787619.2248
GPU Training Time: 4.00 seconds

GPU is approximately 0.53x faster than CPU.
