In [130]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_squared_error


# Load and Assessing Data

In [131]:
# load

df_train = pd.read_csv("dataset/california_housing_train.csv")
df_test = pd.read_csv("dataset/california_housing_train.csv")



In [132]:
df_train.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
5,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
6,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0
7,-114.59,34.83,41.0,812.0,168.0,375.0,158.0,1.7083,48500.0
8,-114.59,33.61,34.0,4789.0,1175.0,3134.0,1056.0,2.1782,58400.0
9,-114.6,34.83,46.0,1497.0,309.0,787.0,271.0,2.1908,48100.0


In [133]:
df_test.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
5,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
6,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0
7,-114.59,34.83,41.0,812.0,168.0,375.0,158.0,1.7083,48500.0
8,-114.59,33.61,34.0,4789.0,1175.0,3134.0,1056.0,2.1782,58400.0
9,-114.6,34.83,46.0,1497.0,309.0,787.0,271.0,2.1908,48100.0


# Data Preprocessing

In [134]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [135]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [136]:
df_train[["median_income"]]

Unnamed: 0,median_income
0,1.4936
1,1.8200
2,1.6509
3,3.1917
4,1.9250
...,...
16995,2.3571
16996,2.5179
16997,3.0313
16998,1.9797


In [137]:
df_train.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.925208,-0.11425,0.04701,0.071802,0.101674,0.059628,-0.015485,-0.044982
latitude,-0.925208,1.0,0.016454,-0.038773,-0.069373,-0.111261,-0.074902,-0.080303,-0.144917
housing_median_age,-0.11425,0.016454,1.0,-0.360984,-0.320434,-0.29589,-0.302754,-0.115932,0.106758
total_rooms,0.04701,-0.038773,-0.360984,1.0,0.928403,0.86017,0.919018,0.195383,0.130991
total_bedrooms,0.071802,-0.069373,-0.320434,0.928403,1.0,0.881169,0.98092,-0.013495,0.045783
population,0.101674,-0.111261,-0.29589,0.86017,0.881169,1.0,0.909247,-0.000638,-0.02785
households,0.059628,-0.074902,-0.302754,0.919018,0.98092,0.909247,1.0,0.007644,0.061031
median_income,-0.015485,-0.080303,-0.115932,0.195383,-0.013495,-0.000638,0.007644,1.0,0.691871
median_house_value,-0.044982,-0.144917,0.106758,0.130991,0.045783,-0.02785,0.061031,0.691871,1.0


In [138]:
df_train["avg population per household"] = df_train["population"] / df_train["households"]
df_test["avg population per household"] = df_test["population"] / df_test["households"]





In [139]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   longitude                     17000 non-null  float64
 1   latitude                      17000 non-null  float64
 2   housing_median_age            17000 non-null  float64
 3   total_rooms                   17000 non-null  float64
 4   total_bedrooms                17000 non-null  float64
 5   population                    17000 non-null  float64
 6   households                    17000 non-null  float64
 7   median_income                 17000 non-null  float64
 8   median_house_value            17000 non-null  float64
 9   avg population per household  17000 non-null  float64
dtypes: float64(10)
memory usage: 1.3 MB


In [140]:

X_train = df_train.drop(columns="median_house_value", axis=1).values
y_train = df_train[["median_house_value"]].values

X_test = df_test.drop(columns="median_house_value").values
y_test = df_test[["median_house_value"]].values


In [141]:
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
X_train = scaler.fit_transform(X_train)


In [142]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [143]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Defining Model

In [144]:
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
    )

In [145]:
loss_fn = nn.MSELoss()
optimize = optim.Adam(model.parameters(), lr=0.01)

In [146]:
epochs = 20
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        preds = model(X_batch)
        loss = loss_fn(preds, y_batch)
        optimize.zero_grad()
        loss.backward()
        optimize.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss/len(train_loader)
    print(f"epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch [1/20], Loss: 42584904315.1880
epoch [2/20], Loss: 17427455503.3985
epoch [3/20], Loss: 14807614577.5639
epoch [4/20], Loss: 14011805407.2782
epoch [5/20], Loss: 13717881832.9023
epoch [6/20], Loss: 13612381747.9699
epoch [7/20], Loss: 13531415652.0902
epoch [8/20], Loss: 13506828927.0376
epoch [9/20], Loss: 13499905919.0376
epoch [10/20], Loss: 13480481822.7970
epoch [11/20], Loss: 13477512220.8722
epoch [12/20], Loss: 13471936473.5038
epoch [13/20], Loss: 13456464393.6241
epoch [14/20], Loss: 13461192351.7594
epoch [15/20], Loss: 13474087930.2256
epoch [16/20], Loss: 13468445601.6842
epoch [17/20], Loss: 13454663089.0827
epoch [18/20], Loss: 13457218021.0526
epoch [19/20], Loss: 13466415527.4586
epoch [20/20], Loss: 13462173990.4962


In [147]:
model.eval()
with torch.no_grad():
    y_pred_list = []
    y_true_list = []

    for X_batch, y_batch in test_loader:
        preds = model(X_batch)
        y_pred_list.append(preds.numpy())
        y_true_list.append(y_batch.numpy())



    y_pred = torch.tensor([item for batch in y_pred_list for item in batch])
    y_true = torch.tensor([item for batch in y_true_list for item in batch])

    mse = loss_fn(y_pred, y_true)
    rmse = mse ** 0.5
    print(f"\nTest RMSE: {rmse:.2f}")
    print(f"\nTest MSE: {mse:.2f}")



Test RMSE: 116177.73

Test MSE: 13497267200.00


  return F.mse_loss(input, target, reduction=self.reduction)


In [148]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor

treeRegressor = ExtraTreesRegressor()

treeRegressor.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [149]:
mse = mean_squared_error(y_test, treeRegressor.predict(X_test))
rmse = mse ** 0.5

print(f"\nTest MSE: {mse:.2f}")
print(f"RMSE Tree = {rmse}")


Test MSE: 2.58
RMSE Tree = 1.6052853878580555


In [150]:
GBR = GradientBoostingRegressor()
GBR.fit(X_train, y_train)

mse = mean_squared_error(y_test, GBR.predict(X_test))
rmse = mse ** 0.5

print(f"\nTest MSE: {mse:.2f}")
print(f"RMSE Tree = {rmse}")

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?



Test MSE: 2601847368.08
RMSE Tree = 51008.306853661335
