In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
# Load the housing dataset from the working directory into a DataFrame.
df = pd.read_csv("Housing.csv")
# Print the column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [4]:
# Preview the first rows of the raw data, including the yes/no and text columns.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
# Integer-encode every object (string) column in place.
#
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`.
# Refitting a single shared encoder per column would keep only the last column's
# classes_, so the other encodings could never be inverted or re-applied to new data.
#
# Note: after the first run `df` has no object columns left, so re-running this
# cell leaves `df` unchanged and resets `label_encoders` to an empty dict.
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {}
encoder = LabelEncoder()  # keeps `encoder` bound even when there is nothing to encode
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder  # e.g. label_encoders[col].classes_ gives the mapping

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [6]:
# Separate the regression target (price) from the predictor columns.
y = df["price"]
X = df.drop(columns="price")

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity-check the sizes of the four splits.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((436, 12), (109, 12), (436,), (109,))

In [9]:
class LinearRegressionModel(nn.Module):
    """Linear regressor made of a single ``nn.Linear`` layer with one output."""

    def __init__(self, n_features):
        """Build the model.

        Args:
            n_features: Number of input features per sample.
        """
        super().__init__()
        # The attribute name determines the state_dict keys
        # ("linear_layer.weight" / "linear_layer.bias").
        self.linear_layer = nn.Linear(n_features, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear_layer(x)
        return prediction

In [10]:
# Seed the global RNG *before* building the model: nn.Linear draws its initial
# weights and bias when it is constructed, so seeding only later (in the training
# cell) leaves the initialisation, and therefore every reported loss and R²,
# different on each fresh kernel run.
torch.manual_seed(42)

# One input per feature column, one output (the predicted price).
model_0 = LinearRegressionModel(n_features=X_train.shape[1])

In [11]:
# Inspect the initial (untrained) weight vector and bias of the linear layer.
model_0.state_dict()

OrderedDict([('linear_layer.weight',
              tensor([[-0.1687, -0.2541,  0.1859,  0.0625,  0.0861,  0.1911,  0.1920,  0.2066,
                       -0.0528, -0.2881, -0.1128,  0.1192]])),
             ('linear_layer.bias', tensor([-0.0672]))])

In [12]:
# Mean-squared-error objective for the regression task.
loss_fn = nn.MSELoss()

# Adam over all model parameters with a small step size and light weight decay.
optimizer = torch.optim.Adam(
    params=model_0.parameters(),
    weight_decay=1e-5,
    lr=1e-4,
)

In [13]:
# Check which container types the splits currently are before scaling/conversion.
tuple(type(part) for part in (X_train, X_test, y_train, y_test))


(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [14]:
# Features and target each get their own scaler. Sharing one StandardScaler for
# both refits it on the target and throws away the feature statistics, so new
# feature rows could no longer be scaled the way the training data was.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Fit on the training split only; the test split reuses the training statistics
# so no information from the test set leaks into preprocessing.
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# StandardScaler expects 2-D input, so the 1-D target is reshaped into a column.
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Backward-compatible alias: this cell used to leave `scaler` fitted on the
# target, so any later use of `scaler` sees the same fitted state as before.
scaler = target_scaler

In [15]:
# Check the container type of the scaled feature arrays.
tuple(type(arr) for arr in (X_train_scaled, X_test_scaled))

(numpy.ndarray, numpy.ndarray)

In [16]:
# Convert the scaled NumPy arrays to float32 tensors for PyTorch.
# Note: this rebinds X_train / X_test / y_train / y_test from pandas objects to
# tensors, so the scaling cell above cannot be re-run after this one without
# re-running the train/test split first.
X_train, X_test, y_train, y_test = (
    torch.tensor(scaled, dtype=torch.float32)
    for scaled in (X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled)
)

In [17]:
# NOTE: seeding here does not influence the results. model_0 was already
# constructed (and its weights drawn) in an earlier cell, and the full-batch
# loop below consumes no random numbers. The call is kept so the cell's effect
# on the global RNG state is unchanged.
torch.manual_seed(42)

epochs = 10000
log_every = 1000  # record and print the losses once every `log_every` epochs

# Loss history, sampled on logging epochs only (plain Python floats).
train_loss_values = []
test_loss_values = []
epoch_count = []

for epoch in range(epochs):

    ### Training: one full-batch gradient step

    model_0.train()

    y_pred = model_0(X_train)
    loss = loss_fn(y_pred, y_train)  # MSE in standardised target units

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    ### Testing

    # The test forward pass is only needed when we log. It is also run on the
    # last epoch so `test_pred` / `test_loss` still reflect the final model
    # after the loop, exactly as when it was evaluated on every epoch.
    is_log_epoch = epoch % log_every == 0
    if is_log_epoch or epoch == epochs - 1:
        model_0.eval()

        with torch.inference_mode():
            test_pred = model_0(X_test)
            # y_test is already float32, so no extra cast is required.
            test_loss = loss_fn(test_pred, y_test)

        if is_log_epoch:
            epoch_count.append(epoch)
            # `loss` was computed before this epoch's optimizer step, `test_loss` after it.
            train_loss_values.append(loss.item())
            test_loss_values.append(test_loss.item())
            print(f"Epoch: {epoch} | MSE Train Loss: {loss} | MSE Test Loss: {test_loss} ")
# Final evaluation of the trained model on both splits, without gradient tracking.
model_0.eval()
with torch.inference_mode():
    train_preds, test_preds = model_0(X_train), model_0(X_test)

# R² is computed on the standardised targets against the model's predictions.
r2_train, r2_test = (
    r2_score(actual.numpy(), predicted.numpy())
    for actual, predicted in ((y_train, train_preds), (y_test, test_preds))
)

print(f"\nR² Train Score: {r2_train:.4f}")
print(f"R² Test Score: {r2_test:.4f}")

Epoch: 0 | MSE Train Loss: 1.615675926208496 | MSE Test Loss: 2.4469330310821533 
Epoch: 1000 | MSE Train Loss: 0.8945664167404175 | MSE Test Loss: 1.5103590488433838 
Epoch: 2000 | MSE Train Loss: 0.5616042017936707 | MSE Test Loss: 1.0316065549850464 
Epoch: 3000 | MSE Train Loss: 0.4212474226951599 | MSE Test Loss: 0.799315333366394 
Epoch: 4000 | MSE Train Loss: 0.3588360548019409 | MSE Test Loss: 0.6858437657356262 
Epoch: 5000 | MSE Train Loss: 0.32982659339904785 | MSE Test Loss: 0.62554931640625 
Epoch: 6000 | MSE Train Loss: 0.31849855184555054 | MSE Test Loss: 0.5934544801712036 
Epoch: 7000 | MSE Train Loss: 0.3152198791503906 | MSE Test Loss: 0.5792757272720337 
Epoch: 8000 | MSE Train Loss: 0.31461483240127563 | MSE Test Loss: 0.5751858353614807 
Epoch: 9000 | MSE Train Loss: 0.31455859541893005 | MSE Test Loss: 0.5746246576309204 

R² Train Score: 0.6854
R² Test Score: 0.6495


In [18]:
# Confirm the tensor shapes: targets are column vectors matching the model's (N, 1) output.
tuple(t.shape for t in (X_train, X_test, y_train, y_test))

(torch.Size([436, 12]),
 torch.Size([109, 12]),
 torch.Size([436, 1]),
 torch.Size([109, 1]))