In [73]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import cv2
import os

In [74]:
def data_preparation(train_data_path: str, test_data_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the train/test CSV files and turn them into aligned numeric frames.

    Non-numeric columns are one-hot encoded as float32, missing values are
    replaced with 0.0, and the columns that were numeric in the raw training
    file are standardised with a scaler fitted on the training rows only.
    The training target ``SalePrice`` is stored as its natural logarithm.

    Args:
        train_data_path: CSV containing a ``SalePrice`` column; the first
            column is used as the index.
        test_data_path: CSV with the raw features; the first column is used
            as the index.

    Returns:
        ``(train_df, test_df)``. The test frame is reindexed to the training
        frame's columns (including ``SalePrice``); columns it lacks are
        added and filled with 0, columns unknown to training are dropped.
    """
    # --- training frame ---------------------------------------------------
    train_features = pd.read_csv(train_data_path, index_col=0)
    sale_price = train_features.pop("SalePrice")
    # Captured before one-hot encoding so the dummy columns are not scaled.
    numeric_features = list(train_features.select_dtypes(include="number").columns)
    train_data_df = pd.get_dummies(train_features, dtype="float32").fillna(value=0.0)
    train_data_df["SalePrice"] = sale_price.apply(np.log)

    # --- test frame, forced into the training column layout ----------------
    test_data_df = (
        pd.get_dummies(pd.read_csv(test_data_path, index_col=0), dtype="float32")
        .reindex(columns=train_data_df.columns, fill_value=0)
        .fillna(value=0.0)
    )

    # --- standardisation: statistics come from the training rows only ------
    scaler = preprocessing.StandardScaler().fit(train_data_df[numeric_features])
    train_data_df[numeric_features] = scaler.transform(train_data_df[numeric_features])
    test_data_df[numeric_features] = scaler.transform(test_data_df[numeric_features])

    return train_data_df, test_data_df

In [75]:
class HouseDataset(torch.utils.data.Dataset):
    """In-memory dataset yielding ``(features, price)`` tensor pairs.

    The ``SalePrice`` column of the input frame is the target; every other
    column is a feature. Both are stored as float32 tensors.
    """

    def __init__(self, df: pd.DataFrame):
        """Build the feature and target tensors from ``df``.

        The frame is only read, never modified, so the same frame can be
        wrapped more than once.

        Args:
            df: Frame with a ``SalePrice`` column and numeric feature columns.

        Raises:
            KeyError: If ``df`` has no ``SalePrice`` column.
        """
        # Read the target instead of pop()-ing it: pop() would strip the
        # column from the caller's frame as a side effect.
        prices = df["SalePrice"].to_numpy()
        features = df.drop(columns="SalePrice").to_numpy()

        self.features = torch.tensor(features, dtype=torch.float32)
        self.prices = torch.tensor(prices, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, price)`` pair stored at ``idx``."""
        return self.features[idx], self.prices[idx]

In [76]:
# Relative locations of the train / test CSV files.
train_path, test_path = "./data/train.csv", "./data/test.csv"

# Preprocess both files, then wrap the training frame as a torch dataset.
full_train_dataset_df, test_dataset_df = data_preparation(train_path, test_path)
full_train_dataset = HouseDataset(full_train_dataset_df)

# Sanity check: container types first, then the first sample's target,
# the two shapes, and finally the full feature vector.
first_sample = full_train_dataset[0]
first_features, first_price = first_sample
for inspected in (
    type(full_train_dataset),
    type(first_sample),
    type(first_features),
    type(first_price),
    first_price,
    first_features.shape,
    first_price.shape,
    first_features,
):
    print(inspected)

<class '__main__.HouseDataset'>
<class 'tuple'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
tensor(12.2477)
torch.Size([287])
torch.Size([])
tensor([ 0.0734,  0.2129, -0.2071,  0.6515, -0.5172,  1.0510,  0.8787,  0.5141,
         0.5754, -0.2887, -0.9446, -0.4593, -0.7934,  1.1619, -0.1202,  0.3703,
         1.1078, -0.2411,  0.7897,  1.2276,  0.1638, -0.2115,  0.9122, -0.9512,
         0.2960,  0.3117,  0.3510, -0.7522,  0.2165, -0.3593, -0.1163, -0.2702,
        -0.0687, -0.0877, -1.5991,  0.1388,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0

In [77]:
train_path = "./data/train.csv"
test_path = "./data/test.csv"

# Fixed seed so the train/validation membership is the same on every run.
SPLIT_SEED = 42

# Rebuild the frames from disk so this cell does not rely on whatever state
# earlier cells left in the variables.
full_train_dataset_df, test_dataset_df = data_preparation(train_path, test_path)

full_train_dataset = HouseDataset(full_train_dataset_df)

# 95 % / 5 % split. Without an explicit generator the split would be drawn
# from the global RNG and change on every kernel restart, which makes
# validation numbers impossible to compare between runs.
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset,
    [0.95, 0.05],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

# The test frame was reindexed to the training columns in data_preparation,
# so it carries a SalePrice column that HouseDataset can use as the target slot.
test_dataset = HouseDataset(test_dataset_df)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [78]:
# Sample counts: full dataset, then the train and validation subsets.
for split in (full_train_dataset, train_dataset, val_dataset):
    print(len(split))

# First sample of the full dataset: feature vector on top, target below.
a, b = full_train_dataset[0]
print(a, b, sep="\n")


1460
1387
73
tensor([ 0.0734,  0.2129, -0.2071,  0.6515, -0.5172,  1.0510,  0.8787,  0.5141,
         0.5754, -0.2887, -0.9446, -0.4593, -0.7934,  1.1619, -0.1202,  0.3703,
         1.1078, -0.2411,  0.7897,  1.2276,  0.1638, -0.2115,  0.9122, -0.9512,
         0.2960,  0.3117,  0.3510, -0.7522,  0.2165, -0.3593, -0.1163, -0.2702,
        -0.0687, -0.0877, -1.5991,  0.1388,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  

In [79]:
class HouseNetwork(nn.Module):
    """Fully connected regressor: two hidden ReLU layers with dropout, one output.

    Layout: ``in_features -> hidden_features -> hidden_features -> 1``.
    """

    def __init__(self, in_features: int = 287, hidden_features: int = 200, dropout: float = 0.2):
        """Create the layers.

        Args:
            in_features: Length of one input feature vector. The default of
                287 is the width this notebook's preprocessing produced;
                pass ``dataset.features.shape[1]`` if the column count differs.
            hidden_features: Width of both hidden layers.
            dropout: Drop probability applied after each hidden activation.
        """
        super().__init__()
        # Attribute names and creation order are kept stable so existing
        # state dicts and seeded initialisations stay compatible.
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(hidden_features, 1)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Map features of shape ``(..., in_features)`` to outputs of shape ``(..., 1)``."""
        x = self.dropout1(self.activation(self.fc1(x)))
        x = self.dropout2(self.activation(self.fc2(x)))
        x = self.fc3(x)
        return x
    

In [80]:
# Network, loss and optimiser shared by the training and evaluation cells.
model = HouseNetwork()
loss_fun = nn.MSELoss()
# Alternative kept for reference:
#   torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
epochs = 20000

# Training mode keeps the dropout layers active.
model.train()
# Training loop
for epoch in range(epochs):
    # Sum of the per-batch mean losses over this epoch (not a per-sample average).
    epoch_loss = 0.0
    for house, price in train_dataloader:
        optimizer.zero_grad()
        # Drop only the trailing size-1 output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse the batch dimension when a batch
        # holds a single sample, leaving a 0-d prediction against a 1-element
        # target and triggering a shape-mismatch warning in the loss.
        prediction = model(house).squeeze(-1)
        loss = loss_fun(prediction, price)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}')

Epoch 1/20000, Loss: 10590.7737
Epoch 2/20000, Loss: 2130.1077
Epoch 3/20000, Loss: 91.6277
Epoch 4/20000, Loss: 81.1357
Epoch 5/20000, Loss: 77.4949
Epoch 6/20000, Loss: 68.9460
Epoch 7/20000, Loss: 66.1729
Epoch 8/20000, Loss: 69.2328
Epoch 9/20000, Loss: 62.2966
Epoch 10/20000, Loss: 62.9174
Epoch 11/20000, Loss: 59.0725
Epoch 12/20000, Loss: 64.3834
Epoch 13/20000, Loss: 61.8184
Epoch 14/20000, Loss: 60.0395
Epoch 15/20000, Loss: 61.7244
Epoch 16/20000, Loss: 57.8152
Epoch 17/20000, Loss: 59.4751
Epoch 18/20000, Loss: 59.1442
Epoch 19/20000, Loss: 58.5111
Epoch 20/20000, Loss: 59.4145
Epoch 21/20000, Loss: 60.1558
Epoch 22/20000, Loss: 57.0298
Epoch 23/20000, Loss: 52.3923
Epoch 24/20000, Loss: 53.9708
Epoch 25/20000, Loss: 57.6833
Epoch 26/20000, Loss: 55.6118
Epoch 27/20000, Loss: 53.7456
Epoch 28/20000, Loss: 56.7840
Epoch 29/20000, Loss: 56.4510
Epoch 30/20000, Loss: 49.7147
Epoch 31/20000, Loss: 56.0771
Epoch 32/20000, Loss: 52.0264
Epoch 33/20000, Loss: 51.9668
Epoch 34/20000

In [None]:
# Print the model's repr to inspect its layers.
print(model)

In [None]:
# Print the feature tensor (element 0 of the pair) of the first training sample.
print(train_dataset[0][0])

In [None]:
# Evaluation mode turns the dropout layers off so the prediction is deterministic.
model.eval()

# One validation sample: its features and its target (stored as a logarithm).
val_features, val_log_price = val_dataset[4]

# Inference needs no gradients; no_grad() avoids building an autograd graph.
with torch.no_grad():
    predicted_log_price = model(val_features).item()

# Undo the log transform to compare actual and predicted prices on the original scale.
print(np.exp(val_log_price).item())
print(np.exp(predicted_log_price))