In [32]:
import pandas as pd
import data_describe as dd

# A. Read the file and print info

In [33]:
houses = pd.read_csv("./data/houses.csv")

In [34]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

# B. Count Nan values

In [35]:
houses.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [36]:
# C. Correltaion Matrix and Price Correlation

In [38]:
correlation_matrix = houses.corr(numeric_only=True)

In [40]:
dd.correlation_matrix(houses, cluster=True, viz_backend="plotly")

None

<data_describe.core.correlation.CorrelationWidget at 0x7fc8f37414c0>

### `sqft_living` has the highest correlation with price

In [41]:
correlation_matrix.price.nlargest(n=2)

price          1.000000
sqft_living    0.702035
Name: price, dtype: float64

# D. Price dist

In [43]:
houses.price

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21608    360000.0
21609    400000.0
21610    402101.0
21611    400000.0
21612    325000.0
Name: price, Length: 21613, dtype: float64

In [50]:
import plotly.express as px
fig = px.histogram(houses.price, x="price", title="Price Histogram")
fig.show()

In [63]:
fig = px.scatter(houses, x="price", y="sqft_living", trendline="ols", opacity=0.5, trendline_color_override="green")
fig.show()


# E. Date to year and month

In [68]:
houses["year"] =houses.date.str[0:4]
houses["month"] = houses.date.str[4:6]
houses = houses.drop(columns=["date"])

AttributeError: 'DataFrame' object has no attribute 'date'

# F. Test Train split

In [93]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(houses, test_size=0.2)

# G. MinMaxScaler

In [94]:
from sklearn import preprocessing

scalers = {}
def normalize(train, test):
    # dont' scale the id
    for feature in train.columns.drop("id"):
        min_max_scaler = preprocessing.MinMaxScaler()
        train[feature] = min_max_scaler.fit_transform(train[[feature]])
        test[feature] = min_max_scaler.transform(test[[feature]])
        scalers[feature] = min_max_scaler
    return train, test

In [95]:
train, test = normalize(train, test)

In [90]:
train

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month
17364,7853302130,0.045049,0.272727,0.31250,0.133585,0.002350,0.4,0.0,0.0,0.50,...,0.000000,0.930435,0.000000,0.323232,0.618147,0.527409,0.285837,0.004305,0.0,0.454545
13665,3579800180,0.049174,0.363636,0.18750,0.113962,0.005833,0.2,0.0,0.0,0.75,...,0.000000,0.504348,0.000000,0.166667,0.927058,0.230066,0.211840,0.011497,1.0,0.363636
10973,1725059187,0.068197,0.181818,0.21875,0.074717,0.004834,0.0,0.0,0.0,0.50,...,0.000000,0.460870,0.997519,0.161616,0.802200,0.273256,0.266908,0.011148,0.0,0.818182
1196,2112700030,0.036984,0.454545,0.34375,0.094340,0.002108,0.0,0.0,0.0,0.50,...,0.096852,0.782609,0.000000,0.530303,0.604399,0.137043,0.211840,0.003847,0.0,0.909091
9933,4054560140,0.097705,0.272727,0.31250,0.200755,0.020952,0.2,0.0,0.0,0.50,...,0.000000,0.826087,0.000000,0.383838,0.925602,0.401993,0.586990,0.039665,0.0,0.727273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9367,1352300315,0.019010,0.181818,0.12500,0.054340,0.002181,0.4,0.0,0.0,0.50,...,0.000000,0.060870,0.969231,0.272727,0.531619,0.264950,0.137842,0.003985,0.0,0.636364
14694,4324210120,0.027148,0.272727,0.31250,0.104906,0.009202,0.0,0.0,0.0,0.50,...,0.106538,0.817391,0.000000,0.151515,0.426492,0.289037,0.175701,0.009117,0.0,0.363636
2025,2767600400,0.084584,0.272727,0.28125,0.143396,0.001149,0.8,0.0,0.0,0.50,...,0.000000,0.991304,0.000000,0.585859,0.835355,0.115449,0.191189,0.003405,0.0,0.909091
5526,1775800290,0.036590,0.272727,0.21875,0.073208,0.007154,0.0,0.0,0.0,0.50,...,0.000000,0.591304,0.000000,0.358586,0.941129,0.352159,0.158493,0.013956,0.0,0.636364


# H. MLP 2 layers

In [99]:
train.shape

(17290, 22)

In [107]:
col = train.columns.drop(["id", "price"])

In [112]:
train[["price"]]

Unnamed: 0,price
18950,0.025311
850,0.060433
3909,0.045246
328,0.014607
9947,0.053115
...,...
15344,0.015082
20496,0.101258
15954,0.097574
6532,0.055738


In [129]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn

In [130]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [131]:
class HousesDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.x = self.df[df.columns.drop(["id", "price"])].values
        self.y = self.df[["price"]].values
        self.x = torch.tensor(self.x, dtype=torch.float32)
        self.y = torch.tensor(self.y, dtype=torch.float32)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


In [150]:
train_dataset = HousesDataset(train)
test_dataset = HousesDataset(test)

train_loader = DataLoader(train_dataset, batch_size=64,
                          shuffle=True, num_workers=20)
test_loader = DataLoader(test_dataset, batch_size=64,
                         shuffle=True, num_workers=20)



This DataLoader will create 20 worker processes in total. Our suggested max number of worker in current system is 12, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.



In [151]:
for i, (data, labels) in enumerate(train_loader):
  print(data.shape, labels.shape)
  print(data,labels)
  break;

torch.Size([64, 20]) torch.Size([64, 1])
tensor([[0.1212, 0.4062, 0.3318,  ..., 0.0558, 1.0000, 0.0909],
        [0.0909, 0.2812, 0.0866,  ..., 0.0076, 0.0000, 1.0000],
        [0.0606, 0.1250, 0.0266,  ..., 0.0030, 0.0000, 0.7273],
        ...,
        [0.0909, 0.3125, 0.0995,  ..., 0.0011, 1.0000, 0.1818],
        [0.0909, 0.1250, 0.0516,  ..., 0.0100, 0.0000, 0.3636],
        [0.0909, 0.2188, 0.1260,  ..., 0.0121, 0.0000, 1.0000]]) tensor([[0.1174],
        [0.0249],
        [0.0433],
        [0.0390],
        [0.0564],
        [0.1520],
        [0.0281],
        [0.0643],
        [0.0656],
        [0.0751],
        [0.0480],
        [0.0525],
        [0.0320],
        [0.0638],
        [0.0303],
        [0.0392],
        [0.0964],
        [0.0614],
        [0.0354],
        [0.0570],
        [0.0242],
        [0.0472],
        [0.0347],
        [0.0787],
        [0.0662],
        [0.0367],
        [0.0256],
        [0.0544],
        [0.1148],
        [0.0487],
        [0.0898],
   

In [152]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(20, 20)
        self.fc2 = nn.Linear(20, 15)
        self.fc3 = nn.Linear(15, 15)
        self.fc4 = nn.Linear(15, 10)
        self.fc5 = nn.Linear(10, 5)

        self.out = nn.Linear(5, 1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)

        x = self.fc2(x)
        x = F.relu(x)

        x = self.fc3(x)
        x = F.relu(x)

        x = self.fc4(x)
        x = F.relu(x)

        x = self.fc5(x)
        x = F.relu(x)

        x = self.out(x)
        x = F.sigmoid(x)
        return x
    def initialize_weights(self):
        # TODO what to fill
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

In [153]:
model = MLP().to(device)
model.initialize_weights()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [154]:
data_loader = {
    "train": train_loader,
    "val": test_loader,
}

In [155]:
for epoch in range(20):
    for phase in ["train", "val"]:
        if phase == "train":
            model.train(True)  # Set model to training mode
        else:
            model.train(False)  # Set model to evaluate mode
        for i, (data, labels) in enumerate(data_loader[phase], 0):
            data = data.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            if phase == "train":
                loss.backward()
                optimizer.step()


nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.



In [None]:
for i, (data, labels) in enumerate(test_loader):
    data = data.to(device)
    labels = labels.to(device)
    model.eval()
    pred = model(data)
    print("pred", pred)
    print("labels", labels)
    break;


nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.



pred tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]], device='cuda:0', grad_fn=<SigmoidBackward0>)
labels tensor([[0.0393],
        [0.0256],
        [0