In [24]:
import numpy as np
from sklearn.linear_model import LinearRegression

import math

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

train_data = pd.read_csv('dataset/price_data_tr.csv')
heldout_data = pd.read_csv('dataset/price_data_val.csv')
test_data = pd.read_csv('dataset/price_data_ts.csv')

In [25]:
# 최신 연도로 바꾸고 기존 yr_bult, yr_renovated 삭제
train_data['yr_max'] = np.maximum(np.array(train_data.yr_built), np.array(train_data.yr_renovated))
train_data = train_data.drop(['yr_built','yr_renovated'], axis=1)

heldout_data['yr_max'] = np.maximum(np.array(heldout_data.yr_built), np.array(heldout_data.yr_renovated))
heldout_data = heldout_data.drop(['yr_built','yr_renovated'], axis=1)

test_data['yr_max'] = np.maximum(np.array(test_data.yr_built), np.array(test_data.yr_renovated))
test_data = test_data.drop(['yr_built','yr_renovated'], axis=1)

In [26]:
# drop zipcode
train_data['zipcode'] = -train_data['zipcode']
heldout_data['zipcode'] = -heldout_data['zipcode']
test_data['zipcode'] = -test_data['zipcode']

In [27]:
e, pi, r2, re = math.e, math.pi, 2 ** 0.5, math.e ** 0.5

# bathrooms
train_data['bathrooms'] = train_data['bathrooms'].apply(lambda x: x**r2)
# sqft_lot
train_data['sqft_lot'] = train_data['sqft_lot'].apply(lambda x: x**(1/pi))
# floors
train_data['floors'] = train_data['floors'].apply(lambda x: x**(1/(pi**pi)))
# waterfront
train_data['waterfront'] = train_data['waterfront'].apply(lambda x: x**(e**e))
# condition
train_data['condition'] = train_data['condition'].apply(lambda x: x**(pi**2))
# grade
train_data['grade'] = train_data['grade'].apply(lambda x: x**(re**re))
# sqft_basement
train_data['sqft_basement'] = train_data['sqft_basement'].apply(lambda x: x**(r2**r2))
# lat
train_data['lat'] = train_data['lat'].apply(lambda x: x**(1/(pi**pi)))
# sqft_lot15
train_data['sqft_lot15'] = train_data['sqft_lot15'].apply(lambda x: x**(1/pi))
# yr_max
train_data['yr_max'] = train_data['yr_max'].apply(lambda x: x**(e**2))


# bathrooms
heldout_data['bathrooms'] = heldout_data['bathrooms'].apply(lambda x: x**r2)
# sqft_lot
heldout_data['sqft_lot'] = heldout_data['sqft_lot'].apply(lambda x: x**(1/pi))
# floors
heldout_data['floors'] = heldout_data['floors'].apply(lambda x: x**(1/(pi**pi)))
# waterfront
heldout_data['waterfront'] = heldout_data['waterfront'].apply(lambda x: x**(e**e))
# condition
heldout_data['condition'] = heldout_data['condition'].apply(lambda x: x**(pi**2))
# grade
heldout_data['grade'] = heldout_data['grade'].apply(lambda x: x**(re**re))
# sqft_basement
heldout_data['sqft_basement'] = heldout_data['sqft_basement'].apply(lambda x: x**(r2**r2))
# lat
heldout_data['lat'] = heldout_data['lat'].apply(lambda x: x**(1/(pi**pi)))
# sqft_lot15
heldout_data['sqft_lot15'] = heldout_data['sqft_lot15'].apply(lambda x: x**(1/pi))
# yr_max
heldout_data['yr_max'] = heldout_data['yr_max'].apply(lambda x: x**(e**2))


# bathrooms
test_data['bathrooms'] = test_data['bathrooms'].apply(lambda x: x**r2)
# sqft_lot
test_data['sqft_lot'] = test_data['sqft_lot'].apply(lambda x: x**(1/pi))
# floors
test_data['floors'] = test_data['floors'].apply(lambda x: x**(1/(pi**pi)))
# waterfront
test_data['waterfront'] = test_data['waterfront'].apply(lambda x: x**(e**e))
# condition
test_data['condition'] = test_data['condition'].apply(lambda x: x**(pi**2))
# grade
test_data['grade'] = test_data['grade'].apply(lambda x: x**(re**re))
# sqft_basement
test_data['sqft_basement'] = test_data['sqft_basement'].apply(lambda x: x**(r2**r2))
# lat
test_data['lat'] = test_data['lat'].apply(lambda x: x**(1/(pi**pi)))
# sqft_lot15
test_data['sqft_lot15'] = test_data['sqft_lot15'].apply(lambda x: x**(1/pi))
# yr_max
test_data['yr_max'] = test_data['yr_max'].apply(lambda x: x**(e**2))

In [28]:
nor_train = train_data.iloc[:, 3:].apply(lambda x: (x-x.mean()) / (x.std()))
nor_heldout = heldout_data.iloc[:, 3:].apply(lambda x: (x-x.mean()) / (x.std()))
nor_test = test_data.iloc[:, 3:].apply(lambda x: (x-x.mean()) / (x.std()))

nor_log_train_price = np.log(train_data.iloc[:, 2:3]).apply(lambda x: (x-x.mean()) / (x.std()))
nor_log_heldout_price = np.log(heldout_data.iloc[:, 2:3]).apply(lambda x: (x-x.mean()) / (x.std()))

In [29]:
train_log_label_mean, train_log_label_std = np.log(train_data.iloc[:, 2]).mean(), np.log(train_data.iloc[:, 2]).std()
print(train_log_label_mean, train_log_label_std)

heldout_log_label_mean, heldout_log_label_std = np.log(heldout_data.iloc[:, 2]).mean(), np.log(heldout_data.iloc[:, 2]).std()
print(heldout_log_label_mean, heldout_log_label_std)

uni_log_label_mean = np.log(pd.concat((train_data.iloc[:, 2], heldout_data.iloc[:, 2]))).mean()
uni_log_label_std = np.log(pd.concat((train_data.iloc[:, 2], heldout_data.iloc[:, 2]))).std()
print(uni_log_label_mean, uni_log_label_std)

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

13.04810655020148 0.5220080959315785
13.048257499579405 0.5355929705203868
13.048144289728448 0.5254218546587482


In [30]:
train_feats = np.array(nor_train.iloc[:, :].values)
train_labels = np.array(nor_log_train_price.values.reshape(-1, 1))

heldout_feats = np.array(nor_heldout.iloc[:, :].values)
heldout_labels = np.array(nor_log_heldout_price.values.reshape(-1, 1))

test_feat = np.array(nor_test.iloc[:, :].values)

In [31]:
train_feats.shape

(12968, 17)

In [32]:
train_labels.shape

(12968, 1)

In [33]:
reg = LinearRegression().fit(train_feats, train_labels)
reg.score(train_feats, train_labels)

0.7513356555263737

In [35]:
predicts = reg.predict(heldout_feats)

train_predicts = np.exp(predicts * train_log_label_std + train_log_label_mean)
held_predicts = np.exp(predicts * heldout_log_label_std + heldout_log_label_mean)
uni_predicts = np.exp(predicts * uni_log_label_std + uni_log_label_mean)
    
rmse_train = rmse(train_predicts, heldout_data.iloc[:, [2]].values)
rmse_held = rmse(held_predicts, heldout_data.iloc[:, [2]].values)
rmse_uni = rmse(uni_predicts, heldout_data.iloc[:, [2]].values)

print(rmse_train)
print(rmse_held)
print(rmse_uni)

196482.16820391844
200494.20169127468
197331.7907591412


In [21]:
final_predict = reg.predict(test_feat)
final_predict = np.exp(final_predict * uni_log_label_std + uni_log_label_mean)
final_predict

array([[429075.33204722],
       [432547.73680895],
       [241741.14347606],
       ...,
       [634670.83133893],
       [495539.89436785],
       [483015.11166743]])

In [22]:
test_data = pd.read_csv('dataset/price_data_ts.csv')
test_data['price'] = final_predict

test_data['id'] = test_data['id'].apply(lambda x : str(x) if len(str(x)) == 10 else '0' + str(x) if len(str(x)) == 9 else '00' + str(x))
test_data['id'] = test_data['id'].astype(str) + test_data['date'].astype(str)
submission = pd.concat([test_data['id'], test_data['price']], axis=1)
submission

Unnamed: 0,id,price
0,700010085020140926T000000,429075.332047
1,403740028020140923T000000,432547.736809
2,142340016020140618T000000,241741.143476
3,776740006020141119T000000,649340.995377
4,946590050020140617T000000,852638.753103
...,...,...
4317,733822037020141006T000000,347053.852664
4318,681910015020140721T000000,516446.500476
4319,082405914020140527T000000,634670.831339
4320,051450009020140513T000000,495539.894368
