In [1]:
import torch
import torch.nn as nn
import numpy as np
import csv
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
data = pd.read_csv('d81a040c-429c-441b-b2bb-ee7f6f07c11e.csv')

In [3]:
data_dropped = data.dropna(axis = 'columns',how='all')    #delete columns with all NaN

In [4]:
data_dropped = data_dropped.dropna(subset=["建物移轉總面積平方公尺"]) #只計算建物(刪除土地)
data_dropped = data_dropped.drop(data_dropped[(data_dropped['總價元']>80000000 )| (data_dropped['總價元']<1000000)].index) 
data_dropped = data_dropped.drop(data_dropped[(data_dropped['單價每平方公尺']>200000 )| (data_dropped['單價每平方公尺']<20000)].index)
data_dropped = data_dropped.drop(data_dropped[(data_dropped['主要建材']=='鋼造' ) ].index)

In [5]:
data_dropped['車位總價元'] = data_dropped['車位總價元'].fillna(0)
data_dropped['車位移轉總面積平方公尺'] = data_dropped['車位移轉總面積平方公尺'].fillna(0)

In [6]:
data_dropped['總價元'] = data_dropped['總價元'] - data_dropped['車位總價元']   #不計算車位價格
data_dropped['建物移轉總面積平方公尺'] = data_dropped['建物移轉總面積平方公尺'] - data_dropped['車位移轉總面積平方公尺']
data_dropped = data_dropped.drop('車位總價元', axis = 1)
data_dropped = data_dropped.drop('車位移轉總面積平方公尺', axis = 1)

In [7]:
data_dropped = data_dropped.drop('有無管理組織', axis=1)     #delete column (不重要(只有一個為有且看不出對價格有影響))
data_dropped = data_dropped.drop('土地區段位置或建物區門牌', axis=1)  #delete column(太難量化, 僅參考鄉政市區)
data_dropped = data_dropped.drop('非都市土地使用分區', axis=1)   #delete column (與非都市使用編地有為相同的參數)
data_dropped = data_dropped.drop('都市土地使用分區', axis=1)   #delete column (與非都市使用編地有為互補的參數)
data_dropped = data_dropped.drop('交易標的', axis=1)  #delete column (=土地,建物,車位)
data_dropped = data_dropped.drop('備註', axis=1)   #delete column(車位以外的備註分析起來過難,且為數不多)
data_dropped = data_dropped.drop('停車位所在樓層', axis=1) #delete column(影響過小且不好賦值)
data_dropped = data_dropped.drop('車位類別', axis=1) #delete column(不計算車位)
data_dropped = data_dropped.drop('特殊交易備註', axis=1) #delete column(難處理)
data_dropped = data_dropped.drop('建物現況格局-隔間', axis=1) #delete column (重複參數)
data_dropped = data_dropped.drop('主要用途', axis=1) #delete column (大部分資料為其他,不知道要怎麼補)
data_dropped = data_dropped.drop('交易年月日', axis=1)
data_dropped = data_dropped.drop('建築完成年月', axis=1)
data_dropped = data_dropped.drop('土地移轉總面積平方公尺', axis=1)

In [8]:
data_dropped['temp1'] = data_dropped['交易筆棟數'].map(lambda x:x.split('地')[1])   #separate column '交易筆棟數' to two columns (內有兩個參數)
data_dropped['土地'] = data_dropped['temp1'].map(lambda x:x.split('建')[0])
data_dropped['temp2'] = data_dropped['temp1'].map(lambda x:x.split('物')[1])
data_dropped['建物'] = data_dropped['temp2'].map(lambda x:x.split('車')[0])

data_dropped = data_dropped.drop('temp1', axis=1)
data_dropped = data_dropped.drop('temp2', axis=1)
data_dropped = data_dropped.drop('交易筆棟數', axis=1)

In [9]:
data_dropped = data_dropped.drop('總價元', axis=1) #delete column(target為單位價格)


In [10]:
#此欄為nan者為低樓層平房或透天厝,故填0   #轉換離散資料
floorabove5 = {'一層':0, '二層':0, '三層':0, '四層':0, '五層':1, '六層':1, '七層':1, '八層':1, '九層':1, '十層':1, '十一層':0, '十二層':0, '十三層':0, '十四層':0, '十五層':0, '十六層':0, '十七層':0, '十八層':0, '十九層':0, '二十層':0, '二十一層':0, '二十二層':0, '二十三層':0, '二十四層':0, '二十五層':0, '二十六層':0, '二十七層':0, '二十八層':0, '二十九層':0 }
data_dropped[ '五層至十層' ] = data_dropped[ '移轉層次' ].map(floorabove5) 
data_dropped['五層至十層'] = data_dropped['五層至十層'].fillna(0)
floorabove10 = {'一層':0, '二層':0, '三層':0, '四層':0, '五層':0, '六層':0, '七層':0, '八層':0, '九層':0, '十層':1, '十一層':1, '十二層':1, '十三層':1, '十四層':1, '十五層':1, '十六層':0, '十七層':0, '十八層':0, '十九層':0, '二十層':0, '二十一層':0, '二十二層':0, '二十三層':0, '二十四層':0, '二十五層':0, '二十六層':0, '二十七層':0, '二十八層':0, '二十九層':0 }
data_dropped[ '十層至十五層' ] = data_dropped[ '移轉層次' ].map(floorabove10)  
data_dropped['十層至十五層'] = data_dropped['十層至十五層'].fillna(0)
floorabove15 = { '一層':0, '二層':0, '三層':0, '四層':0, '五層':0, '六層':0, '七層':0, '八層':0, '九層':0, '十層':0, '十一層':0, '十二層':0, '十三層':0, '十四層':0, '十五層':1, '十六層':1, '十七層':1, '十八層':1, '十九層':1, '二十層':1, '二十一層':0, '二十二層':0, '二十三層':0, '二十四層':0, '二十五層':0, '二十六層':0, '二十七層':0, '二十八層':0, '二十九層':0 }
data_dropped[ '十五層至二十層' ] = data_dropped[ '移轉層次' ].map(floorabove15)  
data_dropped['十五層至二十層'] = data_dropped['十五層至二十層'].fillna(0)
floorabove20 = {'一層':0, '二層':0, '三層':0, '四層':0, '五層':0, '六層':0, '七層':0, '八層':0, '九層':0, '十層':0, '十一層':0, '十二層':0, '十三層':0, '十四層':0, '十五層':0, '十六層':0, '十七層':0, '十八層':0, '十九層':0, '二十層':1, '二十一層':1, '二十二層':1, '二十三層':1, '二十四層':1, '二十五層':1, '二十六層':1, '二十七層':1, '二十八層':1, '二十九層':1 }
data_dropped[ '二十層以上' ] = data_dropped[ '移轉層次' ].map(floorabove20) 
data_dropped['二十層以上'] = data_dropped['二十層以上'].fillna(0)
data_dropped = data_dropped.drop('移轉層次', axis=1)

In [11]:
elevator = { '無':0, '有':1} #轉換離散資料
data_dropped[ '電梯' ] = data_dropped[ '電梯' ].map(elevator)

In [12]:
type0 = { '透天厝':1, '公寓':0, '住宅大樓':0, '華夏':0 }  ##轉換離散資料
data_dropped[ '透天厝' ] = data_dropped[ '建物型態' ].map(type0)  
type1 = { '透天厝':0, '公寓':1, '住宅大樓':0, '華夏':0 }
data_dropped[ '公寓' ] = data_dropped[ '建物型態' ].map(type1) 
type2 = { '透天厝':0, '公寓':0, '住宅大樓':1, '華夏':0 }
data_dropped[ '住宅大樓' ] = data_dropped[ '建物型態' ].map(type2) 
data_dropped = data_dropped.drop('建物型態', axis=1)

In [13]:
materials1 = { '鋼骨鋼筋混凝土造':1, '鋼筋混凝土造':0, '加強磚造':0, '磚造':0, '見其他登記事項':0.25,'木造':0,} ##轉換離散資料
data_dropped[ '鋼骨鋼筋混凝土造' ] = data_dropped[ '主要建材' ].map(materials1)
materials2 = { '鋼骨鋼筋混凝土造':0, '鋼筋混凝土造':1, '加強磚造':0, '磚造':0, '見其他登記事項':0.25,'木造':0,}
data_dropped[ '鋼筋混凝土造' ] = data_dropped[ '主要建材' ].map(materials2)
materials3 = { '鋼骨鋼筋混凝土造':0, '鋼筋混凝土造':0, '加強磚造':1, '磚造':0, '見其他登記事項':0.25,'木造':0,}
data_dropped[ '加強磚造' ] = data_dropped[ '主要建材' ].map(materials3)
materials4 = { '鋼骨鋼筋混凝土造':0, '鋼筋混凝土造':0, '加強磚造':0, '磚造':1, '見其他登記事項':0.25,'木造':0,}
data_dropped[ '磚造' ] = data_dropped[ '主要建材' ].map(materials4)
data_dropped = data_dropped.drop('主要建材', axis=1)

In [14]:
location1 = { '安平區':1, '北區':1, '中西區':1, '東區':1, '永康區':0, '仁德區':0,'南區':1,'新市區':0,'善化區':0,'歸仁區':0,'新化區':0,'安定區':0,'安南區':1, '新營區':0, '鹽水區':0, '白河區':0, '麻豆區':0, '佳里區':0, '學甲區':0, '後壁區':0, '東山區':0, '下營區':0, '六甲區':0, '官田區':0, '大內區':0, '西港區':0, '七股區':0, '將軍區':0, '北門區':0, '安定區':0, '山上區':0, '玉井區':0, '楠西區':0, '南化區':0, '左鎮區':0, '關廟區':0, '龍崎區':0,}
data_dropped[ '原台南市' ] = data_dropped[ '鄉鎮市區' ].map(location1)   #轉換離散資料
data_dropped = data_dropped.drop('鄉鎮市區', axis=1)

In [15]:
use = { '農牧用地':1, '乙種建築用地':1, '甲種建築用地':1, '交通用地':1, '特定目的事業用地':1,'林業用地':1 }   #無特殊標記與其他
data_dropped[ '非都市土地使用編定' ] = data_dropped[ '非都市土地使用編定' ].map(use)
data_dropped['非都市土地使用編定'] = data_dropped['非都市土地使用編定'].fillna(0)

In [16]:
data_dropped['土地'] = data_dropped['土地'].astype(str).astype(int)
data_dropped['建物'] = data_dropped['建物'].astype(str).astype(int)

In [17]:
data_dropped = data_dropped.dropna()

In [18]:
data_dropped

Unnamed: 0,非都市土地使用編定,總樓層數,建物移轉總面積平方公尺,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價每平方公尺,電梯,土地,建物,...,十五層至二十層,二十層以上,透天厝,公寓,住宅大樓,鋼骨鋼筋混凝土造,鋼筋混凝土造,加強磚造,磚造,原台南市
0,0.0,4.0,273.59,5.0,2.0,5.0,86554,0,2,1,...,0.0,0.0,1.0,0.0,0.0,0.00,1.00,0.00,0.00,1.0
1,0.0,4.0,234.84,5.0,2.0,5.0,77500,0,2,1,...,0.0,0.0,1.0,0.0,0.0,0.00,1.00,0.00,0.00,1.0
2,0.0,4.0,194.10,5.0,2.0,4.0,82330,0,2,1,...,0.0,0.0,1.0,0.0,0.0,0.00,1.00,0.00,0.00,1.0
3,0.0,4.0,259.43,5.0,2.0,5.0,77093,0,2,1,...,0.0,0.0,1.0,0.0,0.0,0.00,1.00,0.00,0.00,1.0
4,0.0,4.0,234.84,5.0,2.0,5.0,76223,0,2,1,...,0.0,0.0,1.0,0.0,0.0,0.00,1.00,0.00,0.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10415,0.0,2.0,78.68,3.0,2.0,1.0,50839,0,1,1,...,0.0,0.0,1.0,0.0,0.0,0.00,0.00,1.00,0.00,0.0
10416,0.0,6.0,119.26,3.0,1.0,2.0,53665,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.00,1.00,0.00,0.00,0.0
10419,0.0,5.0,82.28,2.0,2.0,1.0,38283,1,2,1,...,0.0,0.0,0.0,0.0,0.0,0.00,1.00,0.00,0.00,0.0
10421,0.0,5.0,252.14,5.0,2.0,5.0,108987,0,1,1,...,0.0,0.0,1.0,0.0,0.0,0.25,0.25,0.25,0.25,0.0


In [19]:
data_dropped.dtypes  

非都市土地使用編定      float64
總樓層數           float64
建物移轉總面積平方公尺    float64
建物現況格局-房       float64
建物現況格局-廳       float64
建物現況格局-衛       float64
單價每平方公尺          int64
電梯               int64
土地               int32
建物               int32
五層至十層          float64
十層至十五層         float64
十五層至二十層        float64
二十層以上          float64
透天厝            float64
公寓             float64
住宅大樓           float64
鋼骨鋼筋混凝土造       float64
鋼筋混凝土造         float64
加強磚造           float64
磚造             float64
原台南市           float64
dtype: object

In [20]:
X = data_dropped.drop('單價每平方公尺', axis=1)   #將目標切出來
y = data_dropped['單價每平方公尺']

In [21]:
X_train, X_VT, y_train, y_VT = train_test_split(X, y, test_size=0.3, random_state=42)  #切分資料 
X_valid, X_test, y_valid, y_test = train_test_split(X_VT, y_VT, test_size=0.67, random_state=42)

In [22]:
X_test.iloc[1].values
X_test.iloc[1]


非都市土地使用編定        0.00
總樓層數            22.00
建物移轉總面積平方公尺    135.39
建物現況格局-房         0.00
建物現況格局-廳         2.00
建物現況格局-衛         2.00
電梯               1.00
土地               1.00
建物               1.00
五層至十層            0.00
十層至十五層           0.00
十五層至二十層          1.00
二十層以上            1.00
透天厝              0.00
公寓               0.00
住宅大樓             1.00
鋼骨鋼筋混凝土造         0.00
鋼筋混凝土造           1.00
加強磚造             0.00
磚造               0.00
原台南市             1.00
Name: 289, dtype: float64

In [23]:
scaler = StandardScaler()   
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [24]:
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)   
X_valid_tensor = torch.tensor(X_valid_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32)

batch_size = 64
train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [25]:
class Tainanhouse(nn.Module):  #三層全連接層
    def __init__(self, input_size):
        super(Tainanhouse, self).__init__()
        self.fc1 = nn.Linear(input_size,64)  
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64,32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32,1)

    def forward(self, x):
        
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = x.squeeze(-1)
        return x

In [26]:
model = Tainanhouse(input_size = X_train.shape[1])
criterion = nn.MSELoss()   #均方誤差
optimizer1 = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer2 = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.99), eps=1e-08, weight_decay=0, amsgrad=False)
optimizer3 = torch.optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0)
optimizer4 = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.9, eps=1e-08, weight_decay=0, momentum=0, centered=False)
optimizer5 = torch.optim.RMSprop(model.parameters(), lr=0.1, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

In [27]:
num_epochs = 1000
        


for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer4.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer4.step()
        
        running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss/len(train_loader)}')

Epoch [10/1000], Loss: 450553844.26666665
Epoch [20/1000], Loss: 424817722.4533333
Epoch [30/1000], Loss: 417814127.5733333
Epoch [40/1000], Loss: 413735263.04
Epoch [50/1000], Loss: 416191301.76
Epoch [60/1000], Loss: 414648217.17333335
Epoch [70/1000], Loss: 411174266.6666667
Epoch [80/1000], Loss: 411001886.29333335
Epoch [90/1000], Loss: 409329370.4533333
Epoch [100/1000], Loss: 410535723.94666666
Epoch [110/1000], Loss: 410860185.81333333
Epoch [120/1000], Loss: 409896403.84
Epoch [130/1000], Loss: 407657732.26666665
Epoch [140/1000], Loss: 407016040.74666667
Epoch [150/1000], Loss: 405148789.5466667
Epoch [160/1000], Loss: 405994167.25333333
Epoch [170/1000], Loss: 402916601.6
Epoch [180/1000], Loss: 403848412.58666664
Epoch [190/1000], Loss: 403204827.3066667
Epoch [200/1000], Loss: 403928488.32
Epoch [210/1000], Loss: 401245976.1066667
Epoch [220/1000], Loss: 410854776.74666667
Epoch [230/1000], Loss: 404193685.5466667
Epoch [240/1000], Loss: 402313753.6
Epoch [250/1000], Loss:

In [28]:
model.eval()
with torch.no_grad():
    y_pred = model(X_valid_tensor)
    rmse = torch.sqrt(criterion(y_pred, y_valid_tensor))
    print(f'Root Mean Squared Error:{rmse.item()}')

Root Mean Squared Error:23218.373046875


In [29]:
sample_input = np.array([0,0,50,10,0,1,1,1,
        1,1,1,   1 ,   1  ,   0  ,    0  ,
          0,   0 ,       0  ,   0  ,   0  ,   0 ])
sample_input_scaled = scaler.transform(sample_input.reshape(1, -1))
sample_input_tensor = torch.tensor(sample_input_scaled, dtype=torch.float32)

model.eval()
with torch.no_grad():
    predicted_price = model(sample_input_tensor)
    print(predicted_price)
    print(f'Predicted Price: ${predicted_price.item() :.2f}')
                        

tensor([181354.0625])
Predicted Price: $181354.06




In [56]:
#num = data_dropped.shape[0]   #切分為訓練,驗證及測試三份
#indexs = np.random.permutation(num)
#train_indexs = indexs[:int(num*0.6)]
#val_indexs = indexs[int(num*0.6):int(num*0.8)]
#test_indexs = indexs[int(num*0.8)]
#train_data = data_dropped.iloc[train_indexs]
#val_data = data_dropped.iloc[val_indexs]
#test_data = data_dropped.iloc[test_indexs]

In [57]:
#TV_data = pd.concat([train_data, val_data])
#mu = TV_data.mean()
#std = TV_data.std()
#train_data = (train_data-mu)/std
#val_data = (val_data-mu)/std

In [29]:
#x_train = np.array(train_data.drop('總價元', axis=1))
#y_train = np.array(train_data['總價元'])
#x_val = np.array(val_data.drop('總價元', axis=1))
#y_val = np.array(val_data['總價元'])

In [30]:
#train_data

In [None]:
#model = tf.keras.Sequential()
#model.add(keras.layers.Dense(512, activation = 'relu', input_shape=(23,)))
#model.add(keras.layers.Dense(64, activation = 'relu'))
#model.add(keras.layers.Dense(1))

In [24]:
#model.compile(keras.optimizers.Adam(0.001),
#loss = keras.losses.MeanSquaredError(),
#metrics = [keras.metrics.MeanAbsoluteError()])

In [28]:
#model_dir = 'models/m1/'

In [29]:
#log_dir = os.path.join('models', 'm1')
#model_cbk=keras.callbacks.TensorBoard(log_dir=log_dir)
#model_mckp=keras.callbacks.ModelCheckpoint(model_dir+'/Best-model-1.keras',
#monitor='val_mean_absolute_error',
#save_best_only=True,
#mode='min')

In [None]:
#history = model.fit(x_train, y_train, 
#batch_size=64,
#epochs=1000,
#validation_data=(x_val, y_val),
#callbacks=[model_cbk, model_mckp])