## Crawl the data

In [2]:
import requests
import os
import zipfile
import time

def real_estate_crawler(year, season, delay=10):
    """Download and unpack one season of real-estate transaction CSV files.

    The season archive is requested from plvr.land.moi.gov.tw, saved as
    ``<year><season>.zip`` in the current working directory and extracted
    into the folder ``real_estate<year><season>`` (created if missing).

    Args:
        year: Year of the data. Values above 1000 are treated as Gregorian
            years and reduced by 1911 before being sent (presumably the
            ROC-calendar year the endpoint expects); smaller values are
            used as given.
        season: Season number placed after the ``S`` in the request.
        delay: Seconds to sleep after extraction so that consecutive calls
            do not hit the server back to back. Defaults to 10.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not respond within 60 seconds.
        zipfile.BadZipFile: The downloaded content is not a zip archive.
    """
    if year > 1000:
        year -= 1911

    # Download the zip archive for the requested season.
    url = ("https://plvr.land.moi.gov.tw//DownloadSeason?season="
           + str(year) + "S" + str(season)
           + "&type=zip&fileName=lvr_landcsv.zip")
    res = requests.get(url, timeout=60)
    # Fail here instead of saving an HTML error page under a .zip name.
    res.raise_for_status()

    # Save the archive; the context manager guarantees the file is flushed
    # and closed before zipfile opens it again below.
    fname = str(year) + str(season) + '.zip'
    with open(fname, 'wb') as zip_file:
        zip_file.write(res.content)

    # Separate folder per season for the extracted files.
    folder = 'real_estate' + str(year) + str(season)
    os.makedirs(folder, exist_ok=True)

    with zipfile.ZipFile(fname, 'r') as zip_ref:
        zip_ref.extractall(folder)

    time.sleep(delay)
  
# Year range to download: range(112, 113) covers year 112 only; widen the
# range to fetch more years. Seasons 1-4 are fetched for every year.
for year in range(112, 113):
    for season in range(1, 5):
        print(year, season)
        real_estate_crawler(year, season)

## Get data



In [None]:
import os
import pandas as pd

# Folders holding the extracted data of each season (names start with 'real').
# Sorted so the row order of the combined frame does not depend on the
# arbitrary order os.listdir() returns; plain files with a matching name
# are skipped because they cannot contain the CSV.
dirs = sorted(d for d in os.listdir() if d[:4] == 'real' and os.path.isdir(d))

dfs = []

for d in dirs:
    print(d)
    df = pd.read_csv(os.path.join(d, 'b_lvr_land_a.csv'), index_col=False)
    # Tag every row with the last character of the folder name.
    df['Q'] = d[-1]
    # Skip the first data row of each file.
    # NOTE(review): presumably a second header line inside the CSV - confirm.
    dfs.append(df.iloc[1:])

# sort=True orders the union of columns alphabetically.
df = pd.concat(dfs, sort=True)

In [15]:
def drop_nan_columns(df, columns):
    """Return ``df`` without the rows that have NaN in any of ``columns``.

    Despite the name, no column is removed: only rows are filtered, and the
    input frame itself is left unmodified.

    Args:
        df: DataFrame to filter.
        columns: Iterable of column labels that must be non-null in a row
            for that row to be kept.

    Returns:
        The filtered DataFrame; ``df`` itself when ``columns`` is empty.
    """
    required = list(columns)
    if not required:
        return df
    return df.dropna(subset=required)

In [None]:
# Add the transaction year: everything before the trailing four characters
# (MMDD) of the date string is the year, to which 1911 is added.
df['year'] = df['交易年月日'].str[:-4].astype(int) + 1911

# Merge the same field stored under two different column names, then remove
# the redundant column. Assigning the results back (instead of inplace=True
# on a column selection, or calling drop() without keeping its return value)
# is what actually makes both steps take effect.
if '單價元/平方公尺' in df.columns:
    if '單價元平方公尺' in df.columns:
        df['單價元平方公尺'] = df['單價元平方公尺'].fillna(df['單價元/平方公尺'])
        df = df.drop(columns='單價元/平方公尺')
    else:
        # Only the alternative name is present: just rename it.
        df = df.rename(columns={'單價元/平方公尺': '單價元平方公尺'})

# Convert the unit price from per square metre to per ping
# (the factor 3.30579 is the number of square metres in one ping).
df = drop_nan_columns(df, ['單價元平方公尺'])
df['單價元平方公尺'] = df['單價元平方公尺'].astype(float)
df['單價元坪'] = df['單價元平方公尺'] * 3.30579

# Building type without the parenthesised detail (text before the first '(').
df['建物型態2'] = df['建物型態'].str.split('(').str[0]

# Remove transactions that carry a remark (mostly deals between relatives or
# friends, or deals with abnormal prices).
df = df[df['備註'].isnull()]

# Drop a fixed list of columns.
df = df.drop(['備註','移轉編號','編號','非都市土地使用分區','非都市土地使用編定'], axis=1)

In [None]:
# 將index改成年月日
# df = drop_nan_columns(df, ['交易年月日'])
# df.index = pd.to_datetime((df['交易年月日'].str[:-4].astype(int) + 1911).astype(str) + df['交易年月日'].str[-4:] ,errors='coerce')

In [57]:
# Show the column labels of the current `df`.
df.columns

Index(['Q', '主建物面積', '主要建材', '主要用途', '交易年月日', '交易標的', '交易筆棟數', '單價元平方公尺',
       '土地位置建物門牌', '土地移轉總面積平方公尺', '建物型態', '建物現況格局-廳', '建物現況格局-房', '建物現況格局-衛',
       '建物現況格局-隔間', '建物移轉總面積平方公尺', '建築完成年月', '有無管理組織', '移轉層次', '總價元', '總樓層數',
       '車位移轉總面積(平方公尺)', '車位總價元', '車位類別', '都市土地使用分區', '鄉鎮市區', '附屬建物面積', '陽台面積',
       '電梯', '單價元坪', '建物型態2'],
      dtype='object')

In [42]:
# Exclude unwanted data: keep only rows whose building type is one of the
# three values listed below.
df = df.loc[
    df['建物型態'].isin([
        '住宅大樓(11層含以上有電梯)',
        '華廈(10層含以下有電梯)',
        '公寓(5樓含以下無電梯)',
    ])
]

In [62]:
# Keep only the rows of the two listed districts.
df = df.loc[
    df['鄉鎮市區'].isin(['后里區', '豐原區'])
]

In [64]:
# Renumber the rows 0..n-1 and keep the result; calling reset_index()
# without assigning it leaves `df` unchanged. drop=True discards the old
# labels instead of inserting them as an extra 'index' column.
df = df.reset_index(drop=True)

In [76]:
## The DataFrame has a column '交易年月日' whose values look like 1051004: the leading digits (105) are the ROC year, to which 1911 must be added; the next two digits (10) are the month and the last two (04) are the day.
import pandas as pd
def add_date(df):
    """Add a ``date`` column parsed from the ``交易年月日`` column.

    Values in ``交易年月日`` are read as numbers of the form Y..YMMDD, where
    the year part is offset by 1911 from the Gregorian year (for example
    1051004 becomes 2016-10-04). Integers, numeric strings and floats are
    all accepted.

    Args:
        df: DataFrame containing a ``交易年月日`` column. It is modified in
            place.

    Returns:
        The same DataFrame object, now with a datetime64 ``date`` column.
        Entries that are missing, non-numeric or not a valid calendar date
        become ``NaT``.
    """
    roc = pd.to_numeric(df['交易年月日'], errors='coerce')
    # Adding 1911 to the year part of Y..YMMDD equals adding 1911 * 10000.
    gregorian = roc + 1911 * 10000
    # Unparseable entries are replaced by 0, which cannot match %Y%m%d and
    # is therefore coerced to NaT below.
    digits = gregorian.where(gregorian.notna(), 0).astype('int64').astype(str)
    df['date'] = pd.to_datetime(digits, format='%Y%m%d', errors='coerce')
    return df

In [3]:
import pandas as pd
# Load 'holi_fonuan.csv' into `df` (the same file name that the to_csv cell
# of this notebook writes).
df = pd.read_csv('holi_fonuan.csv')

In [4]:
# Display the loaded frame.
df

Unnamed: 0,Q,主建物面積,主要建材,主要用途,交易年月日,交易標的,交易筆棟數,單價元平方公尺,土地位置建物門牌,土地移轉總面積平方公尺,...,車位移轉總面積(平方公尺),車位總價元,車位類別,都市土地使用分區,鄉鎮市區,附屬建物面積,陽台面積,電梯,單價元坪,建物型態2
0,1,80.61,鋼筋混凝土造,住家用,1051004,房地(土地+建物)+車位,土地1建物1車位1,45975.0,臺中市豐原區大仁街２１巷１６之１號三樓,23.05,...,0.00,0,坡道平面,住,豐原區,0.00,8.85,,151983.69525,華廈
1,1,34.30,鋼筋混凝土造,住家用,1051017,房地(土地+建物),土地1建物1車位0,25398.0,臺中市豐原區三和路２６之１號四樓,13.34,...,0.00,0,,住,豐原區,1.23,3.68,,83960.45442,華廈
2,1,54.30,鋼筋混凝土造,住家用,1050924,房地(土地+建物),土地1建物1車位0,24401.0,臺中市豐原區圓環東路569號5樓之3,7.70,...,0.00,0,,商,豐原區,1.36,11.76,,80664.58179,住宅大樓
3,1,82.91,鋼筋混凝土造,住家用,1050924,房地(土地+建物),土地3建物1車位0,30190.0,臺中市后里區甲后路一段４３號四樓,20.00,...,0.00,0,,商,后里區,0.00,9.44,,99801.80010,華廈
4,1,80.27,鋼筋混凝土造,住家用,1051003,房地(土地+建物),土地1建物1車位0,31627.0,臺中市豐原區中正路７４９巷２２弄４號二樓之１,24.16,...,0.00,0,,住,豐原區,0.00,4.05,,104552.22033,公寓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,1,21.97,鋼筋混凝土造,住家用,1090604,房地(土地+建物),土地1建物1車位0,80085.0,臺中市豐原區自強南街２７６號三樓之１,10.16,...,0.00,0,,住,豐原區,0.00,1.92,有,264744.19215,華廈
3907,1,19.93,鋼筋混凝土造,住家用,1090616,房地(土地+建物),土地1建物1車位0,81672.0,臺中市豐原區自強南街２７６號二樓之２,10.05,...,0.00,0,,住,豐原區,0.00,3.25,有,269990.48088,華廈
3908,1,19.93,鋼筋混凝土造,住家用,1090605,房地(土地+建物),土地1建物1車位0,79213.0,臺中市豐原區自強南街２７６號三樓之２,10.05,...,0.00,0,,住,豐原區,0.00,3.25,有,261861.54327,華廈
3909,1,20.28,鋼筋混凝土造,住家用,1090610,房地(土地+建物),土地1建物1車位0,80951.0,臺中市豐原區自強南街２７６號三樓之５,10.18,...,0.00,0,,住,豐原區,0.00,3.55,有,267607.00629,華廈


In [69]:
# Write the current frame to 'holi_fonuan.csv' without the index column.
# The 'utf-8-sig' codec puts a UTF-8 byte-order mark at the start of the file.
df.to_csv('holi_fonuan.csv', index=False, encoding='utf-8-sig')