## crawler data

In [7]:
import requests
import os
import zipfile
import time
import datetime

def real_estate_crawler(year, season):
  """Download one season of real-estate data and unpack it into its own folder.

  The archive for the given season is requested from plvr.land.moi.gov.tw.
  When the response is a zip archive, its contents are extracted into
  ``real_estate<year><season>`` in the current working directory. Any other
  response (for example a system-message page returned for a season that is
  not available) is skipped and no folder is created.

  Args:
    year: ROC (Taiwan calendar) year, or a Gregorian year (> 1000), which is
      converted by subtracting 1911.
    season: Season (quarter) number used in the download URL.

  Returns:
    None.
  """
  import io  # local import: only needed to open the downloaded bytes as a file

  # accept Gregorian years as well as ROC years
  if year > 1000:
    year -= 1911

  # download real estate zip content (timeout so a stalled server cannot hang the notebook)
  res = requests.get("https://plvr.land.moi.gov.tw//DownloadSeason?season="+str(year)+"S"+str(season)+"&type=zip&fileName=lvr_landcsv.zip", timeout=60)
  payload = res.content

  # The server answers with a text page instead of an archive when the season
  # cannot be served; check for a real zip explicitly rather than relying on
  # a decode error as the original did.
  if not zipfile.is_zipfile(io.BytesIO(payload)):
    print(f' no zip archive returned for year:{year} and quarter: {season}, skipped')
    return

  # make additional folder for files to extract
  folder = 'real_estate' + str(year) + str(season)
  os.makedirs(folder, exist_ok=True)

  # extract files to the folder straight from memory (no temporary zip on disk)
  with zipfile.ZipFile(io.BytesIO(payload), 'r') as zip_ref:
    zip_ref.extractall(folder)

  # pause after a successful download — presumably to throttle requests to the server; TODO confirm
  time.sleep(5)
      
def get_folder_name():
    """List the suffixes of ``real_estate*`` entries in the working directory.

    Every entry of the current directory whose name starts with
    ``real_estate`` contributes the text that follows that prefix
    (e.g. ``real_estate1121`` -> ``'1121'``).

    Returns:
        list[str]: The suffixes, in the order ``os.listdir()`` reports them.
    """
    prefix = 'real_estate'
    suffixes = []
    for entry in os.listdir():
        if entry.startswith(prefix):
            suffixes.append(entry.split(prefix)[1])
    return suffixes

def get_file(start_year=112):
    """Download every season that has not been extracted yet.

    Reads the existing ``real_estate<year><season>`` folder suffixes via
    ``get_folder_name`` and calls ``real_estate_crawler`` for each missing
    (year, season) pair, from ``start_year`` through the current year,
    seasons 1 to 4.

    Args:
        start_year: First year to check, as an ROC (Taiwan calendar) year.
            A Gregorian year (> 1000) is converted by subtracting 1911, the
            same rule ``real_estate_crawler`` applies. Defaults to 112, the
            value that used to be hard-coded.

    Returns:
        None.
    """
    # accept Gregorian years as well as ROC years
    if start_year > 1000:
        start_year -= 1911

    #get current folder name
    current_list=get_folder_name()
    print(f'current list: {current_list}')
    #get current year in TW format
    current_year = datetime.datetime.now().year -1911

    for year in range(start_year, current_year+1):
        for season in range(1,5):
            ## check if already had extract files
            if str(year)+str(season) not in current_list:
                print(f' now in year:{year} and quarter: {season}')
                real_estate_crawler(year, season)
    print('Get data complete.')

In [None]:
# Download every season that does not yet have a local real_estate<year><season> folder.
get_file()

## Get data



In [None]:
import os
import pandas as pd

# Folders holding the extracted data of each season (named real_estate<year><season>).
# Match the full 'real_estate' prefix and require a directory, so stray files or
# folders that merely start with "real" are not picked up; sort for a stable load order.
dirs = sorted(
    d for d in os.listdir()
    if d.startswith('real_estate') and os.path.isdir(d)
)

dfs = []

for d in dirs:
    print(d)
    df = pd.read_csv(os.path.join(d,'b_lvr_land_a.csv'), index_col=False)
    # the last character of the folder name is the season (quarter) number
    df['Q'] = d[-1]
    # skip the first data row — presumably the secondary (English) header line of the CSV; TODO confirm
    dfs.append(df.iloc[1:])

# stack all seasons into one frame with a fresh 0..n-1 index
df = pd.concat(dfs, sort=True, ignore_index=True)

In [105]:
# Merge the two spellings of the unit-price column into one, then remove the duplicate.
# Both results are assigned back: fillna(inplace=True) on a selected column is chained
# assignment and is not guaranteed to update df, and drop() returns a new frame.
# The guard keeps the cell re-runnable once the duplicate column is gone.
if '單價元/平方公尺' in df.columns:
    df['單價元平方公尺'] = df['單價元平方公尺'].fillna(df['單價元/平方公尺'])
    df = df.drop(columns='單價元/平方公尺')

<class 'str'>
<class 'int'>


In [None]:
def drop_nan_columns(df, columns):
    """Drop the rows that have a missing value in any of the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column names that must be non-null.

    Returns:
        A DataFrame holding only the rows where every listed column is
        non-null. With no columns given, ``df`` itself is returned.
    """
    required = list(columns)
    if not required:
        return df
    return df.dropna(subset=required)

# Add the transaction year (Gregorian): drop the trailing four characters (MMDD)
# of the ROC-format date and add 1911.
df['year'] = df['交易年月日'].astype(str).str[:-4].astype(int) + 1911

# Merge columns that hold the same item under different names (disabled here)
# df['單價元平方公尺'].fillna(df['單價元/平方公尺'], inplace=True)
# df.drop(columns='單價元/平方公尺')

# Convert the unit price from per-square-metre to per-ping.
# Rows without a unit price are removed first so the float cast cannot hit missing values.
df = drop_nan_columns(df, ['單價元平方公尺'])
df['單價元平方公尺'] = df['單價元平方公尺'].astype(float)
# 3.30579 is the square-metre-to-ping factor, so price/m² * 3.30579 = price/ping
df['單價元坪'] = df['單價元平方公尺'] * 3.30579

# Building type: keep only the text before the first '('
# (e.g. '住宅大樓(11層含以上有電梯)' -> '住宅大樓')
df['建物型態2'] = df['建物型態'].str.split('(').str[0]

# Drop transactions that carry a remark (mostly transactions between relatives/friends
# or with abnormal prices)
df = df[df['備註'].isnull()]

## Drop the remark, transfer-number, serial-number and non-urban zoning columns
df = df.drop(['備註','移轉編號','編號','非都市土地使用分區','非都市土地使用編定'], axis=1)

## Split 交易筆棟數 into three columns: 土地 (land), 建物 (building), 車位 (parking)
## Each value looks like 土地1建物1車位1; the pieces between the labels
## ('1', '1', '1' in that example) go to the columns ['土地','建物','車位'].
# text after '土地' and before '建物'
df['土地'] = df['交易筆棟數'].str.split('土地',expand=True)[1].str.split('建物',expand=True)[0]
# text after '建物' and before '車位'
df['建物'] = df['交易筆棟數'].str.split('建物',expand=True)[1].str.split('車位',expand=True)[0]
# text after '車位'
df['車位'] = df['交易筆棟數'].str.split('車位',expand=True)[1]
df.drop(columns=['交易筆棟數'],axis=1,inplace=True)

# Keep only the part of 交易標的 before the first '(' (e.g. '房地(土地+建物)+車位' -> '房地')
df['交易標的'] = df['交易標的'].str.split('(',expand=True)[0]

# Discard the 建物現況格局-隔間 (partition) column
df.drop(columns=['建物現況格局-隔間'],axis=1,inplace=True)

## Rename the layout columns to short names: 廳 (living rooms), 房 (rooms), 衛 (bathrooms)
df.rename(columns={'建物現況格局-廳': '廳數', '建物現況格局-房':'房','建物現況格局-衛':'衛'}, inplace=True)


In [6]:
df

Unnamed: 0,Q,主建物面積,主要建材,主要用途,交易年月日,交易標的,交易筆棟數,單價元平方公尺,土地位置建物門牌,土地移轉總面積平方公尺,...,車位總價元,車位類別,都市土地使用分區,鄉鎮市區,附屬建物面積,陽台面積,電梯,year,單價元坪,建物型態2
0,1,58.33,鋼筋混凝土造,住家用,1050928,房地(土地+建物)+車位,土地1建物1車位1,52398.0,臺中市南區工學一街１７９巷８號３樓之５,13.56,...,0,坡道機械,住,南區,0.0,4.48,,2016,173216.78442,住宅大樓
1,1,24.57,鋼筋混凝土造,住家用,1051013,房地(土地+建物),土地3建物1車位0,34562.0,臺中市西區三民路一段３９巷３弄９號６樓之６,4.38,...,0,,住,西區,0.0,6.21,,2016,114254.71398,住宅大樓
2,1,100.93,鋼筋混凝土造,見其他登記事項,1051001,房地(土地+建物)+車位,土地1建物1車位2,68688.0,臺中市南區工學北路３５７號１１樓之２,25.77,...,0,坡道平面,住,南區,6.38,12.49,,2016,227068.10352,住宅大樓
5,1,79.7,鋼筋混凝土造,見其他登記事項,1051005,房地(土地+建物)+車位,土地1建物1車位1,58407.0,臺中市南區五權南一路５５號十五樓之一,15.7,...,0,坡道平面,住,南區,2.37,3.79,,2016,193081.27653,住宅大樓
6,1,161.03,鋼筋混凝土造,見其他登記事項,1050908,房地(土地+建物)+車位,土地1建物1車位3,75108.0,臺中市西區大墩十一街９號十二樓之五,49.57,...,0,坡道平面,住,西區,8.34,16.07,,2016,248291.27532,住宅大樓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313209,1,52.82,鋼筋混凝土造,住家用,1071017,房地(土地+建物)+車位,土地1建物1車位1,77966.0,臺中市北屯區四平路５７６號九樓之３,10.4,...,1200000,坡道平面,住,北屯區,0.0,7.84,有,2018,257739.22314,住宅大樓
313210,1,52.82,鋼筋混凝土造,住家用,1071017,房地(土地+建物)+車位,土地1建物1車位1,77966.0,臺中市北屯區四平路５７６號十樓之３,10.4,...,1200000,坡道平面,住,北屯區,0.0,7.84,有,2018,257739.22314,住宅大樓
313246,1,53.42,鋼筋混凝土造,住家用,1070512,房地(土地+建物)+車位,土地1建物1車位1,72122.0,臺中市北屯區四平路５６８巷５號二十樓之３,10.4,...,1100000,坡道平面,住,北屯區,0.0,7.24,有,2018,238420.18638,住宅大樓
313270,1,116.1,鋼筋混凝土造,住家用,1000205,房地(土地+建物)+車位,土地1建物1車位2,96477.0,臺中市北屯區軍福七路１５８號五樓之１,28.56,...,0,坡道平面,住,北屯區,5.59,17.22,有,2011,318932.70183,住宅大樓


In [None]:
# 將index改成年月日
# df = drop_nan_columns(df, ['交易年月日'])
# df.index = pd.to_datetime((df['交易年月日'].str[:-4].astype(int) + 1911).astype(str) + df['交易年月日'].str[-4:] ,errors='coerce')

In [44]:
# Gregorian transaction year: the ROC date without its trailing MMDD digits, plus 1911.
df['year'] = df['交易年月日'].astype(str).str.slice(stop=-4).astype(int) + 1911

In [39]:
# Remove the shortened building-type helper column (建物型態2) from df in place.
df.drop(columns='建物型態2', inplace=True)

In [43]:
# Distinct bathroom counts. The cleaning cell renames '建物現況格局-衛' to '衛',
# so the short name is the one that exists when the notebook runs top to bottom.
df['衛'].drop_duplicates()

0          1
1          2
6          3
8          0
26         6
          ..
192645    58
207088    61
228197    55
228718    66
291827    41
Name: 建物現況格局-衛, Length: 72, dtype: object

In [45]:
df.columns


Index(['Q', '主建物面積', '主要建材', '主要用途', '交易年月日', '交易標的', '單價元平方公尺', '土地位置建物門牌',
       '土地移轉總面積平方公尺', '建物型態', '廳數', '房', '衛', '建物移轉總面積平方公尺', '建築完成年月',
       '有無管理組織', '移轉層次', '總價元', '總樓層數', '車位移轉總面積(平方公尺)', '車位總價元', '車位類別',
       '都市土地使用分區', '鄉鎮市區', '附屬建物面積', '陽台面積', '電梯', 'year', '單價元坪', '土地', '建物',
       '車位', '交易標的2'],
      dtype='object')

In [46]:
df

Unnamed: 0,Q,主建物面積,主要建材,主要用途,交易年月日,交易標的,單價元平方公尺,土地位置建物門牌,土地移轉總面積平方公尺,建物型態,...,鄉鎮市區,附屬建物面積,陽台面積,電梯,year,單價元坪,土地,建物,車位,交易標的2
0,1,58.33,鋼筋混凝土造,住家用,1050928,房地,52398.0,臺中市南區工學一街１７９巷８號３樓之５,13.56,住宅大樓(11層含以上有電梯),...,南區,0.0,4.48,,2016,173216.78442,1,1,1,房地
1,1,24.57,鋼筋混凝土造,住家用,1051013,房地,34562.0,臺中市西區三民路一段３９巷３弄９號６樓之６,4.38,住宅大樓(11層含以上有電梯),...,西區,0.0,6.21,,2016,114254.71398,3,1,0,房地
2,1,100.93,鋼筋混凝土造,見其他登記事項,1051001,房地,68688.0,臺中市南區工學北路３５７號１１樓之２,25.77,住宅大樓(11層含以上有電梯),...,南區,6.38,12.49,,2016,227068.10352,1,1,2,房地
5,1,79.7,鋼筋混凝土造,見其他登記事項,1051005,房地,58407.0,臺中市南區五權南一路５５號十五樓之一,15.7,住宅大樓(11層含以上有電梯),...,南區,2.37,3.79,,2016,193081.27653,1,1,1,房地
6,1,161.03,鋼筋混凝土造,見其他登記事項,1050908,房地,75108.0,臺中市西區大墩十一街９號十二樓之五,49.57,住宅大樓(11層含以上有電梯),...,西區,8.34,16.07,,2016,248291.27532,1,1,3,房地
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313209,1,52.82,鋼筋混凝土造,住家用,1071017,房地,77966.0,臺中市北屯區四平路５７６號九樓之３,10.4,住宅大樓(11層含以上有電梯),...,北屯區,0.0,7.84,有,2018,257739.22314,1,1,1,房地
313210,1,52.82,鋼筋混凝土造,住家用,1071017,房地,77966.0,臺中市北屯區四平路５７６號十樓之３,10.4,住宅大樓(11層含以上有電梯),...,北屯區,0.0,7.84,有,2018,257739.22314,1,1,1,房地
313246,1,53.42,鋼筋混凝土造,住家用,1070512,房地,72122.0,臺中市北屯區四平路５６８巷５號二十樓之３,10.4,住宅大樓(11層含以上有電梯),...,北屯區,0.0,7.24,有,2018,238420.18638,1,1,1,房地
313270,1,116.1,鋼筋混凝土造,住家用,1000205,房地,96477.0,臺中市北屯區軍福七路１５８號五樓之１,28.56,住宅大樓(11層含以上有電梯),...,北屯區,5.59,17.22,有,2011,318932.70183,1,1,2,房地


In [42]:
# Keep only three building types: residential towers (11+ floors, elevator),
# mid-rise buildings (up to 10 floors, elevator) and walk-up apartments
# (up to 5 floors, no elevator); every other type is excluded.
kept_building_types = [
    '住宅大樓(11層含以上有電梯)',
    '華廈(10層含以下有電梯)',
    '公寓(5樓含以下無電梯)',
]
df = df[df['建物型態'].isin(kept_building_types)]

In [62]:
# Restrict the data to the 后里區 and 豐原區 districts.
target_districts = ['后里區', '豐原區']
df = df.loc[df['鄉鎮市區'].isin(target_districts)]

In [64]:
# Renumber the rows 0..n-1 after filtering. The result is assigned back because
# reset_index returns a new frame, and drop=True keeps the old labels from being
# added as an 'index' column (the same options the load cell uses).
df = df.reset_index(drop=True)

In [76]:
## The 交易年月日 column holds ROC-calendar dates such as 1051004: the digits before the last four are the ROC year (add 1911 for the Gregorian year), the next two digits are the month and the last two are the day.
import pandas as pd
def add_date(df):
    """Add a ``date`` column holding the Gregorian transaction date.

    ``交易年月日`` stores ROC-calendar dates as digits (e.g. 1051004): the
    digits before the last four are the ROC year, followed by a two-digit
    month and a two-digit day. Adding 19110000 shifts the year part by 1911
    and leaves month and day untouched, giving YYYYMMDD
    (1051004 -> 20161004).

    Args:
        df: DataFrame with a ``交易年月日`` column of ints or digit strings.
            It is modified in place.

    Returns:
        The same DataFrame with a new datetime ``date`` column. Values that
        are missing, non-numeric or not a valid calendar date become ``NaT``.
    """
    # non-numeric entries become NaN instead of raising
    roc_dates = pd.to_numeric(df['交易年月日'], errors='coerce')
    # nullable Int64 keeps missing values while dropping the float '.0' suffix
    gregorian_digits = (roc_dates + 19110000).astype('Int64').astype(str)
    df['date'] = pd.to_datetime(gregorian_digits, format='%Y%m%d', errors='coerce')
    return df

In [3]:
import pandas as pd
# Load holi_fonuan.csv (presumably the 后里區/豐原區 subset exported by this notebook).
# NOTE(review): the cell that writes this file (df.to_csv('holi_fonuan.csv', ...)) sits at
# the end of the notebook, so on a fresh top-to-bottom run the file must already exist —
# confirm the intended cell order.
df = pd.read_csv('holi_fonuan.csv')

In [4]:
df

Unnamed: 0,Q,主建物面積,主要建材,主要用途,交易年月日,交易標的,交易筆棟數,單價元平方公尺,土地位置建物門牌,土地移轉總面積平方公尺,...,車位移轉總面積(平方公尺),車位總價元,車位類別,都市土地使用分區,鄉鎮市區,附屬建物面積,陽台面積,電梯,單價元坪,建物型態2
0,1,80.61,鋼筋混凝土造,住家用,1051004,房地(土地+建物)+車位,土地1建物1車位1,45975.0,臺中市豐原區大仁街２１巷１６之１號三樓,23.05,...,0.00,0,坡道平面,住,豐原區,0.00,8.85,,151983.69525,華廈
1,1,34.30,鋼筋混凝土造,住家用,1051017,房地(土地+建物),土地1建物1車位0,25398.0,臺中市豐原區三和路２６之１號四樓,13.34,...,0.00,0,,住,豐原區,1.23,3.68,,83960.45442,華廈
2,1,54.30,鋼筋混凝土造,住家用,1050924,房地(土地+建物),土地1建物1車位0,24401.0,臺中市豐原區圓環東路569號5樓之3,7.70,...,0.00,0,,商,豐原區,1.36,11.76,,80664.58179,住宅大樓
3,1,82.91,鋼筋混凝土造,住家用,1050924,房地(土地+建物),土地3建物1車位0,30190.0,臺中市后里區甲后路一段４３號四樓,20.00,...,0.00,0,,商,后里區,0.00,9.44,,99801.80010,華廈
4,1,80.27,鋼筋混凝土造,住家用,1051003,房地(土地+建物),土地1建物1車位0,31627.0,臺中市豐原區中正路７４９巷２２弄４號二樓之１,24.16,...,0.00,0,,住,豐原區,0.00,4.05,,104552.22033,公寓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,1,21.97,鋼筋混凝土造,住家用,1090604,房地(土地+建物),土地1建物1車位0,80085.0,臺中市豐原區自強南街２７６號三樓之１,10.16,...,0.00,0,,住,豐原區,0.00,1.92,有,264744.19215,華廈
3907,1,19.93,鋼筋混凝土造,住家用,1090616,房地(土地+建物),土地1建物1車位0,81672.0,臺中市豐原區自強南街２７６號二樓之２,10.05,...,0.00,0,,住,豐原區,0.00,3.25,有,269990.48088,華廈
3908,1,19.93,鋼筋混凝土造,住家用,1090605,房地(土地+建物),土地1建物1車位0,79213.0,臺中市豐原區自強南街２７６號三樓之２,10.05,...,0.00,0,,住,豐原區,0.00,3.25,有,261861.54327,華廈
3909,1,20.28,鋼筋混凝土造,住家用,1090610,房地(土地+建物),土地1建物1車位0,80951.0,臺中市豐原區自強南街２７６號三樓之５,10.18,...,0.00,0,,住,豐原區,0.00,3.55,有,267607.00629,華廈


In [69]:
# Write the current frame to holi_fonuan.csv without the index column.
# 'utf-8-sig' prepends a byte-order mark — presumably so spreadsheet tools detect the
# UTF-8 encoding of the Chinese text; confirm.
df.to_csv('holi_fonuan.csv', index=False, encoding='utf-8-sig')