## crawler data

In [7]:
import requests
import os
import zipfile
import time
import datetime

def real_estate_crawler(year, season):
  if year > 1000:
    year -= 1911

  # download real estate zip content
  res = requests.get("https://plvr.land.moi.gov.tw//DownloadSeason?season="+str(year)+"S"+str(season)+"&type=zip&fileName=lvr_landcsv.zip")

  # check if there's error return, if not, continue
  try:
    if '系統訊息' in res.content.decode():
      pass
  except:
      # save content to file
      fname = str(year)+str(season)+'.zip'
      open(fname, 'wb').write(res.content)

      # make additional folder for files to extract
      folder = 'real_estate' + str(year) + str(season)
      if not os.path.isdir(folder):
        os.mkdir(folder)  

      # extract files to the folder
      with zipfile.ZipFile(fname, 'r') as zip_ref:
          zip_ref.extractall(folder)
      time.sleep(5)
      #delete zip file
      os.remove(fname)
      
def get_folder_name():
    folder_name = os.listdir()
    return [x.split('real_estate')[1] for x in folder_name if x.startswith('real_estate')]

def get_file():
    #get current folder name
    current_list=get_folder_name()
    print(f'current list: {current_list}')
    #get current year in TW format
    current_year = datetime.datetime.now().year -1911
    
    for year in range(112, current_year+1):
        """which year range you want to check"""
        for season in range(1,5):
            ## check if already had extract files
            if str(year)+str(season) not in current_list:
                print(f' now in year:{year} and quarter: {season}')
                real_estate_crawler(year, season)
    print('Get data complete.')

In [None]:
get_file()

## Get data



In [5]:
import os
import pandas as pd

# 歷年資料夾
dirs = [d for d in os.listdir() if d[:4] == 'real']

dfs = []

for d in dirs:
    print(d)
    df = pd.read_csv(os.path.join(d,'b_lvr_land_a.csv'), index_col=False)
    df['Q'] = d[-1]
    dfs.append(df.iloc[1:])
    
df = pd.concat(dfs, sort=True)
df.reset_index(inplace=True, drop=True)

real_estate1061
real_estate1062
real_estate1063
real_estate1064
real_estate1071
real_estate1072
real_estate1073
real_estate1074
real_estate1081
real_estate1082
real_estate1083
real_estate1084
real_estate1091
real_estate1092
real_estate1093
real_estate1094


  df = pd.read_csv(os.path.join(d,'b_lvr_land_a.csv'), index_col=False)
  df = pd.read_csv(os.path.join(d,'b_lvr_land_a.csv'), index_col=False)


real_estate1101
real_estate1102
real_estate1103
real_estate1104
real_estate1111
real_estate1112
real_estate1113
real_estate1114
real_estate1121


In [None]:
df['單價元平方公尺'].fillna(df['單價元/平方公尺'], inplace=True)
df.drop(columns='單價元/平方公尺')

### Data pre-processing

In [None]:
def drop_nan_columns(df, columns):
    ## input a dataframe assign a columns name, drop columns with all nan values base on certain columns
    for column in columns:
        df = df.dropna(subset=[column])
    return df

# 不同名稱同項目資料合併
# df['單價元平方公尺'].fillna(df['單價元/平方公尺'], inplace=True)
# df.drop(columns='單價元/平方公尺')

# 平方公尺換成坪
df = drop_nan_columns(df, ['單價元平方公尺'])
df['單價元平方公尺'] = df['單價元平方公尺'].astype(float)
df['單價元坪'] = df['單價元平方公尺'] * 3.30579

# 建物型態
df['建物型態2'] = df['建物型態'].str.split('(').str[0]
df.drop(columns=['建物型態'],axis=1,inplace=True)

# 刪除有備註之交易（多為親友交易、價格不正常之交易）
df = df[df['備註'].isnull()]

## give some list and drop columns
df = df.drop(['備註','移轉編號','編號','非都市土地使用分區','非都市土地使用編定'], axis=1)

##交易筆棟數 change to 土地 建物 車位
## a datafram column 交易筆棟數 element format is 土地1建物1車位1, I want to split to three columns which is ['土地','建物','車位',] element is 1, 1, 1
df['土地'] = df['交易筆棟數'].str.split('土地',expand=True)[1].str.split('建物',expand=True)[0]
df['建物'] = df['交易筆棟數'].str.split('建物',expand=True)[1].str.split('車位',expand=True)[0]
df['車位'] = df['交易筆棟數'].str.split('車位',expand=True)[1]
df.drop(columns=['交易筆棟數'],axis=1,inplace=True)

#split to solely 
df['交易標的'] = df['交易標的'].str.split('(',expand=True)[0]

df.drop(columns=['建物現況格局-隔間'],axis=1,inplace=True)

## rename
df.rename(columns={'建物現況格局-廳': '廳數', '建物現況格局-房':'房','建物現況格局-衛':'衛'}, inplace=True)

#some data form is corrupted need to drop in 交易年月日 and 建築完成年月
df = df[df['交易年月日'].astype(str).apply(lambda x:(x.split('.')[0][-4:]))!='0000']
df = df[df['建築完成年月'].astype(str).apply(lambda x:(x.split('.')[0][-2:]))!='00']
df = df[df['建築完成年月'].astype(str).apply(lambda x:(x.split('.')[0][-4:-2]))!='00']
df = df[df['建築完成年月'].astype(str).apply(lambda x:(len(x)))>5]

#translate to date format
df['year'] = df['交易年月日'].astype(str).str[:-4].astype(int) + 1911
df['trade_date'] = pd.to_datetime(dict(year=df['year'], month=df['交易年月日'].astype(str).apply(lambda x:(x[-4:-2])), day=df['交易年月日'].astype(str).apply(lambda x:(x[-2:]))))

df['build_year'] = df['建築完成年月'].astype(str).str[:-4].astype(int) + 1911
# space in this column cause error
df = df[df['建築完成年月'].apply(lambda x:(' ' not in str(x)))]


### filter data

In [53]:
#exclude only 土地
df = df[df['交易標的']!='土地']

#建物型態2 to only 華夏 住宅大樓 公寓 透天
df = df[df['建物型態2'].isin(['住宅大樓','華廈','透天厝','公寓'])]

#exclude 商業工業用... but many is 見其他登記事項
df = df[df['主要用途'].apply(lambda x:(('住' in str(x)) or ('見其他' in str(x))))]


### Save data

In [54]:
df.to_csv('holi_fonuan.csv', index=False, encoding='utf-8-sig')

In [None]:
# 將index改成年月日
# df = drop_nan_columns(df, ['交易年月日'])
# df.index = pd.to_datetime((df['交易年月日'].str[:-4].astype(int) + 1911).astype(str) + df['交易年月日'].str[-4:] ,errors='coerce')

In [55]:
df[['主要用途','廳數']].groupby(['主要用途']).size().loc[lambda x: x>10].sort_values(ascending=False).reset_index()

Unnamed: 0,主要用途,0
0,住家用,71862
1,見其他登記事項,64528
2,住商用,5076
3,集合住宅,3056
4,住宅、停車空間,192
5,住宅、停車空間、樓梯間,101
6,住宅，停車空間,76
7,住工用,67
8,住宅、樓梯間,59
9,住宅,49


In [99]:
df['建築完成年月'].astype(str).apply(lambda x:(x[-2:]))

0         30
1         04
2         29
3         28
4         29
          ..
145538    28
145539    28
145540    28
145541    13
145542    20
Name: 建築完成年月, Length: 145543, dtype: object

In [146]:
df.reset_index(inplace=True, drop=True)

In [138]:
df = df[df['建築完成年月'].astype(str).apply(lambda x:(x.split('.')[0][-4:]))!='0000']

In [144]:
df.iloc[93196]

level_0                        93197
index                         165857
Q                                  3
主建物面積                          52.26
主要建材                         見其他登記事項
主要用途                             住家用
交易年月日                        1090222
交易標的                              房地
單價元平方公尺                     105243.0
土地位置建物門牌                中清路六段５１３巷５６號
土地移轉總面積平方公尺                   133.52
廳數                                 0
房                                  0
衛                                  0
建物移轉總面積平方公尺                    52.26
建築完成年月                       0690100
有無管理組織                             無
移轉層次                               全
總價元                          5500000
總樓層數                              一層
車位移轉總面積(平方公尺)                    0.0
車位總價元                              0
車位類別                             NaN
都市土地使用分區                都市：其他:第二種住宅區
鄉鎮市區                             沙鹿區
附屬建物面積                           0.0
陽台面積                             0.0
電

In [149]:
df['建築完成年月'].astype(int).astype(str).apply(lambda x:(x.split('.')[0][-4:-2])).drop_duplicates()

0     09
1     05
2     10
5     07
6     11
9     08
10    12
14    04
17    02
30    06
31    01
40    03
Name: 建築完成年月, dtype: object

In [None]:
df['build_date'] = pd.to_datetime(dict(year=df['build_year'], month=df['建築完成年月'].astype(str).apply(lambda x:(x.split('.')[0][-4:-2])), day=df['建築完成年月'].astype(int).astype(str).apply(lambda x:(x.split('.')[0][-2:]))))

In [104]:
df['build_year'] = df['建築完成年月'].astype(str).str[:-4].astype(int) + 1911

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['build_year'] = df['建築完成年月'].astype(str).str[:-4].astype(int) + 1911


In [76]:
## I had a dataframe, one of column is list of string format is 1051004, left 3 need to add 1911, middle 10 is month, right 2 is day
import pandas as pd
def  add_date(df):
    df['date'] = df['交易年月日'].astype(int).apply(lambda x: x[:3] + '1911' + x[3:11] + x[11:])
    return df