In [187]:
#!/usr/bin/env python
# cofing: utf-8

# from retry import retry
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
from tqdm import tqdm
import re
import math
# from logging import getLogger, StreamHandler, Formatter, FileHandler, DEBUG
import yaml
import os
from retry import retry
import sys
import time

notebook = True
if notebook:
    work_dir = '/home/satakei/property_valuation_calculator'
    with open(work_dir + '/setting/kenbiya_scraping_config.yaml', 'r') as yml:
        config = yaml.safe_load(yml)
    area_name = 'tokyo'
else:
    work_dir = os.getcwd()
    with open(work_dir + '/setting/kenbiya_scraping_config.yaml', 'r') as yml:
        config = yaml.safe_load(yml)
    area_name = sys.argv[1]
if area_name == 'tokyo':
    base_url = config['base_url_tokyo']
elif area_name == 'osaka':
    base_url = config['base_url_osaka']
elif area_name == 'fukuoka':
    base_url = config['base_url_fukuoka']

def write_log(log_file, text):
    f = open(log_file, 'a', encoding='UTF-8')
    f.write(text)
    f.close()
    print(text)

diff_jst_from_utc = 0
start_time = dt.datetime.now() + dt.timedelta(hours=diff_jst_from_utc)
now_time = (dt.datetime.now() +
            dt.timedelta(hours=diff_jst_from_utc)).strftime('%Y%m%d_%H%M')

log_dir = work_dir + f'/log/scraping'
os.makedirs(log_dir, exist_ok=True)
log_file = log_dir + f'/{now_time}_log.txt'
f = open(log_file, 'w', encoding='UTF-8')
f.close()

text = 'processing_start_time:' + str(start_time.replace(microsecond=0)) + '\n'
write_log(log_file, text)
excution_date = dt.datetime.today().strftime('%Y%m%d')

# file_name = 'suumo_baibai'
# excution_date = dt.datetime.today().strftime('%Y%m%d')

def get_html(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup
# @retry(tries=3, delay=10, backoff=2)
# def main():
all_data = []

# 基本ページurl 
page = '1'
url = base_url.format(page = page)
write_log(log_file,'base_url:'+url+'\n')
# get html
item = get_html(url)

# extract all items
total_rooms = int(re.sub(r"\D", "",item.find(True,"strong", class_="result_num").get_text()))
max_page = math.floor(total_rooms/50)+ 1
text = f"max_page:{max_page} \n"
write_log(log_file,text)

url_list = []
all_data = []
error_page = []
# 物件URLの取得
data= {}

def extract_value(pattern,temp_property):
    match = re.search(pattern, temp_property)
    if match:
        return match.group(1)

# for i in tqdm(range(max_page+1)): 
for i in tqdm(range(2)): 
    all_data = []
    url = base_url.format(page = str(i))
    # get html
    item = get_html(url)
    for j in item.findAll(href=re.compile('/pp1/s/tokyo/.+/re')):
        room_url = 'https://www.kenbiya.com/'+j.get('href')
        write_log(log_file,'room_url:'+room_url+'\n')
        room_item = get_html(room_url)
        write_log(log_file,'room_item:'+str(room_item)+'\n')
        property_dict = {
            'property_name': '物件名',
            'price': '価格',
            'transportation': '交通',
            'address': '住所',
            'year_built': '築年月',
            'building_structure': '建物構造/階数',
            'exclusive_area': '専有面積',
            'floor_plan': '間取り',
            'transaction_method': '取引態様',
            'delivery': '引渡',
            'current_condition': '現況',
            'cap_rate': '満室時利回り',
            'full_occupancy_incom': '満室時年収/月収',
            'property_name': '物件名',
            'land_rights': '土地権利',
            'management_fee_repair_reserve_fund': '管理費/修繕積立',
            'management_company': '管理会社',
            'management_method': '管理方式/管理人',
            'last_update_date': '直前の更新日',
            'register_date': '情報公開日',
            'scheduled_update_date': '更新予定日',
            'management_id': '管理ID'
        }
        for k, l in property_dict.items():
            try:
                temp_property_data[k] = room_item.find('dt', text=l).find_next('dd').get_text()
            except:
                temp_property_data[k] = ''
        # 物件詳細のデータを収集
        # マンション名
        data["property_name"] = temp_property_data['property_name']

        # 価格
        data["price"] = int(temp_property_data['price'].replace(',', '').replace('万円', '')) * 10000

        # 交通
        data["train_line"] = temp_property_data['transportation'].split()[0]
        data["station"] = temp_property_data['transportation'].split()[1]
        temp_minutes_from_station = temp_property_data['transportation'].split()[2]
        pattern = r'徒歩(\d.+)分'
        data["minutes_from_station"] = extract_value(pattern,temp_minutes_from_station)

        # 住所
        temp_address= temp_property_data['address']
        pattern = r'^(.+(都|道|府|県))'
        data["prefecture_name"] = extract_value(pattern,temp_address)
        pattern = r'^(.+(市|区))'
        data["city_name"] = extract_value(pattern,temp_address)
        data["town_name"] = temp_address.replace(str(data["prefecture_name"]),'').replace(str(data["city_name"]),'')

        # 築年数
        temp_year_built = temp_property_data['year_built']
        pattern = r'築(\d*)年'
        data["year_built"] = int(extract_value(pattern,temp_year_built))

        # 構造
        temp_building_structure= temp_property_data['building_structure']
        pattern = r'^(.+)造'
        data["structure"] = extract_value(pattern,temp_building_structure)

        pattern = r'^.+造(\d+)階'
        data["floor"] = extract_value(pattern,temp_building_structure)

        pattern = r'^.+(\d.+)階建'
        data["max_floor"] = extract_value(pattern,temp_building_structure)

        # 総戸数
        temp_total_rooms = temp_property_data['building_structure'].strip()
        pattern = r'総戸数(\d+)戸'
        data["total_rooms"] = extract_value(pattern,temp_total_rooms)

        # 専有面積
        temp_exclusive_area= temp_property_data['exclusive_area']
        pattern = r'^(\d.+)m²'
        data["exclusive_area"] = extract_value(pattern,temp_exclusive_area)

        # 間取り
        data["floor_plan"] = temp_property_data['floor_plan'].split()[0]

        # 方角
        try:
            data["direction"] = temp_property_data['floor_plan'].split()[1]
        except:
            data["direction"] = None

        # 取引態様
        data["transaction_method"] = temp_property_data['transaction_method']

        # 引渡
        data["delivery"] = temp_property_data['delivery']

        # 現況
        data["current_condition"] = temp_property_data['current_condition']

        # 満室時利回り
        try:
            data["cap_rate"] = float(temp_property_data['cap_rate'].replace('％','')) / 100
        except:
            data["cap_rate"] = None

        # 満室時年収
        try:
            data["full_occupancy_incom"] = int(float(temp_property_data['full_occupancy_incom'].split()[0].replace('万円','')) * 10000)
        except:
            data["full_occupancy_incom"] = None

        # 土地権利
        data["land_rights"] = temp_property_data['land_rights']

        # 'management_fee_repair_reserve_fund': '管理費/修繕積立',
        data["management_fee"] = int(temp_property_data['management_fee_repair_reserve_fund'].split('/')[0].replace('円','').replace(',',''))
        data["repair_reserve_fund"] = int(temp_property_data['management_fee_repair_reserve_fund'].split('/')[1].replace('円','').replace(',',''))

        # 管理会社
        data["management_company"] = temp_property_data['management_company']

        # 'management_method': '管理方式/管理人',
        data["management_method"] = re.findall('(.*)\r\n  \r\n*', temp_property_data['management_method'].split('/')[0])[0]
        data["management_person"] = re.findall('\r\n    (.*)', temp_property_data['management_method'].split('/')[1])[0]

        # 直前の更新日
        date_format = '%Y年%m月%d日'
        try:
            text = temp_property_data['last_update_date'].replace(' ','')
            data["last_update_date"] = dt.datetime.strptime(text, date_format).date()
        except:
            text = temp_property_data['register_date'].replace(' ','')
            data["last_update_date"] = dt.datetime.strptime(text, date_format).date()

        # 'scheduled_update_date': '更新予定日',
        text = temp_property_data['scheduled_update_date'].replace(' ','')
        data["scheduled_update_date"] = dt.datetime.strptime(text, date_format).date()

        # 'management_id': '管理ID'
        data["management_id"] = temp_property_data['management_id']
        write_log(log_file,'Done:'+data["property_name"])
        time.sleep(3)
        all_data.append(data)
    # except:
    #     time.sleep(3)
    #     write_log(log_file,'error')

    df = pd.DataFrame(data)
    df.to_csv(work_dir+f'/scraping_raw/kenbiya_baibai_{excution_date}.csv',index = False)
    
text = 'df_shape:{}\n'.format(df.shape)
write_log(log_file,text)

end_time = dt.datetime.now() + dt.timedelta(hours=diff_jst_from_utc)
text = 'predicting done.\nend_time:{}\n'.format(end_time)
write_log(log_file,text)

processing_time = end_time - start_time
text = 'processing_time:{}\n'.format(processing_time)
write_log(log_file,text)

processing_start_time:2023-03-12 08:26:24

base_url:https://www.kenbiya.com/pp1/s/tokyo/n-1/ctk=292_300_311_299_293_302_307_298_294_301_310_314_295_303_308_313_304_306_309_312_296_305_297/

max_page:164 



 50%|█████     | 1/2 [00:00<00:00,  2.28it/s]

room_url:https://www.kenbiya.com//pp1/s/tokyo/bunkyo-ku/re_2973419xcg/

room_item:<!DOCTYPE html>

<html lang="ja">
<head>
<meta charset="utf-8"/>
<title>『東京都文京区』バストイレ別◆21平米越え（No.2973419xcg）｜健美家</title>
<meta content="WILL LINK JAPANが掲載する『東京都文京区』バストイレ別◆21平米越え（No.2973419xcg）の物件詳細ページです。不動産投資と収益物件の総合サイト健美家では、似た条件の物件や価格別、利回りが高い順、新着順などの条件で簡単に探せます。" name="description"/>
<meta content="不動産投資,東京都,文京区,『東京都文京区』バストイレ別◆21平米越え,2973419xcg,健美家,けんびや" name="keywords"/>
<meta content="noarchive,index,follow" name="robots"/>
<script>
window.dataLayer = window.dataLayer || [];
var ga_uid = ga_getUID();
if (ga_uid.login == 1) {
    dataLayer.push({'loginStatus' : 'Logged-in'});
}
else {
    dataLayer.push({'loginStatus' : 'Not_Logged-in'});
}
dataLayer.push({'userId' : ga_uid.uid});

function ga_getUID() {
    var cookies = document.cookie;
    var ret = {login: 0, uid: ''};
    if( cookies != '' ) {
        var c = cookies.split( '; ' );
        for( var i=0; i<c.length; i++ ) {
            var cookie = c

  temp_property_data[k] = room_item.find('dt', text=l).find_next('dd').get_text()
 50%|█████     | 1/2 [00:04<00:04,  4.05s/it]

room_url:https://www.kenbiya.com//pp1/s/tokyo/katsushika-ku/re_2973415sgl/

room_item:<!DOCTYPE html>

<html lang="ja">
<head>
<meta charset="utf-8"/>
<title>仯仯棙夞傝10亾仏捓戄拞仏僋儘僗慡柺挘傝懼偊嵪仯仯乮No.2973415sgl乯乥寬旤壠</title>
<meta content="WILL LINK JAPAN偑宖嵹偡傞仯仯棙夞傝10亾仏捓戄拞仏僋儘僗慡柺挘傝懼偊嵪仯仯乮No.2973415sgl乯偺暔審徻嵶儁乕僕偱偡丅晄摦嶻搳帒偲廂塿暔審偺憤崌僒僀僩寬旤壠偱偼丄帡偨忦審偺暔審傗壙奿暿丄棙夞傝偑崅偄弴丄怴拝弴側偳偺忦審偱娙扨偵扵偣傑偡丅" name="description"/>
<meta content="晄摦嶻搳帒,搶嫗搒,妺忺嬫,仯仯棙夞傝10亾仏捓戄拞仏僋儘僗慡柺挘傝懼偊嵪仯仯,2973415sgl,寬旤壠,偗傫傃傗" name="keywords"/>
<meta content="noarchive,index,follow" name="robots"/>
<script>
window.dataLayer = window.dataLayer || [];
var ga_uid = ga_getUID();
if (ga_uid.login == 1) {
    dataLayer.push({'loginStatus' : 'Logged-in'});
}
else {
    dataLayer.push({'loginStatus' : 'Not_Logged-in'});
}
dataLayer.push({'userId' : ga_uid.uid});

function ga_getUID() {
    var cookies = document.cookie;
    var ret = {login: 0, uid: ''};
    if( cookies != '' ) {
        var c = cookies.split( '; ' );
        for( var i=0; i<c.length; i++ ) {
          




ValueError: invalid literal for int() with base 10: ''

In [188]:
temp_property_data

{'property_name': '',
 'price': '',
 'transportation': '',
 'address': '',
 'year_built': '',
 'building_structure': '',
 'exclusive_area': '',
 'floor_plan': '',
 'transaction_method': '',
 'delivery': '',
 'current_condition': '',
 'cap_rate': '',
 'full_occupancy_incom': '',
 'land_rights': '',
 'management_fee_repair_reserve_fund': '',
 'management_company': '',
 'management_method': '',
 'last_update_date': '',
 'register_date': '',
 'scheduled_update_date': '',
 'management_id': ''}

In [175]:
data

{'property_name': ''}

In [183]:
room_url

'https://www.kenbiya.com//pp1/s/tokyo/katsushika-ku/re_2973415sgl/'

In [153]:
extract_value('築(\d*)年',temp_year_built)

'9'

In [47]:
for i in item.findAll(href=re.compile('/pp1/s/tokyo/.+/re')):
    print(i.get('href'))

/pp1/s/tokyo/ota-ku/re_297325082j/
/pp1/s/tokyo/katsushika-ku/re_2973245uz3/
/pp1/s/tokyo/minato-ku/re_2973243nhb/
/pp1/s/tokyo/nerima-ku/re_2973227i4k/
/pp1/s/tokyo/nerima-ku/re_2973225yph/
/pp1/s/tokyo/katsushika-ku/re_2973198j1s/
/pp1/s/tokyo/katsushika-ku/re_2973194zgg/
/pp1/s/tokyo/chuo-ku/re_2973193zun/
/pp1/s/tokyo/sumida-ku/re_2973192zte/
/pp1/s/tokyo/chuo-ku/re_2973181rcn/
/pp1/s/tokyo/meguro-ku/re_2973176rqp/
/pp1/s/tokyo/nakano-ku/re_2973167jwg/
/pp1/s/tokyo/taito-ku/re_2973157n1a/
/pp1/s/tokyo/shinjuku-ku/re_297315596t/
/pp1/s/tokyo/sumida-ku/re_2973153tl2/
/pp1/s/tokyo/shibuya-ku/re_29731399de/
/pp1/s/tokyo/taito-ku/re_2973136wq8/
/pp1/s/tokyo/ota-ku/re_2973135nip/
/pp1/s/tokyo/itabashi-ku/re_2973129s27/
/pp1/s/tokyo/minato-ku/re_2973128abi/
/pp1/s/tokyo/koto-ku/re_2973127qm4/
/pp1/s/tokyo/ota-ku/re_2973125tjm/
/pp1/s/tokyo/setagaya-ku/re_29731238ok/
/pp1/s/tokyo/sumida-ku/re_2973116fir/
/pp1/s/tokyo/chuo-ku/re_2973111dlh/
/pp1/s/tokyo/suginami-ku/re_2973105bjh/
/pp1/s/tok

In [16]:
url

'https://www.kenbiya.com/pp1/s/tokyo/n-1/ctk=292_300_311_299_293_302_307_298_294_301_310_314_295_303_308_313_304_306_309_312_296_305_297/'

In [19]:
room_url_list

[<a href="/pp1/s/tokyo/ota-ku/re_297325082j/" target="_blank">
 <ul class="prop_block">
 <li class="photo">
 <input data-closed_flg="N" id="ck_1" name="ck_pp" type="checkbox" value="297325082j"/><label for="ck_1">
 <img alt="不動産投資の投資用マンション" class="cate_icon" loading="lazy" src="/images/prop/cate_ONE.svg"/>
 <p><img alt="東京都大田区の投資用マンション／大森町_画像" loading="lazy" src="/upload/p2973/2973250/1678593527450-d7r_000.jpg"/></p>
 </label>
 </li>
 <li class="main">
 <ul>
 <li><h3>▲▼大田区・平成7年築▲内装リフォーム済・室内洗置▲▼</h3></li>
 <li>東京都大田区大森西</li>
 <li>京急本線 大森町駅 歩3分</li>
 <li><span class="new">New</span></li>
 </ul>
 </li>
 <li class="price">
 <ul>
 <li><span>1,400</span>万円</li>
 <li><span>5<span>.57</span></span>％</li>
 </ul>
 </li>
 <li>
 <ul>
 <li>専:17.43m²</li>
 </ul>
 </li>
 <li>
 <ul>
 <li>1995年6月</li>
 <li>4階/8階建</li>
 </ul>
 </li>
 </ul>
 </a>,
 <a href="/pp1/s/tokyo/katsushika-ku/re_2973245uz3/" target="_blank">
 <ul class="prop_block">
 <li class="photo">
 <input data-closed_flg="N" id="ck_2" name="