In [9]:
# importing the necessary packages
import os
from random import randint
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Map each HCM City district slug (as used in the site URLs) to the number of
# listing pages to fetch for that district in a single run.
district_page = {
    slug: 10
    for slug in (
        # Numbered urban districts: quan-1 ... quan-12
        *(f'quan-{number}' for number in range(1, 13)),
        # Named urban districts
        'quan-binh-tan',
        'quan-binh-thanh',
        'quan-go-vap',
        'quan-phu-nhuan',
        'quan-tan-binh',
        'quan-tan-phu',
        'quan-thu-duc',
        # Suburban districts
        'huyen-hoc-mon',
        'huyen-binh-chanh',
        'huyen-nha-be',
    )
}

In [4]:
def get_id_house(district,total_page=10,scale=0,timeout=30):
    """
    Return the listing ids found on a window of result pages of the selected district.

    Parameters:
    district: str
        Districts in HCM City such as "quan-1", "quan-2", "quan-tan-binh"...
    total_page: int
        Number of pages to get data
    scale: int
        Determine the beginning page to get data corresponds to total_page by the following formula:
            begin_page = total_page*scale+1
    timeout: float
        Seconds to wait for each HTTP request before giving up on that page

    Returns:
    list of int
        The ``data-id`` values of the listing cards, in page order. Pages whose
        request fails and cards without a numeric ``data-id`` are skipped.
    """
    house_ids = []
    begin_page = total_page*scale+1
    end_page = begin_page+total_page

    for page in range(begin_page,end_page):
        url = 'https://propzy.vn/mua/bat-dong-san/hcm/'+ str(district) + '/p' + str(page)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Best effort: keep the ids already collected and move on to the next page.
            print('Failed to get id page',page,':',exc)
            continue
        soup = BeautifulSoup(response.content, 'html5lib')
        # NOTE(review): a space-separated class_ string only matches tags whose class
        # attribute is exactly this string - confirm the site markup is stable.
        cards = soup.find_all('div', class_='item-listing listing-card view-as-list item-compare')
        for card in cards:
            try:
                house_ids.append(int(card.get('data-id')))
            except (TypeError, ValueError):
                # Card without a usable numeric id: nothing to look up later.
                continue
        print('Got id page',page)
    return house_ids

In [5]:
def _clean_strings(tag):
    """Return the stripped text fragments under ``tag``, dropping whitespace-only ones."""
    return [text.strip() for text in tag.strings if text.strip()]


def get_info_house(district,total_page=10,scale=0):
    """
    Return the house attribute and price of total page in the selected district

    Parameters:
    district: str
        Districts in HCM City such as "quan-1", "quan-2", "quan-tan-binh"...
    total_page: int
        Number of pages to get data
    scale: int
        Determine the beginning page to get data corresponds to total_page by the following formula:
            begin_page = total_page*scale+1

    Returns:
    list of dict
        One dict per successfully parsed listing, holding the district, the page
        title, the label/value pairs of the main attribute tab, the joined text of
        the extra tab and the two price strings. Listings whose page cannot be
        fetched or does not have the expected layout are skipped (a message is
        printed); an empty list is returned when no ids are found.
    """
    full_attribute_list = []
    cumsum = 0
    for house_id in get_id_house(district,total_page,scale):
        url = 'https://propzy.vn/mua/nha/hcm/' + str(district) + '/id' + str(house_id)
        #Get name_district
        attribute = {'Địa điểm': district}
        try:
            # Fetching happens inside the try so one failed request only drops this
            # listing instead of discarding everything collected so far.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # Explicit parser, same as get_id_house, so the parse tree does not
            # depend on which parsers happen to be installed.
            attribute_soup = BeautifulSoup(response.content, 'html5lib')
            #Category type of house
            attribute['Phân Loại'] = attribute_soup.find_all('h1',class_='h3-title')[0].text
            tabs = attribute_soup.find_all('div', class_='tab-content entry-content')
            #Get main_attribute: text fragments alternate label, value, label, value...
            main_strings = _clean_strings(tabs[1])
            if len(main_strings) % 2 != 0:
                raise ValueError('unpaired label/value in main attribute tab')
            attribute.update(zip(main_strings[0::2], main_strings[1::2]))
            #Get extra_attribute
            attribute['Khác'] = '. '.join(_clean_strings(tabs[2]))
            #Get price_house
            price_list = _clean_strings(attribute_soup.find_all('div',class_='p-price-n')[0])
            attribute['Giá'] = price_list[0]
            attribute['Giá/m2'] = price_list[1]
        except Exception as exc:
            # Best effort per listing, but only for ordinary errors: KeyboardInterrupt
            # still propagates so a long scrape can be stopped from the notebook.
            print('Skipped id',house_id,'at',district,':',repr(exc))
            continue
        #Get full attribute and price of district
        full_attribute_list.append(attribute)
        cumsum += 1
        print('Got id',house_id,'home at',district,'. Total:',cumsum)

    return full_attribute_list

In [6]:
def get_info_full_district(district_page,scale=0,output_dir='data_chunk'):
    """
    Scrape the house attribute and price of total page in multiple selected district
    and save them as one CSV chunk.

    Parameters:
    district_page: dict
        A dictionary containing the key is district and the value is total page to get data
    scale: int
        Determine the beginning page to get data corresponds to total_page by the following formula:
            begin_page = total_page*scale+1
    output_dir: str
        Folder that receives the chunk file ``part{scale+1}.csv``; created if missing

    Returns:
    str
        The literal string 'Done' once the CSV has been written.
    """
    # Create the folder before scraping: otherwise a missing directory only
    # surfaces in to_csv at the very end and every scraped row is lost.
    os.makedirs(output_dir, exist_ok=True)

    final = []
    for district, total_page in district_page.items():
        final.extend(get_info_house(district,total_page,scale))
        print('Done:',district)
        # Random pause between districts to avoid hammering the site.
        sleep(randint(2,5))

    df = pd.DataFrame(final)
    df.to_csv(os.path.join(output_dir, f'part{scale+1}.csv'),index=False)

    return 'Done'

In [33]:
# get_info_full_district(district_page,scale=0)

Got id page 1
Got id page 2
Got id page 3
Got id page 4
Got id page 5
Got id page 6
Got id page 7
Got id page 8
Got id page 9
Got id page 10
Got id 263171 home at quan-1 . Total: 1
Got id 406622 home at quan-1 . Total: 2
Got id 366592 home at quan-1 . Total: 3
Got id 406359 home at quan-1 . Total: 4
Got id 287050 home at quan-1 . Total: 5
Got id 300134 home at quan-1 . Total: 6
Got id 405110 home at quan-1 . Total: 7
Got id 357423 home at quan-1 . Total: 8
Got id 327578 home at quan-1 . Total: 9
Got id 311864 home at quan-1 . Total: 10
Got id 236708 home at quan-1 . Total: 11
Got id 406476 home at quan-1 . Total: 12
Got id 123198 home at quan-1 . Total: 13
Got id 406344 home at quan-1 . Total: 14
Got id 406245 home at quan-1 . Total: 15
Got id 406220 home at quan-1 . Total: 16
Got id 127977 home at quan-1 . Total: 17
Got id 250505 home at quan-1 . Total: 18
Got id 179860 home at quan-1 . Total: 19
Got id 349795 home at quan-1 . Total: 20
Got id 318344 home at quan-1 . Total: 21
Got id 

'Done'

In [6]:
# get_info_full_district(district_page,scale=1)

Got id page 11
Got id page 12
Got id page 13
Got id page 14
Got id page 15
Got id page 16
Got id page 17
Got id page 18
Got id page 19
Got id page 20
Got id 50371 home at quan-1 . Total: 1
Got id 393307 home at quan-1 . Total: 2
Got id 108881 home at quan-1 . Total: 3
Got id 152785 home at quan-1 . Total: 4
Got id 246297 home at quan-1 . Total: 5
Got id 91157 home at quan-1 . Total: 6
Got id 293066 home at quan-1 . Total: 7
Got id 371635 home at quan-1 . Total: 8
Got id 338410 home at quan-1 . Total: 9
Got id 338264 home at quan-1 . Total: 10
Got id 392700 home at quan-1 . Total: 11
Got id 171418 home at quan-1 . Total: 12
Got id 185211 home at quan-1 . Total: 13
Got id 288357 home at quan-1 . Total: 14
Got id 327559 home at quan-1 . Total: 15
Got id 324235 home at quan-1 . Total: 16
Got id 323292 home at quan-1 . Total: 17
Got id 303760 home at quan-1 . Total: 18
Got id 250556 home at quan-1 . Total: 19
Got id 298104 home at quan-1 . Total: 20
Got id 264255 home at quan-1 . Total: 21


'Done'

In [7]:
# get_info_full_district(district_page,scale=2)

Got id page 21
Got id page 22
Got id page 23
Got id page 24
Got id page 25
Got id page 26
Got id page 27
Got id page 28
Got id page 29
Got id page 30
Got id 174682 home at quan-1 . Total: 1
Got id 289061 home at quan-1 . Total: 2
Got id 287659 home at quan-1 . Total: 3
Got id 287634 home at quan-1 . Total: 4
Got id 287595 home at quan-1 . Total: 5
Got id 137488 home at quan-1 . Total: 6
Got id 266234 home at quan-1 . Total: 7
Got id 265934 home at quan-1 . Total: 8
Got id 74254 home at quan-1 . Total: 9
Got id 68137 home at quan-1 . Total: 10
Got id 45142 home at quan-1 . Total: 11
Got id 265186 home at quan-1 . Total: 12
Got id 249940 home at quan-1 . Total: 13
Got id 127090 home at quan-1 . Total: 14
Got id 110114 home at quan-1 . Total: 15
Got id 110111 home at quan-1 . Total: 16
Got id 140017 home at quan-1 . Total: 17
Got id 106375 home at quan-1 . Total: 18
Got id 154395 home at quan-1 . Total: 19
Got id 164381 home at quan-1 . Total: 20
Got id 158060 home at quan-1 . Total: 21
G

'Done'

In [7]:
# get_info_full_district(district_page,scale=3)

Got id page 31
Got id page 32
Got id page 33
Got id page 34
Got id page 35
Got id page 36
Got id page 37
Got id page 38
Got id page 39
Got id page 40
Got id 330020 home at quan-1 . Total: 1
Got id 55570 home at quan-1 . Total: 2
Got id 297952 home at quan-1 . Total: 3
Got id 90782 home at quan-1 . Total: 4
Got id 244402 home at quan-1 . Total: 5
Got id 94873 home at quan-1 . Total: 6
Got id 282801 home at quan-1 . Total: 7
Got id 332008 home at quan-1 . Total: 8
Got id 124924 home at quan-1 . Total: 9
Got id 86848 home at quan-1 . Total: 10
Got id 327567 home at quan-1 . Total: 11
Got id 89815 home at quan-1 . Total: 12
Got id 63907 home at quan-1 . Total: 13
Got id 87145 home at quan-1 . Total: 14
Got id 309654 home at quan-1 . Total: 15
Got id 71011 home at quan-1 . Total: 16
Got id 339239 home at quan-1 . Total: 17
Got id 194785 home at quan-1 . Total: 18
Got id 111257 home at quan-1 . Total: 19
Got id 177782 home at quan-1 . Total: 20
Got id 68413 home at quan-1 . Total: 21
Got id 

'Done'

In [None]:
# get_info_full_district(district_page,scale=4)
# get_info_full_district(district_page,scale=5)
# get_info_full_district(district_page,scale=6)
# get_info_full_district(district_page,scale=7)
# get_info_full_district(district_page,scale=8)
# get_info_full_district(district_page,scale=9)
# get_info_full_district(district_page,scale=10)

In [55]:
# #Concatenate to one data frame
# full_part = []
# for i in range(1,11):
#     try:
#         full_part.append(pd.read_csv(f'data_chunk/part{i}.csv'))
#     except:
#         pass
    
# pd.concat(full_part).to_csv('data/full_extra.csv',index=False)