In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

In [2]:
def get_property_data(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        text = response.text
        soup = BeautifulSoup(text, 'lxml')

        price_list = soup.find_all("span", class_='f343d9ce')
        add_small_list = soup.find_all('div', class_='_7afabd84')
        type_list = soup.find_all('div', class_='_9a4e3964')
        desc_list = soup.find_all('h2', class_='_7f17f34f')
        beds_list = soup.find_all('span', class_='b6a29bc0') # 72 instances
        href_list = soup.find_all('a', class_='_287661cb')

        prices = []
        for price_s in price_list:
            price = float(str(price_s).split(' ')[2].replace('class="f343d9ce">', '').replace('</span>', '').replace(',', ''))
            prices.append(price)

        blocks = []
        areas = []
        districts = []

        for loc in add_small_list:
            block, area, district = str(loc).split('>')[1].replace('</div', '').strip().split(',')
            block = block.strip()
            area = area.strip()
            district = district.strip()

            blocks.append(block)
            areas.append(area)
            districts.append(district)

        types = []

        for type in type_list:
            type = str(type).split('>')[1].replace('</div', '').strip()

            types.append(type)

        descriptions = []

        for desc in desc_list:
            desc = str(desc).split('>')[1].replace('</h2', '').strip()

            descriptions.append(desc)

        hrefs = []

        for href in href_list:
            link = str(href).split('"')[5]
            hrefs.append(link)

        i = 0
        beds = []
        while i < len(beds_list):
            bed = str(beds_list[i]).replace('<span aria-label="Beds" class="b6a29bc0">', '').replace('</span>', '')
            beds.append(bed)

            i += 3

        i = 1
        baths = []
        while i < len(beds_list):
            bath = str(beds_list[i]).replace('<span aria-label="Baths" class="b6a29bc0">', '').replace('</span>', '')
            baths.append(bath)

            i += 3

        i = 2
        dims = []
        while i < len(beds_list):
            dim = str(beds_list[i]).replace('<span aria-label="Area" class="b6a29bc0">', '').replace('</span>', '').replace('<span>', '')
            dims.append(dim)

            i += 3


        df = pd.DataFrame(list(zip(dims, beds, baths, types, districts, areas, 
                                   blocks, descriptions, hrefs, prices)), 
                          columns=['dimensions', 'num_beds', 'num_baths', 'house_type', 'city', 
                                   'location', 'area_block', 'description', 'url', 'price'])

        return df
    except:
        print("Error: ", url)

In [3]:
dfs = []

for i in tqdm(range(15)):
    if i == 0:
        continue
    if i == 1:
        url = 'https://www.bproperty.com/en/dhaka/apartments-for-rent-in-gulshan/?occupancy_status=vacant'
    else:
        url = f'https://www.bproperty.com/en/dhaka/apartments-for-rent-in-gulshan/page-{i}/?occupancy_status=vacant'
    
    dfs.append(get_property_data(url))

  0%|          | 0/15 [00:00<?, ?it/s]

In [4]:
df = pd.concat(dfs)

In [5]:
df.head()

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,description,url,price
0,"2,500 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,With An Awesome Environment A 2500 Sq Ft Flat ...,/en/property/details-3806614.html,100000.0
1,"3,500 sqft",4,4,Apartment,Dhaka,Gulshan,Gulshan 1,Establish Your Peace In This Nice 3500 Sq Ft A...,/en/property/details-3859227.html,110000.0
2,"2,000 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,Fairly Large Apartment Of 2000 Sq Ft Is Ready ...,/en/property/details-3431811.html,70000.0
3,"2,700 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 1,Experience The Ultimate Luxury Lifestyle Here ...,/en/property/details-3428883.html,75000.0
4,"3,000 sqft",4,4,Apartment,Dhaka,Gulshan,Gulshan 1,3000 Sq Ft An Elegant Apartment Is Up For Rent...,/en/property/details-5234200.html,100000.0


In [6]:
df.tail()

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,description,url,price
5,"3,000 sqft",4,4,Apartment,Dhaka,Gulshan,Gulshan 2,"Choose your destination, 3000 SQ FT full furni...",/en/property/details-1707968.html,110000.0
6,"2,500 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,Take a look to a residential apartment built i...,/en/property/details-1705462.html,100000.0
7,"2,300 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,A Worthy 2300 Sq Ft Residential Apartment Is R...,/en/property/details-1700897.html,110000.0
8,"3,000 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,You can move into this well planned and comfor...,/en/property/details-1697630.html,140000.0
9,"2,100 sqft",3,3,Apartment,Dhaka,Gulshan,Gulshan 2,Find Your Home In A 2100 Sq Ft Apartment For R...,/en/property/details-1694883.html,70000.0


In [13]:
df['area_block'].value_counts()

Gulshan 2    146
Gulshan 1     80
Name: area_block, dtype: int64

In [7]:
df.shape

(226, 10)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 0 to 9
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   dimensions   226 non-null    object 
 1   num_beds     226 non-null    object 
 2   num_baths    226 non-null    object 
 3   house_type   226 non-null    object 
 4   city         226 non-null    object 
 5   location     226 non-null    object 
 6   area_block   226 non-null    object 
 7   description  226 non-null    object 
 8   url          226 non-null    object 
 9   price        226 non-null    float64
dtypes: float64(1), object(9)
memory usage: 19.4+ KB


In [14]:
df.to_csv('bproperty_gulshan.csv', index=False)