In [33]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [34]:
# Listing CSVs to combine, one file per area.
files = [
    'bproperty_bashundhara_RA.csv',
    'bproperty_dhanmondi.csv',
    'bproperty_gulshan.csv',
    'bproperty_mirpur.csv',
    'bproperty_uttara.csv',
]

In [35]:
# Read every per-area CSV, then stack them into one frame with a fresh
# 0..n-1 index.
dfs = [pd.read_csv(file) for file in files]
df = pd.concat(dfs, ignore_index=True)

In [39]:
def get_unique_id(url):
    """Extract the listing id from a property-details URL.

    Works for both the site-relative form
    (``/en/property/details-<id>.html``) and an already-absolute URL, so the
    cell can be re-run after ``url`` has been made absolute without the
    domain leaking into the id.

    Parameters
    ----------
    url : object
        The listing URL; it is converted with ``str()`` first.

    Returns
    -------
    str
        The text between ``/en/property/details-`` and ``.html``. If the
        details marker is absent, the input is returned with any ``.html``
        removed.
    """
    # Keep only what follows the details marker; anything in front of it
    # (e.g. the scheme and host of an absolute URL) is not part of the id.
    tail = str(url).rsplit('/en/property/details-', 1)[-1]
    return tail.replace('.html', '')

def fix_url(url):
    """Turn a site-relative listing path into an absolute bproperty.com URL.

    The function is idempotent: a value that already starts with
    ``http://`` or ``https://`` is returned unchanged, so re-running the
    cell that applies it does not prepend the domain a second time.

    Parameters
    ----------
    url : object
        Relative path (e.g. ``/en/property/details-123.html``) or an
        absolute URL; it is converted with ``str()`` first.

    Returns
    -------
    str
        The absolute URL.
    """
    path = str(url)
    if path.startswith(('http://', 'https://')):
        return path
    return 'https://www.bproperty.com' + path

# Both new columns are computed from the `url` column as it is right now:
# `unique_id` is appended, `url` is overwritten with its absolute form.
df = df.assign(
    unique_id=df['url'].map(get_unique_id),
    url=df['url'].map(fix_url),
)

In [40]:
# Preview the first rows of the combined listings frame.
df.head()

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,description,url,price,unique_id
0,"2,055 sqft",4,4,Apartment,Dhaka,Bashundhara R-A,Block D,2055 Sq Ft Cozy Flat For Rent In Bashundhara R...,https://www.bproperty.com/en/property/details-...,35000.0,5237946
1,"1,911 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block A,Emanate Your Knack For Gardening By Renting Th...,https://www.bproperty.com/en/property/details-...,45000.0,3875341
2,"1,200 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block K,"To Secure Your Better State Of Living, Conside...",https://www.bproperty.com/en/property/details-...,20000.0,5182996
3,"1,910 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block A,For Rental Purpose 1910 Sq Ft Commendable Desi...,https://www.bproperty.com/en/property/details-...,45000.0,3875358
4,800 sqft,2,2,Apartment,Dhaka,Bashundhara R-A,Block G,Bright And Cozy Apartment Featuring 800 Sq Ft ...,https://www.bproperty.com/en/property/details-...,18000.0,3651253


In [28]:
# (rows, columns) of the combined listings frame.
df.shape

(13415, 11)

In [7]:
# Number of listings per location.
df['location'].value_counts()

Mirpur             7935
Uttara             2731
Bashundhara R-A    1542
Dhanmondi           981
Gulshan             226
Name: location, dtype: int64

In [78]:
# CSS class of the <span> elements read from each listing page, one per
# amenity. It looks like a generated class name, so it may change when the
# site is redeployed -- TODO confirm it still matches before a long run.
FEATURE_SPAN_CLASS = '_005a682a'

# Seconds to wait for a listing page before giving up on it.
REQUEST_TIMEOUT_S = 30


def parse_feature(text):
    """Split one amenity label into a ``(key, value)`` pair.

    ``'Parking Spaces: 1'`` becomes ``('Parking Spaces', '1')``; a label with
    no colon, such as ``'View'``, becomes ``('View', 'Yes')``. Only the first
    colon separates key from value, so a value that itself contains a colon
    is kept intact.
    """
    key, separator, value = text.partition(':')
    if not separator:
        return text.strip(), 'Yes'
    return key.strip(), value.strip()


# unique_id -> {amenity name: value} for every listing in `df`.
secondary_scrape = {}
# Ids whose page could not be fetched. They still get an (empty) entry in
# `secondary_scrape` so the row count matches `df`, but should be retried or
# dropped rather than read as "has no amenities".
failed_ids = []

for unique_id, url in tqdm(zip(df['unique_id'], df['url']), total=len(df)):
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
    except requests.RequestException:
        # One bad page must not abort a run of thousands of requests.
        failed_ids.append(unique_id)
        secondary_scrape[unique_id] = {}
        continue

    soup = BeautifulSoup(response.text, 'lxml')
    spans = soup.find_all('span', class_=FEATURE_SPAN_CLASS)

    # get_text() yields only the text nodes, so tag markup and HTML comments
    # do not need to be stripped by hand.
    secondary_scrape[unique_id] = dict(
        parse_feature(span.get_text()) for span in spans
    )

if failed_ids:
    print(f'{len(failed_ids)} listing page(s) could not be fetched; see failed_ids')

  0%|          | 0/13415 [00:00<?, ?it/s]

In [80]:
# Dict of dicts -> frame: outer keys (listing ids) become columns, inner
# keys (feature names) become the row index.
scraped = pd.DataFrame(data=secondary_scrape)

In [82]:
# (rows, columns) of the scraped frame as built above.
scraped.shape

(37, 13415)

In [84]:
# One row per listing, one column per feature. Rebuilt from the raw scrape
# rather than transposing `scraped` in place, so re-running this cell cannot
# flip the table back to features-as-rows.
scraped = pd.DataFrame(secondary_scrape).T

In [85]:
# Preview the first rows of the transposed frame (index = listing id).
scraped.head()

Unnamed: 0,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,Waste Disposal,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
5237946,Yes,Yes,Yes,Yes,1.0,1,Yes,Yes,Yes,Yes,...,,,,,,,,,,
3875341,Yes,Yes,Yes,Yes,1.0,3,,Yes,Yes,Yes,...,,,,,,,,,,
5182996,Yes,Yes,Yes,Yes,,1,Yes,,,,...,,,,,,,,,,
3875358,,Yes,Yes,Yes,1.0,3,,Yes,Yes,Yes,...,,,,,,,,,,
3651253,Yes,Yes,Yes,Yes,,1,Yes,Yes,,,...,,,,,,,,,,


In [86]:
# The listing id lives in the index at this point, so it has to be written
# out; with index=False the saved rows could not be matched back to their
# listings. The id is stored as a leading `unique_id` column.
scraped.to_csv('secondary.csv', index_label='unique_id')

In [87]:
# List every feature name that was scraped.
scraped.columns

Index(['View', 'Floor Level', 'Balcony or Terrace', 'Flooring',
       'Parking Spaces', 'Elevators in Building', '24 Hours Concierge',
       'Double Glazed Windows', 'Freehold', 'Waste Disposal',
       'Lobby in Building', 'Electricity Backup', 'Intercom', 'CCTV Security',
       'Maintenance Staff', 'Swimming Pool', 'Storage Areas', 'Barbeque Area',
       'Cleaning Services', 'Service Elevators', 'Cafeteria or Canteen',
       'Broadband Internet', 'Central Heating', 'Lawn or Garden', 'Furnished',
       'Prayer Room', 'Conference Room', 'Business Center', 'Laundry Facility',
       'Jacuzzi', 'Steam Room', 'Day Care Center', 'First Aid Medical Center',
       'Shared Kitchen', 'ATM Facility', 'Facilities for Disabled', 'Sauna'],
      dtype='object')

In [88]:
# A feature that was not listed on a page is recorded as 0.
scraped = scraped.fillna(value=0)

In [89]:
# Inspect the frame after filling missing features with 0.
scraped

Unnamed: 0,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,Waste Disposal,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
5237946,Yes,Yes,Yes,Yes,1,1,Yes,Yes,Yes,Yes,...,0,0,0,0,0,0,0,0,0,0
3875341,Yes,Yes,Yes,Yes,1,3,0,Yes,Yes,Yes,...,0,0,0,0,0,0,0,0,0,0
5182996,Yes,Yes,Yes,Yes,0,1,Yes,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3875358,0,Yes,Yes,Yes,1,3,0,Yes,Yes,Yes,...,0,0,0,0,0,0,0,0,0,0
3651253,Yes,Yes,Yes,Yes,0,1,Yes,Yes,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701632,Yes,0,0,Yes,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1701287,Yes,0,0,Yes,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1667106,0,0,Yes,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1665314,Yes,0,Yes,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Encode the 'Yes' flag as 1. The columns are object dtype at this point;
# infer_objects() then converts the columns that now hold only integers to
# int64 explicitly, instead of relying on replace() to downcast silently
# (that implicit downcast is deprecated in recent pandas). Columns that
# still contain strings are left as object.
scraped = scraped.replace('Yes', 1).infer_objects()

In [92]:
# Inspect the frame after encoding 'Yes' as 1.
scraped

Unnamed: 0,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,Waste Disposal,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
5237946,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3875341,1,1,1,1,1,3,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5182996,1,1,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3875358,0,1,1,1,1,3,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3651253,1,1,1,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701632,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1701287,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1667106,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1665314,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Distribution of the scraped parking-space values (used to spot outliers).
scraped['Parking Spaces'].value_counts()

0     9246
1     4062
2       67
8       10
6        7
10       6
4        5
3        4
5        3
7        2
12       1
90       1
18       1
Name: Parking Spaces, dtype: int64

In [97]:
# Look up the outlier row. The comparison is against the string '90' because
# the column has not been cast yet; the astype('int64') cell further down
# does that, after which this filter would need the integer 90 instead.
scraped[scraped['Parking Spaces'] == '90']

Unnamed: 0,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,Waste Disposal,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
5351836,1,1,1,1,90,3,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Distribution of the scraped elevator counts (used to spot outliers).
scraped['Elevators in Building'].value_counts()

0     6695
1     6043
2      544
3       83
6       33
4       11
8        3
10       2
11       1
Name: Elevators in Building, dtype: int64

In [99]:
# Look up the outlier row. The comparison is against the string '11' because
# the column has not been cast yet; the astype('int64') cell below does that.
scraped[scraped['Elevators in Building'] == '11']

Unnamed: 0,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,Waste Disposal,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
1777430,1,0,1,1,1,11,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Cast the two count-valued features to integers in a single call.
scraped = scraped.astype({
    'Elevators in Building': 'int64',
    'Parking Spaces': 'int64',
})

In [110]:
# Move the listing id out of the index into a leading `unique_id` column so
# it can be used as the merge key. The guard makes the cell safe to re-run:
# without it a second run would insert the fresh 0..n-1 index as another
# column and rename that to `unique_id` as well, giving a duplicate name.
if 'unique_id' not in scraped.columns:
    scraped = scraped.rename_axis('unique_id').reset_index()

scraped

Unnamed: 0,unique_id,View,Floor Level,Balcony or Terrace,Flooring,Parking Spaces,Elevators in Building,24 Hours Concierge,Double Glazed Windows,Freehold,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
0,5237946,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,3875341,1,1,1,1,1,3,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5182996,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3875358,0,1,1,1,1,3,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,3651253,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13410,1701632,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13411,1701287,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13412,1667106,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13413,1665314,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
# Summarise dtypes and non-null counts of the cleaned feature frame.
scraped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13415 entries, 5237946 to 1664303
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   View                      13415 non-null  int64
 1   Floor Level               13415 non-null  int64
 2   Balcony or Terrace        13415 non-null  int64
 3   Flooring                  13415 non-null  int64
 4   Parking Spaces            13415 non-null  int64
 5   Elevators in Building     13415 non-null  int64
 6   24 Hours Concierge        13415 non-null  int64
 7   Double Glazed Windows     13415 non-null  int64
 8   Freehold                  13415 non-null  int64
 9   Waste Disposal            13415 non-null  int64
 10  Lobby in Building         13415 non-null  int64
 11  Electricity Backup        13415 non-null  int64
 12  Intercom                  13415 non-null  int64
 13  CCTV Security             13415 non-null  int64
 14  Maintenance Staff         13415 non

In [103]:
# Save the cleaned features. index=False omits the index, so the listing id
# is only written if it is already a regular column -- i.e. the reset_index
# cell above must have been run before this one.
scraped.to_csv('secondary_formatted.csv', index=False)

In [105]:
def fix_dimensions(dim):
    """Convert an area label such as ``'2,055 sqft'`` to a float (``2055.0``).

    The value is stringified first, then the `` sqft`` unit and every comma
    are removed before parsing.
    """
    text = str(dim)
    for unwanted in (' sqft', ','):
        text = text.replace(unwanted, '')
    return float(text)

# Area labels -> floats, then bed/bath counts -> int64 in one cast.
df['dimensions'] = df['dimensions'].map(fix_dimensions)
df = df.astype({'num_beds': 'int64', 'num_baths': 'int64'})

In [106]:
# Inspect the listings frame after the numeric conversions.
df

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,description,url,price,unique_id
0,2055.0,4,4,Apartment,Dhaka,Bashundhara R-A,Block D,2055 Sq Ft Cozy Flat For Rent In Bashundhara R...,https://www.bproperty.com/en/property/details-...,35000.0,5237946
1,1911.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,Emanate Your Knack For Gardening By Renting Th...,https://www.bproperty.com/en/property/details-...,45000.0,3875341
2,1200.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block K,"To Secure Your Better State Of Living, Conside...",https://www.bproperty.com/en/property/details-...,20000.0,5182996
3,1910.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,For Rental Purpose 1910 Sq Ft Commendable Desi...,https://www.bproperty.com/en/property/details-...,45000.0,3875358
4,800.0,2,2,Apartment,Dhaka,Bashundhara R-A,Block G,Bright And Cozy Apartment Featuring 800 Sq Ft ...,https://www.bproperty.com/en/property/details-...,18000.0,3651253
...,...,...,...,...,...,...,...,...,...,...,...
13410,2500.0,3,4,Apartment,Dhaka,Uttara,Sector 4,For You 2500 Sq Ft Flat Is Now For Rent Near T...,https://www.bproperty.com/en/property/details-...,50000.0,1701632
13411,1600.0,3,3,Apartment,Dhaka,Uttara,Sector 6,A noteworthy Residence up for rent in Uttara a...,https://www.bproperty.com/en/property/details-...,25000.0,1701287
13412,900.0,2,2,Apartment,Dhaka,Uttara,Sector 14,A 900 SQ FT vacant apartment is ready to be re...,https://www.bproperty.com/en/property/details-...,16000.0,1667106
13413,1200.0,3,3,Apartment,Dhaka,Uttara,Sector 5,Residential Flat Of 1200 Sq Ft Is Available Fo...,https://www.bproperty.com/en/property/details-...,25000.0,1665314


In [107]:
# Remove the free-text description and the listing URL columns.
df = df.drop(columns=['description', 'url'])

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,price,unique_id
0,2055.0,4,4,Apartment,Dhaka,Bashundhara R-A,Block D,35000.0,5237946
1,1911.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,45000.0,3875341
2,1200.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block K,20000.0,5182996
3,1910.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,45000.0,3875358
4,800.0,2,2,Apartment,Dhaka,Bashundhara R-A,Block G,18000.0,3651253
...,...,...,...,...,...,...,...,...,...
13410,2500.0,3,4,Apartment,Dhaka,Uttara,Sector 4,50000.0,1701632
13411,1600.0,3,3,Apartment,Dhaka,Uttara,Sector 6,25000.0,1701287
13412,900.0,2,2,Apartment,Dhaka,Uttara,Sector 14,16000.0,1667106
13413,1200.0,3,3,Apartment,Dhaka,Uttara,Sector 5,25000.0,1665314


In [112]:
# Attach the scraped feature columns to each listing via the shared id.
merged = df.merge(scraped, on='unique_id')

In [114]:
# Preview the first ten rows of the merged dataset.
merged.head(10)

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,price,unique_id,View,...,Business Center,Laundry Facility,Jacuzzi,Steam Room,Day Care Center,First Aid Medical Center,Shared Kitchen,ATM Facility,Facilities for Disabled,Sauna
0,2055.0,4,4,Apartment,Dhaka,Bashundhara R-A,Block D,35000.0,5237946,1,...,0,0,0,0,0,0,0,0,0,0
1,1911.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,45000.0,3875341,1,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block K,20000.0,5182996,1,...,0,0,0,0,0,0,0,0,0,0
3,1910.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block A,45000.0,3875358,0,...,0,0,0,0,0,0,0,0,0,0
4,800.0,2,2,Apartment,Dhaka,Bashundhara R-A,Block G,18000.0,3651253,1,...,0,0,0,0,0,0,0,0,0,0
5,1200.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block K,20000.0,5182994,1,...,0,0,0,0,0,0,0,0,0,0
6,1250.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block J,25000.0,5001346,1,...,0,0,0,0,0,0,0,0,0,0
7,1250.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block J,25000.0,5001350,1,...,0,0,0,0,0,0,0,0,0,0
8,1250.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block J,25000.0,5001351,1,...,0,0,0,0,0,0,0,0,0,0
9,1250.0,3,3,Apartment,Dhaka,Bashundhara R-A,Block J,25000.0,5001352,1,...,0,0,0,0,0,0,0,0,0,0


In [115]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing the merged dataset.
merged_path = Path('dataset/merged.csv')
merged_path.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(merged_path, index=False)