In [15]:
import requests
from pprint import pprint
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import random

# Setup Crawler

In [19]:
# Browser-like request headers sent with every request made by the crawler.
# The long values are split across adjacent string literals purely for
# readability; Python joins them back into a single string.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.97 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Dnt": "1",
    "Upgrade-Insecure-Requests": "1",
}

In [20]:
# Sanity check: request httpbin's /headers endpoint with our custom headers and
# pretty-print the JSON it returns, so the headers actually sent can be inspected.
r = requests.get(url='http://httpbin.org/headers', headers=headers)
echoed_headers = r.json()
pprint(echoed_headers)

{'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
             'Dnt': '1',
             'Host': 'httpbin.org',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/83.0.4103.97 Safari/537.36',
             'X-Amzn-Trace-Id': 'Root=1-5ef33f1a-e296cdc0c0c1a9b84a4a6d38'}}


# Setup attributes to be parsed

In [58]:
# Column-oriented store for the crawled ads: one list per attribute, and every
# list receives exactly one value per ad so the columns stay aligned.
ads_columns = (
    'list_title',
    'url',
    'price',
    'area',
    'category',
    'prop_type',
    'prop_title1',
    'prop_title2',
    'bedrooms',
    'bathroom',
    'facilities',
    'size',
)
# The comprehension creates a separate empty list for each column.
ads_dict = {column: [] for column in ads_columns}

# Crawl from Mudah

In [59]:
# Crawl mudah properties page for 10 pages:
# For every ad on each result page, read the summary fields from the result
# page, open the ad's own page for the detail fields, and append one value per
# column to `ads_dict`.

# Label shown in an ad's detail panel -> `ads_dict` column that stores its value.
detail_columns = {
    'Property Type': 'prop_type',
    'Title type': 'prop_title1',
    'Other Info': 'prop_title2',
    'Bedrooms': 'bedrooms',
    'Bathroom': 'bathroom',
    'Facilities': 'facilities',
    'Size': 'size',
}

count=0
for pg in range(1,11):
    main_url = f'https://www.mudah.my/Kuala-Lumpur/Properties-for-sale-2000?o={pg}&q=&th=1'
    main_page = requests.get(main_url, headers=headers)
    print(f'Crawling listing from page-{pg}')
    main_soup = BeautifulSoup(main_page.content, 'html.parser')

    # Get all posted properties links
    listings = main_soup.find_all(class_='listing_params_container')

    for item in listings:
        count+=1
        # sleep randomly between 1 to 3 seconds to avoid being banned from site
        time.sleep(random.randint(1,3))
        title = item.find(class_='list_title').text
        listing_url = item.find('a', href=True)['href']
        print(f'Ad #{count}: {listing_url}')
        # The price is rendered under one of two class names; use whichever exists.
        price_tag = item.find(class_='ads_price') or item.find(class_='ads_price_highlight')
        ads_price = price_tag.text
        area = item.find(class_='area').text
        category = item.find(title='Category').text

        page = requests.get(listing_url, headers=headers)
        page_soup = BeautifulSoup(page.content, 'html.parser')
        # An ad page without a `params` panel yields no details instead of
        # aborting the whole crawl with an AttributeError.
        params = page_soup.find(class_='params')
        details = params.find_all('div') if params is not None else []

        # Start every ad with all detail fields missing, so a field that is
        # absent on this ad can never inherit the previous ad's value.
        # (`np.nan` rather than `np.NaN`: the latter was removed in NumPy 2.0.)
        detail_values = dict.fromkeys(detail_columns.values(), np.nan)

        for detail in details:
            label, value = detail.dt, detail.dd
            # Skip wrapper divs that are not a label/value pair.
            if label is None or value is None:
                continue
            column = detail_columns.get(label.text)
            if column is not None:
                detail_values[column] = value.text

        ads_dict['list_title'].append(title)
        ads_dict['url'].append(listing_url)
        ads_dict['price'].append(ads_price)
        ads_dict['area'].append(area)
        ads_dict['category'].append(category)
        # Exactly one value per detail column, keeping all lists the same length.
        for column, value in detail_values.items():
            ads_dict[column].append(value)
print('DONE!!')

Crawling listing from page-1
Ad #1: https://www.mudah.my/New+Luxury+Freehold+Residence+4min+Walk+to+Mid+Valley-83376113.htm
Ad #2: https://www.mudah.my/Sri+Putramas+1+1100sqft+Jalan+Kuching+Below+Market+Good+Condition-83716116.htm
Ad #3: https://www.mudah.my/0+DOWNPAYMENT+Arena+Green+750SF+Bukit+Jalil+FREEHOLD+RENOVATED+-83716108.htm
Ad #4: https://www.mudah.my/+Duplex+Penthouse+Silk+Residence+Duplex+Double+Storey+Penthouse+Sale-83716267.htm
Ad #5: https://www.mudah.my/BELOW+MARKET+Menara+D+Sara+Condo+Sri+Damansara+FULLY+F+RENO+100+LOAN-83716046.htm
Ad #6: https://www.mudah.my/Tmn+Cheras+Ria+Apt+Near+Ampang+Pandan+Indah+full+Loan+-83147394.htm
Ad #7: https://www.mudah.my/Pandan+Indah+Ampang+Dahlia+Court+Tingkat+2+Promo+Bawah+Harga+Pasaran-80603261.htm
Ad #8: https://www.mudah.my/Danau+Impian+Condo+Taman+Desa+918sqft+Below+Market+Good+Condition-83716077.htm
Ad #9: https://www.mudah.my/+Freehold+Sri+Putramas+2+Condo+Jalan+Kuching+Dutamas-83716235.htm
Ad #10: https://www.mudah.my/Taman+De

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Ad #95: https://www.mudah.my/HOT+UNIT+BERBALOI+Condominium+Sentul+Utama+Jalan+Dato+Senu-83713935.htm
Ad #96: https://www.mudah.my/Plaza+Medan+Putra+Menjalara+Kepong+Well+Kept+Near+Desa+Park+City-83713925.htm
Ad #97: https://www.mudah.my/Pv+12+pv12+1384sf+with+2+car+parking+genting+view+with+furnish-83409691.htm
Ad #98: https://www.mudah.my/Casa+Magna+Kepong+Metro+Prima+Walking+Distance+to+KFC+MRT+Aeon-82919511.htm
Ad #99: https://www.mudah.my/Vista+Mutiara+Non+Bumi+1081sf+2+C+P+Kepong+Next+To+Tzu+Chi+MRT-81969224.htm
Ad #100: https://www.mudah.my/Putra+Majestik+Jalan+Ipoh+Putra+Majestic+RENO+PoolView+Sentul-81925295.htm
Ad #101: https://www.mudah.my/Hijau+Ria+RENO+Corner+Lot+Kepong+Indah+Near+FIRM+Selayang+KL-82807291.htm
Ad #102: https://www.mudah.my/Serdang+Villa+Apartment+Bukit+Serdang+850sqft+RENOVATED+100+Full+Loan-81978473.htm
Ad #103: https://www.mudah.my/Pangsapuri+Jati+Selatan+Desa+Petaling+826sqft+100+Full+Loan+Nice+View-79658780.htm
Ad #104: https://www.mudah.my/Huge+Size+Sr

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Ad #204: https://www.mudah.my/Semi+D+l+Taman+Yarl+l+OUG+l+Bukit+Jalil+l+Old+Klang+Road-83521977.htm
Ad #205: https://www.mudah.my/Setapak+Taman+Melati+New+Condo+100m+to+TARUC+LRT-83001733.htm
Ad #206: https://www.mudah.my/SD+Apartment+2+Bandar+Sri+Damansara+Kepong-83711965.htm
Ad #207: https://www.mudah.my/Cheras+Taman+Connaught+2sty+House+Good+Location-83711953.htm
Ad #208: https://www.mudah.my/De+Tropicana+Kuchai+Lama+989sf+renoated+100+FULL+LOAN-83711942.htm
Ad #209: https://www.mudah.my/The+Lumayan+Apartment+Bandar+Sri+Permaisuri+Cheras+Kuala+Lumpur-83598820.htm
Ad #210: https://www.mudah.my/FH+Double+Storey+Banglow+at+Yulek+Taman+Bukit+Cheras+near+Leisure+Mall-83711817.htm
Ad #211: https://www.mudah.my/Taman+bukit+permai+shop+apartment+big+good+for+investment+cheap-83711812.htm
Ad #212: https://www.mudah.my/Tuan+puan+jom+happy+ii+Taman+Sri+Bahagia+Cheras+Kuala+Lumpur-83711778.htm
Ad #213: https://www.mudah.my/+Best+Utk+Sendiri+Tinggal+Plaza+Medan+Putra+Menara+Menjalara+Kepong-8371

In [67]:
# Build the table directly from the column lists (dict keys become the column
# names), then preview the first rows.
crawled_df = pd.DataFrame(ads_dict)
crawled_df.head(n=5)

Unnamed: 0,list_title,url,price,area,category,prop_type,prop_title1,prop_title2,bedrooms,bathroom,facilities,size
0,New Luxury Freehold Residence 4min Walk to M...,https://www.mudah.my/New+Luxury+Freehold+Resid...,RM 597 000,Mid Valley City,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,1,2,,689 sq.ft.
1,Sri Putramas 1 1100sqft Jalan Kuching Below ...,https://www.mudah.my/Sri+Putramas+1+1100sqft+J...,RM 405 000,Jalan Kuching,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Swimming Pool, Gymnasium, Tennis Court, Squash...",1100 sq.ft.
2,0% DOWNPAYMENT Arena Green 750SF Bukit Jalil...,https://www.mudah.my/0+DOWNPAYMENT+Arena+Green...,RM 320 000,Bukit Jalil,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Mini Market, Playground, Jogging Track, 24 Hou...",878 sq.ft.
3,[Duplex Penthouse] Silk Residence Duplex Dou...,https://www.mudah.my/+Duplex+Penthouse+Silk+Re...,RM 900 000,Cheras,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,6,7,"Swimming Pool, Gymnasium, Mini Market, Playgro...",4177 sq.ft.
4,BELOW MARKET!! Menara D'Sara Condo Sri Daman...,https://www.mudah.my/BELOW+MARKET+Menara+D+Sar...,RM 380 000,Sri Damansara,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Swimming Pool, Gymnasium, Tennis Court, Squash...",1130 sq.ft.


# Clean Crawled Data

### Format Price to Integer

In [73]:
# Convert prices such as "RM 597 000" to the integer 597000.
# Uses vectorized `.str` methods instead of a per-element lambda. The leading
# `astype(str)` lets the cell be re-run safely once the column is already
# integer (the original raised AttributeError on a second run).
crawled_df['price'] = (
    crawled_df['price']
    .astype(str)
    .str.replace(' ', '', regex=False)  # drop the spaces used as thousands separators
    .str.strip('RM')                    # remove the currency marker characters from both ends
    .astype(int)
)

### Split the size column into a numeric `size` (float) and a `size_unit` column holding the unit of measurement

In [85]:
# Split strings such as "689 sq.ft." on whitespace: piece 0 is the number and
# piece 1 is the unit.
split_size = crawled_df['size'].str.split(expand=True)
size_number, size_unit_label = split_size[0], split_size[1]
crawled_df['size'] = size_number.astype(float)
crawled_df['size_unit'] = size_unit_label

### Trim whitespace from both ends of every text cell

In [108]:
def _strip_if_text(column):
    """Return the column with surrounding whitespace stripped when its dtype is object; otherwise unchanged."""
    if column.dtypes == object:
        return column.str.strip()
    return column


# Applied column by column; non-object columns are passed through untouched.
crawled_df = crawled_df.apply(_strip_if_text)

# View clean data and save data to file

In [111]:
# Preview the first five rows of the cleaned table.
crawled_df.head(n=5)

Unnamed: 0,list_title,url,price,area,category,prop_type,prop_title1,prop_title2,bedrooms,bathroom,facilities,size,size_unit
0,New Luxury Freehold Residence 4min Walk to Mid...,https://www.mudah.my/New+Luxury+Freehold+Resid...,597000,Mid Valley City,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,1,2,,689.0,sq.ft.
1,Sri Putramas 1 1100sqft Jalan Kuching Below Ma...,https://www.mudah.my/Sri+Putramas+1+1100sqft+J...,405000,Jalan Kuching,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Swimming Pool, Gymnasium, Tennis Court, Squash...",1100.0,sq.ft.
2,0% DOWNPAYMENT Arena Green 750SF Bukit Jalil [...,https://www.mudah.my/0+DOWNPAYMENT+Arena+Green...,320000,Bukit Jalil,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Mini Market, Playground, Jogging Track, 24 Hou...",878.0,sq.ft.
3,[Duplex Penthouse] Silk Residence Duplex Doubl...,https://www.mudah.my/+Duplex+Penthouse+Silk+Re...,900000,Cheras,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,6,7,"Swimming Pool, Gymnasium, Mini Market, Playgro...",4177.0,sq.ft.
4,BELOW MARKET!! Menara D'Sara Condo Sri Damansa...,https://www.mudah.my/BELOW+MARKET+Menara+D+Sar...,380000,Sri Damansara,Apartments,Condo / Services residence / Penthouse / Townh...,Freehold,Non Bumi Lot,3,2,"Swimming Pool, Gymnasium, Tennis Court, Squash...",1130.0,sq.ft.


In [112]:
# Write the cleaned ads to a CSV file in the working directory, leaving out the
# DataFrame index.
output_csv = 'Q1_Mudah_PropAds.csv'
crawled_df.to_csv(output_csv, index=False)