<div class="alert alert-success"><b>
Data Collection from websites
</b></div>

# Property listings

In [2]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

%matplotlib inline

#### Access the Houses in Nairobi (NBO) web page

In [3]:
# Fetch the for-sale landing page. The timeout keeps the kernel from hanging
# on a stalled connection, and raise_for_status() stops the cell on an HTTP
# error instead of silently parsing an error page.
r = requests.get("https://house.jumia.co.ke/for-sale/", timeout=30)
r.raise_for_status()
c = r.content
soup = BeautifulSoup(c, "html.parser")
# Uncomment to inspect the source of the loaded page
# print(soup.prettify())

# Collect every <div class="listing-info"> element on the page.
# NOTE: `all` shadows the Python builtin; the name is kept because the
# following cell indexes into it.
all = soup.find_all("div",{"class":"listing-info"})
len(all)

30

#### Check the price element

In [4]:
# Read the price text of the first listing and drop the thousands separators
first_listing = all[0]
price_tag = first_listing.find("span", {"class": "listing-price"})
price = price_tag.text.replace(",", "")
price

'KSh 5250000'

## Data Collection 

- Crawl web pages
- Extract selected elements
- Collect each listing in a dict and store the results in a CSV

In [6]:
base_url = "https://house.jumia.co.ke/nairobi/house/for-sale/?page="
FIRST_PAGE = 1
LAST_PAGE = 50        # pages 1..50 are crawled (range() excludes its stop value)
PAGE_SIZE = 30        # listings requested per results page
REQUEST_TIMEOUT = 30  # seconds; avoids hanging the kernel on a stalled request

data = []
# Crawl through the available result pages, collecting one dict per listing
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    prop_page = f"{base_url}{page}&size={PAGE_SIZE}"
    r = requests.get(prop_page, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error rather than parsing an error page as if it had no listings
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    listings = soup.find_all("div", {"class": "listing-info"})

    # Collect price, description, size and address from each listing on the page
    for item in listings:
        d = {}
        # Property value (thousands separators removed)
        d["value"] = item.find("span", {"class": "listing-price"}).text.replace(",", "")
        # Property description
        d["desc"] = item.find("a", {"class": "main-link"}).text.strip()
        # Property size / bedroom capacity; stored as None when the listing
        # has no size attribute. Only the "tag not found" case is tolerated,
        # so unrelated errors are no longer hidden by a bare except.
        size_tag = item.find("span", {"class": "listing-attributes-value"})
        d["size"] = size_tag.text.replace("\n", "") if size_tag is not None else None
        # Property address
        d["location"] = item.find("p", {"class": "listing-address icon-location"}).text

        data.append(d)

# Build the DataFrame and write the CSV once, after the crawl has finished,
# instead of rebuilding and rewriting the whole file for every listing.
df = pd.DataFrame(data)
# Create the output folder if needed so a fresh checkout can run the notebook
os.makedirs("scrapped_data", exist_ok=True)
df.to_csv("scrapped_data/listing.csv")

## Load Data to DataFrame for cleaning

In [7]:
# Read back only the four scraped fields from the saved CSV
wanted_columns = ['desc', 'location', 'size', 'value']
df1 = pd.read_csv('scrapped_data/listing.csv', usecols=wanted_columns)

df1.head()

Unnamed: 0,desc,location,size,value
0,ROCKVILLA GARDENS,"Kangundo Road., Kangundo Road, Nairobi, Nairobi",3 Bedrooms,KSh 2950000
1,A Lovely And Modern 4 Bedroom House All En Sui...,"Mombasa Road., Mombasa Road, Nairobi, Nairobi",4 Bedrooms,KSh 32000000
2,"Mombasa Rd Near Kapa, 4 Bedroom Maisonette for...","Syokimau, Syokimau, Nairobi, Nairobi",4 Bedrooms,KSh 12800000
3,Farm(3 ACRES) and Redhill House for sale,"Red Hill, Nairobi, Nairobi",4 Bedrooms,KSh 180000000
4,Beautiful 3 Bedroom Bungalow With Own Compound...,"Saika Kangundo Road, Kangundo Road, Nairobi, N...",3 Bedrooms,KSh 6700000


#### clean columns

In [8]:
# Save value as numeric in KSh.
# "Contact seller for price" carries no amount, so it is recorded as 0.
price_text = df1['value'].str.replace("Contact seller for price", "0", regex=False)
# Strip the currency label, "~" and whitespace. regex=True is explicit because
# pandas 2.0 changed the default of Series.str.replace to literal matching.
# The tokens are removed rather than replaced with "0", which would insert an
# extra digit whenever one of them follows the number (e.g. a trailing space).
price_text = price_text.str.replace(r"KSh|~|\s", "", regex=True)
# A value made up only of the stripped tokens has no amount either -> 0
price_text = price_text.replace("", "0")
df1['value'] = pd.to_numeric(price_text)

# Size to number of bedrooms: keep the leading token of e.g. "3 Bedrooms"
df1['size'] = df1['size'].str.split(' ').str[0]

df1.head()

Unnamed: 0,desc,location,size,value
0,ROCKVILLA GARDENS,"Kangundo Road., Kangundo Road, Nairobi, Nairobi",3,2950000
1,A Lovely And Modern 4 Bedroom House All En Sui...,"Mombasa Road., Mombasa Road, Nairobi, Nairobi",4,32000000
2,"Mombasa Rd Near Kapa, 4 Bedroom Maisonette for...","Syokimau, Syokimau, Nairobi, Nairobi",4,12800000
3,Farm(3 ACRES) and Redhill House for sale,"Red Hill, Nairobi, Nairobi",4,180000000
4,Beautiful 3 Bedroom Bungalow With Own Compound...,"Saika Kangundo Road, Kangundo Road, Nairobi, N...",3,6700000


In [11]:
# Write the cleaned listings to a CSV (path is relative to the working directory)
cleaned_path = 'cleaned_listing_data.csv'
df1.to_csv(cleaned_path)

In [12]:
# Sanity check: (rows, columns) of the cleaned frame.
# NOTE(review): a drop of row 562 used to sit here, commented out — confirm it is no longer needed.
df1.shape



(1500, 4)