# Nairobi House Price Prediction – Day 1  
Data Collection from BuyRentKenya

Goal: scrape ~400 Nairobi house listings  
Fields: Location, Property Type, Bedrooms, Bathrooms, Size, Amenities, Price, Date  


In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime
import os
import re   # for cleaning phones

# Record when this run began (local time, minute resolution)
started_at = datetime.now()
print("Started:", started_at.strftime("%Y-%m-%d %H:%M"))

Started: 2026-02-18 16:18


In [16]:
# Save location: folder and file name for the raw scrape output
data_folder = "../data/raw/"
output_file = "raw_listings_buyrentkenya_2026-02-18.csv"

# exist_ok=True makes the creation safe to re-run and free of the
# check-then-create race; it also fails loudly (FileExistsError) if the path
# exists but is a regular file rather than a directory.
folder_already_there = os.path.isdir(data_folder)
os.makedirs(data_folder, exist_ok=True)
if not folder_already_there:
    print("Created:", data_folder)

In [23]:
# Listing index that is paginated by the scraper
base_url = "https://www.buyrentkenya.com/houses-for-sale/nairobi"

# HTTP headers sent with every page request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0 Safari/537.36"}

# Stop conditions for the scrape loop: enough rows collected, or page cap hit
target = 400
max_pages_limit = 40   # safety

# Lower / upper bound (seconds) of the random pause between page requests
min_delay, max_delay = 3.0, 8.0

print(f"Target: {target} listings | Max pages: {max_pages_limit}")
print("Note: page 1 has no ?page= param, pages 2+ use ?page=N")

Target: 400 listings | Max pages: 40
Note: page 1 has no ?page= param, pages 2+ use ?page=N


In [24]:
# ────────────────────────────────────────────────────────────────
# Function to scrape ONE page from BuyRentKenya (plus parsing helpers)
# ────────────────────────────────────────────────────────────────

# Phone numbers (+254… / 254… / 07… / 01…) and contact call-to-action words.
# The look-arounds matter:
#   * (?<!\d) stops the phone pattern from matching inside a longer digit run
#   * the letter guards stop "call"/"contact" from being cut out of ordinary
#     words such as "strategically" or "locally".
_CONTACT_NOISE_RE = re.compile(
    r'(?<!\d)(?:\+?254|07|01)\d{8,}'
    r'|(?<![A-Za-z])(?:call|contact|whatsa?pp)(?![A-Za-z])',
    flags=re.I,
)


def _guess_property_type(title):
    """Map a listing title to a coarse property type; defaults to "house"."""
    lowered = title.lower()
    if "townhouse" in lowered:
        return "townhouse"
    if "villa" in lowered:
        return "villa"
    if "apartment" in lowered or "flat" in lowered:
        return "apartment"
    return "house"


def _scrub_contact_details(text):
    """Remove phone numbers / contact keywords and collapse whitespace."""
    without_contacts = _CONTACT_NOISE_RE.sub('', text)
    return re.sub(r'\s+', ' ', without_contacts).strip()


def _parse_card(card, url, scrape_date):
    """Build one listing row (dict) from a single card element."""
    # Location: first matching <p>, else a <span class="capitalize"> fallback
    location = ""
    loc_tag = card.find("p", class_=["location", "ml-1", "text-grey-650"])
    if loc_tag:
        location = loc_tag.get_text(strip=True)
    else:
        loc_span = card.find("span", class_="capitalize")
        if loc_span:
            location = loc_span.get_text(strip=True)

    # Title → used to guess property type
    title_tag = card.find(["h2", "h3", "span"], class_=["title", "property-title", "relative"])
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Beds, Baths, Size – read from the "whitespace-nowrap" spans.
    # Beds/baths keep the first whitespace-separated token of the span text.
    beds = baths = size = None
    for span in card.find_all("span", class_="whitespace-nowrap"):
        txt = span.get_text(strip=True).lower()
        if "bed" in txt:
            beds = txt.split()[0]
        elif "bath" in txt:
            baths = txt.split()[0]
        elif any(unit in txt for unit in ["m²", "sqm", "acre", "plot", "ft"]):
            size = txt

    # Price – keep only the digits of the price text
    price = None
    price_tag = card.find("p", class_=["font-bold", "text-xl", "text-grey-900"])
    if price_tag:
        digits = "".join(c for c in price_tag.get_text(strip=True) if c.isdigit())
        if digits:
            try:
                price = int(digits)
            except ValueError:
                # str.isdigit() also accepts characters such as "²" that
                # int() rejects; treat that as "no price".
                pass

    # Amenities / short description, with contact details scrubbed out
    amenities = ""
    desc_tag = card.find("p", class_=["text-grey-500", "truncate"])
    if desc_tag:
        amenities = _scrub_contact_details(desc_tag.get_text(strip=True))

    return {
        "Location": location,
        "Property Type": _guess_property_type(title),
        "Bedrooms": beds,
        "Bathrooms": baths,
        "Size": size,
        "Amenities": amenities,
        "Price": price,
        "Listing Date": "",   # rarely present on list pages
        "Source_URL": url,
        "Scrape_Date": scrape_date,
    }


def scrape_one_page(page_num):
    """Fetch and parse one page of house listings.

    Reads the module-level ``base_url`` and ``headers``.

    Parameters
    ----------
    page_num : int
        Results page to fetch. Page 1 is requested without a ``?page=``
        parameter; any other page uses ``?page=N``.

    Returns
    -------
    list of dict
        One dict per parsed card with the keys Location, Property Type,
        Bedrooms, Bathrooms, Size, Amenities, Price, Listing Date,
        Source_URL and Scrape_Date. An empty list is returned when the
        request fails or no listing cards can be located.

    Notes
    -----
    NOTE(review): the output recorded in this notebook shows ``Location``
    empty and the area name landing in ``Amenities``, so the location /
    description selectors do not match the live markup as intended. A list
    passed to ``class_`` appears to match a tag carrying ANY of the listed
    classes, which probably makes these selectors broader than they look –
    confirm against the page HTML before relying on either column.
    """
    # Special handling: page 1 has no ?page= parameter
    url = base_url if page_num == 1 else base_url + "?page=" + str(page_num)

    print(f"Scraping page {page_num}: {url}")

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses
        print(f"   → Request failed: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Debug: check if we actually got content with listings
    page_text = soup.get_text().lower()
    print("DEBUG: 'bedroom' found?", "YES" if "bedroom" in page_text else "NO")
    print("DEBUG: 'ksh' or 'kes' found?", "YES" if "ksh" in page_text or "kes" in page_text else "NO")
    print("DEBUG: 'price' keyword found?", "YES" if "price" in page_text else "NO")

    # Main selector first
    cards = soup.find_all("div", class_="listing-card")

    # If nothing found, try common alternatives
    if len(cards) == 0:
        print("   → 'listing-card' gave 0, trying other classes...")
        cards = soup.find_all("div", class_=["property-card", "card", "listing-item", "search-result"])

    # Last resort fallback: any div whose text mentions a key word.
    # The text of each div is extracted once and reused for both checks.
    # NOTE(review): nested divs can all match here, so this path may yield
    # overlapping / duplicate cards.
    if len(cards) == 0:
        print("   → Still no cards, using broad fallback")
        cards = []
        for tag in soup.find_all("div"):
            tag_text = tag.get_text().lower()
            if "bedroom" in tag_text or "ksh" in tag_text:
                cards.append(tag)

    if len(cards) == 0:
        print("   → No listings found on this page. Selector probably wrong or JS-loaded content.")
        return []

    print(f"   → Found {len(cards)} potential listing cards")

    scrape_date = datetime.now().strftime("%Y-%m-%d")   # same for every card on the page
    results = []
    skipped = 0
    first_error = None

    for card in cards:
        try:
            results.append(_parse_card(card, url, scrape_date))
        except Exception as e:
            # Best effort: one malformed card must not abort the page, but
            # the failure is counted and reported instead of vanishing.
            skipped += 1
            if first_error is None:
                first_error = e

    if skipped:
        print(f"   → Skipped {skipped} card(s) that failed to parse (first error: {first_error!r})")

    print(f"   → Successfully parsed {len(results)} listings from page {page_num}")
    return results

In [25]:
# Accumulates one dict per listing across all scraped pages
all_data = []
page = 1

print("\nStarting scrape...\n")

# Keep requesting pages until enough listings are collected or the page cap is hit
while len(all_data) < target and page <= max_pages_limit:
    page_data = scrape_one_page(page)

    if not page_data:
        # scrape_one_page returns [] both on a failed request and when no
        # cards were found, so the two cases cannot be told apart here.
        print("Stopping – no more listings or selector broken")
        break

    all_data.extend(page_data)
    print(f"Total so far: {len(all_data)}\n")

    page += 1

    # Throttle only when another page will actually be requested – there is
    # no point sleeping after the final page.
    if len(all_data) < target and page <= max_pages_limit:
        wait = random.uniform(min_delay, max_delay)
        print(f"   Sleeping {wait:.1f} sec...")
        time.sleep(wait)

print("Finished scraping")


Starting scrape...

Scraping page 1: https://www.buyrentkenya.com/houses-for-sale/nairobi
DEBUG: 'bedroom' found? YES
DEBUG: 'ksh' or 'kes' found? YES
DEBUG: 'price' keyword found? YES
   → Found 25 potential listing cards
   → Successfully parsed 25 listings from page 1
Total so far: 25

   Sleeping 3.5 sec...
Scraping page 2: https://www.buyrentkenya.com/houses-for-sale/nairobi?page=2
DEBUG: 'bedroom' found? YES
DEBUG: 'ksh' or 'kes' found? YES
DEBUG: 'price' keyword found? YES
   → Found 25 potential listing cards
   → Successfully parsed 25 listings from page 2
Total so far: 50

   Sleeping 3.8 sec...
Scraping page 3: https://www.buyrentkenya.com/houses-for-sale/nairobi?page=3
DEBUG: 'bedroom' found? YES
DEBUG: 'ksh' or 'kes' found? YES
DEBUG: 'price' keyword found? YES
   → Found 25 potential listing cards
   → Successfully parsed 25 listings from page 3
Total so far: 75

   Sleeping 5.5 sec...
Scraping page 4: https://www.buyrentkenya.com/houses-for-sale/nairobi?page=4
DEBUG: 'b

In [26]:
# Persist the scraped rows and print a quick quality summary
if not all_data:
    print("Nothing saved – check selectors")
else:
    df = pd.DataFrame(all_data)
    save_path = os.path.join(data_folder, output_file)
    df.to_csv(save_path, index=False, encoding="utf-8")

    n_rows = len(df)
    banner = "="*60

    print("\n" + banner)
    print(f"SAVED {n_rows} rows → {save_path}")
    print(banner)

    print("\nFirst 6 rows:")
    print(df.head(6))

    # Coverage of the key fields: non-null prices / bedrooms, distinct locations
    prices_found = df['Price'].notna().sum()
    beds_found = df['Bedrooms'].notna().sum()
    location_count = df['Location'].nunique()
    sample_locations = df['Location'].dropna().unique()[:8].tolist()

    print("\nSummary:")
    print(f"→ Prices found: {prices_found} / {n_rows}")
    print(f"→ Bedrooms parsed: {beds_found}")
    print(f"→ Unique locations: {location_count}")
    print("Sample locations:", ", ".join(sample_locations))


SAVED 407 rows → ../data/raw/raw_listings_buyrentkenya_2026-02-18.csv

First 6 rows:
  Location Property Type Bedrooms Bathrooms  Size           Amenities  \
0                  house        6         6  None  Thigiri, Westlands   
1                  house        4      None  None         Kiambu Road   
2              townhouse        6         7  None           Lavington   
3                  villa        5         5  None           Lavington   
4                  villa        5         6  None           Lavington   
5                  villa        5         5  None           Lavington   

         Price Listing Date  \
0  260000000.0                
1   78000000.0                
2  160000000.0                
3   60000000.0                
4   60000000.0                
5   85000000.0                

                                          Source_URL Scrape_Date  
0  https://www.buyrentkenya.com/houses-for-sale/n...  2026-02-18  
1  https://www.buyrentkenya.com/houses-for-sale/n.

In [27]:
# Cleaning
import pandas as pd

# Re-read the scraped file from disk so cleaning starts from the saved CSV
raw_csv_path = "../data/raw/raw_listings_buyrentkenya_2026-02-18.csv"
df = pd.read_csv(raw_csv_path)

# Quick inspection: dimensions, column names, a preview and null counts
print("Original shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nFirst 8 rows:")
print(df.iloc[:8])
print("\nMissing values per column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

Original shape: (407, 10)

Columns: ['Location', 'Property Type', 'Bedrooms', 'Bathrooms', 'Size', 'Amenities', 'Price', 'Listing Date', 'Source_URL', 'Scrape_Date']

First 8 rows:
   Location Property Type  Bedrooms  Bathrooms    Size           Amenities  \
0       NaN         house         6        6.0     NaN  Thigiri, Westlands   
1       NaN         house         4        NaN     NaN         Kiambu Road   
2       NaN     townhouse         6        7.0     NaN           Lavington   
3       NaN         villa         5        5.0     NaN           Lavington   
4       NaN         villa         5        6.0     NaN           Lavington   
5       NaN         villa         5        5.0     NaN           Lavington   
6       NaN         villa         5        6.0     NaN           Lavington   
7       NaN         villa         5        9.0  485 m²           Lavington   

         Price  Listing Date  \
0  260000000.0           NaN   
1   78000000.0           NaN   
2  160000000.0      

In [28]:
def _column_is_empty(frame, name):
    """True when `name` is absent from `frame` or holds only NaN / blank strings."""
    if name not in frame.columns:
        return True
    values = frame[name]
    return bool((values.isna() | values.astype(str).str.strip().eq("")).all())


# 1. Drop useless / empty columns
df_clean = df.drop(columns=['Scrape_Date'], errors='ignore')

# If Listing Date is completely empty (or missing), drop it too
if _column_is_empty(df_clean, 'Listing Date'):
    df_clean = df_clean.drop(columns=['Listing Date'], errors='ignore')

# 2. Rename Amenities → Location (this is the key fix).
#    In the loaded file Location is empty and the area name sits in Amenities.
#    The swap is guarded so that a populated Location column is never
#    silently thrown away.
if _column_is_empty(df_clean, 'Location') and 'Amenities' in df_clean.columns:
    df_clean = (
        df_clean
        .drop(columns=['Location'], errors='ignore')
        .rename(columns={'Amenities': 'Location'})
    )
else:
    print("NOTE: Amenities → Location swap skipped (Location already populated or Amenities missing)")

# 3. Strip extra spaces in Location while leaving missing values missing.
#    (A plain astype(str) would turn NaN into the literal text "nan", which
#    then shows up as a fake location in the counts and in the saved CSV.)
location_raw = df_clean['Location']
df_clean['Location'] = location_raw.where(
    location_raw.isna(),
    location_raw.astype(str).str.strip(),
)

# 4. Move Location to the front (looks nicer)
cols = ['Location'] + [col for col in df_clean.columns if col != 'Location']
df_clean = df_clean[cols]

print("\nAfter cleaning:")
print("Shape:", df_clean.shape)
print("Columns:", df_clean.columns.tolist())
print("\nFirst 10 rows:")
print(df_clean.head(10))
print("\nUnique locations now:", df_clean['Location'].nunique())
print(df_clean['Location'].value_counts().head(15))


After cleaning:
Shape: (407, 7)
Columns: ['Location', 'Property Type', 'Bedrooms', 'Bathrooms', 'Size', 'Price', 'Source_URL']

First 10 rows:
             Location Property Type  Bedrooms  Bathrooms    Size        Price  \
0  Thigiri, Westlands         house         6        6.0     NaN  260000000.0   
1         Kiambu Road         house         4        NaN     NaN   78000000.0   
2           Lavington     townhouse         6        7.0     NaN  160000000.0   
3           Lavington         villa         5        5.0     NaN   60000000.0   
4           Lavington         villa         5        6.0     NaN   60000000.0   
5           Lavington         villa         5        5.0     NaN   85000000.0   
6           Lavington         villa         5        6.0     NaN   85000000.0   
7           Lavington         villa         5        9.0  485 m²   85000000.0   
8           Lavington         villa         5        6.0     NaN   95000000.0   
9           Lavington         villa         5 

In [29]:
# Write the cleaned frame to its own CSV (no index column) for use on Day 2
clean_path = "../data/raw/clean_listings_buyrentkenya_2026-02-18.csv"
df_clean.to_csv(
    clean_path,
    index=False,
)
print("Saved cleaned dataset to:", clean_path)

Saved cleaned dataset to: ../data/raw/clean_listings_buyrentkenya_2026-02-18.csv
