In [None]:
import re
import requests
from urllib.parse import quote
import pandas as pd

# Cities whose Lamudi "houses for sale" listing page will be scraped
cities = [
    "Makati",
    "Muntinlupa",
    "Pasig",
    "Bacoor",
    "Trece Martires",
    "Calamba",
    "Batangas City",
    "Cainta",
    "Rodriguez",
    "San Fernando"
]

# URL template; the placeholder receives the lower-cased, hyphenated city slug
base_url = "https://www.lamudi.com.ph/buy/{}/house/"

# Browser-like User-Agent sent with every request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# Regex pattern capturing the text inside each price <div>
pattern = r'<div class="snippet__content__price">\s*([^<]+)\s*</div>'

# Maximum number of price snippets kept per city page
MAX_PRICES_PER_CITY = 30

# Accumulates one {"City", "Price"} record per extracted price
results = []

for city in cities:
    # Build the URL slug: lower-case, spaces -> hyphens, then percent-encode
    encoded_city = quote(city.lower().replace(" ", "-"))
    url = base_url.format(encoded_city)

    print(f"\n--- {city} ---")
    print(f"Fetching: {url}")

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = response.text

        # Extract, trim, and cap the number of prices taken from this page
        prices = [p.strip() for p in re.findall(pattern, html)][:MAX_PRICES_PER_CITY]

        if prices:
            results.extend([{"City": city, "Price": price} for price in prices])
        else:
            print("No prices found.")

    except requests.RequestException as e:
        # A failed city is reported and skipped so the remaining cities still run
        print(f"Error fetching {city}: {e}")

# Convert results to a DataFrame. Explicit columns keep the schema stable even
# when nothing was scraped, so later cells can still reference df["Price"].
df = pd.DataFrame(results, columns=["City", "Price"])

print("\n‚úÖ Scraping complete! Here's a sample:")
print(df.head())


--- Makati ---
Fetching: https://www.lamudi.com.ph/buy/makati/house/

--- Muntinlupa ---
Fetching: https://www.lamudi.com.ph/buy/muntinlupa/house/

--- Pasig ---
Fetching: https://www.lamudi.com.ph/buy/pasig/house/

--- Bacoor ---
Fetching: https://www.lamudi.com.ph/buy/bacoor/house/

--- Trece Martires ---
Fetching: https://www.lamudi.com.ph/buy/trece-martires/house/

--- Calamba ---
Fetching: https://www.lamudi.com.ph/buy/calamba/house/

--- Batangas City ---
Fetching: https://www.lamudi.com.ph/buy/batangas-city/house/

--- Cainta ---
Fetching: https://www.lamudi.com.ph/buy/cainta/house/

--- Rodriguez ---
Fetching: https://www.lamudi.com.ph/buy/rodriguez/house/

--- San Fernando ---
Fetching: https://www.lamudi.com.ph/buy/san-fernando/house/

‚úÖ Scraping complete! Here's a sample:
     City            Price
0  Makati  ‚Ç± 1,500,000,000
1  Makati  ‚Ç± 1,500,000,000
2  Makati  ‚Ç± 1,700,000,000
3  Makati  ‚Ç± 1,900,000,000
4  Makati    ‚Ç± 175,000,000


In [9]:
# Convert prices to numeric
def clean_price(price_str):
    """Convert a scraped price string to a float.

    Every character that is not a digit or a dot is removed (currency sign,
    thousands separators, spaces, letters) and the remainder is parsed.

    Args:
        price_str: Raw price text, e.g. "1,500,000" with an optional
            currency prefix.

    Returns:
        The price as a float, or None when the input is not a string
        (e.g. None / NaN), contains no digits, or the remaining characters
        do not form a valid number (e.g. "..." or "1.2.3").
    """
    # Missing values coming from pandas (None / NaN) are not strings
    if not isinstance(price_str, str):
        return None

    # Remove Peso sign, commas, spaces, and non-numeric characters
    clean = re.sub(r"[^\d.]", "", price_str)
    if not clean:
        return None

    try:
        return float(clean)
    except ValueError:
        # Leftovers such as "." or "1.2.3" are not parseable numbers
        return None

# Parse every raw price string and store the result as a new column
numeric_prices = df["Price"].apply(clean_price)
df["Price_clean"] = numeric_prices

# Preview the raw and parsed prices side by side
preview_columns = ["City", "Price", "Price_clean"]
print(df[preview_columns].head())

     City            Price   Price_clean
0  Makati  ‚Ç± 1,500,000,000  1.500000e+09
1  Makati  ‚Ç± 1,500,000,000  1.500000e+09
2  Makati  ‚Ç± 1,700,000,000  1.700000e+09
3  Makati  ‚Ç± 1,900,000,000  1.900000e+09
4  Makati    ‚Ç± 175,000,000  1.750000e+08


In [10]:
# Bare expression: the notebook renders the DataFrame as this cell's output
df

Unnamed: 0,City,Price,Price_clean
0,Makati,"‚Ç± 1,500,000,000",1.500000e+09
1,Makati,"‚Ç± 1,500,000,000",1.500000e+09
2,Makati,"‚Ç± 1,700,000,000",1.700000e+09
3,Makati,"‚Ç± 1,900,000,000",1.900000e+09
4,Makati,"‚Ç± 175,000,000",1.750000e+08
...,...,...,...
95,San Fernando,"‚Ç± 2,073,000",2.073000e+06
96,San Fernando,"‚Ç± 1,922,000",1.922000e+06
97,San Fernando,"‚Ç± 6,600,000",6.600000e+06
98,San Fernando,"‚Ç± 5,500,000",5.500000e+06


In [12]:
import plotly.graph_objects as go

# Keep only rows with a parsed price, ordered by city name so the traces
# are added in alphabetical order
df_plot = df.dropna(subset=["Price_clean"]).sort_values(by="City")

# Shared marker styling: small points, outliers in translucent red
outlier_marker = dict(outliercolor='rgba(255, 0, 0, 0.6)', size=4)

# One box trace per city
fig = go.Figure()

for city in df_plot["City"].unique():
    city_mask = df_plot["City"] == city
    city_box = go.Box(
        y=df_plot.loc[city_mask, "Price_clean"],
        name=city,
        # boxmean='sd' could be added here to show mean and standard deviation
        marker=outlier_marker,
    )
    fig.add_trace(city_box)

# Figure-level titles, theme, and size
layout_options = dict(
    title="üèòÔ∏è Distribution of Property Prices by City",
    yaxis_title="Price (PHP)",
    xaxis_title="City",
    boxmode="group",
    template="plotly_white",
    height=600,
    width=1000,
)
fig.update_layout(**layout_options)

fig.show()


Old Code

In [5]:
import re
import requests

# URL for demo (you can replace it with your Lamudi link)
url = "https://www.lamudi.com.ph/buy/metro-manila/muntinlupa/alabang/house/"

# Browser-like User-Agent sent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Fetch page. The timeout stops the cell from hanging on an unresponsive
# server, and raise_for_status() fails loudly on an HTTP error instead of
# silently running the regex over an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text

# Regex pattern to match a price element such as:
# <div class="snippet__content__price"> ₱ 45,000,000 </div>
pattern = r'<div class="snippet__content__price">\s*([^<]+)\s*</div>'

# Find all matches
prices = re.findall(pattern, html)

print("Extracted prices:")
for price in prices:
    # The capture group can include trailing whitespace; trim before printing
    print(price.strip())


Extracted prices:
‚Ç± 45,000,000
‚Ç± 63,000,000
‚Ç± 105,000,000
‚Ç± 95,000,000
‚Ç± 185,000,000
‚Ç± 90,000,000
‚Ç± 63,000,000
‚Ç± 63,000,000
‚Ç± 95,000,000
‚Ç± 125,000,000
‚Ç± 61,813,675
‚Ç± 98,000,989
‚Ç± 60,000,000
‚Ç± 65,000,000
‚Ç± 120,000,000
‚Ç± 105,000,000
‚Ç± 65,000,000
‚Ç± 420,000,000
‚Ç± 87,990,000
‚Ç± 170,000,000
‚Ç± 200,000,000
‚Ç± 149,900,000
‚Ç± 49,839,570
‚Ç± 20,500,000
‚Ç± 58,000,000
‚Ç± 135,000,000
‚Ç± 88,000,000
‚Ç± 92,500,000
‚Ç± 80,000,000
‚Ç± 410,000,000


In [8]:
import re
import requests
import pandas as pd

# --- Target URL ---
url = "https://www.lamudi.com.ph/buy/metro-manila/muntinlupa/taguig/house/"

# --- Headers to mimic a browser ---
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# --- Fetch page ---
# The timeout stops the cell from hanging on an unresponsive server, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an
# error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text

# --- Regex patterns ---
price_pattern = r'<div class="snippet__content__price">\s*([^<]+)\s*</div>'
location_pattern = r'<span[^>]*data-test="snippet-content-location"[^>]*>\s*([^<]+)\s*</span>'

# --- Find all matches ---
prices = [p.strip() for p in re.findall(price_pattern, html)]
locations = [loc.strip() for loc in re.findall(location_pattern, html)]

# --- Align lists (in case of mismatch) ---
# Prices and locations are matched independently, so unequal counts mean at
# least one listing lacks one of the two fields; truncating keeps the frame
# rectangular but may pair a price with the wrong location.
if len(prices) != len(locations):
    print(
        f"Warning: found {len(prices)} prices but {len(locations)} locations; "
        "truncating to the shorter list, rows may be misaligned."
    )
min_len = min(len(prices), len(locations))
prices, locations = prices[:min_len], locations[:min_len]

# --- Create DataFrame ---
df = pd.DataFrame({
    "Price": prices,
    "Location": locations
})

# --- Clean price column ---
# Strip everything except digits and dots (currency sign, commas, spaces),
# then convert to a numeric dtype; unparseable values become NaN.
df["Clean_Price"] = pd.to_numeric(
    df["Price"].str.replace(r"[^\d.]", "", regex=True),
    errors="coerce",
)

# --- Display results ---
print(df.head())

# --- Optionally save to CSV ---
# df.to_csv("lamudi_prices.csv", index=False)
# print("\nData saved to lamudi_prices.csv")


           Price                Location Clean_Price
0    ‚Ç± 1,384,600  Fort Bonifacio, Taguig     1384600
1   ‚Ç± 85,000,000         Bicutan, Taguig    85000000
2   ‚Ç± 48,000,000    Taguig, Metro Manila    48000000
3  ‚Ç± 200,000,000   McKinley Hill, Taguig   200000000
4            ‚Ç± 1   McKinley Hill, Taguig           1


In [9]:
# Bare expression: the notebook renders the DataFrame as this cell's output
df

Unnamed: 0,Price,Location,Clean_Price
0,"‚Ç± 1,384,600","Fort Bonifacio, Taguig",1384600
1,"‚Ç± 85,000,000","Bicutan, Taguig",85000000
2,"‚Ç± 48,000,000","Taguig, Metro Manila",48000000
3,"‚Ç± 200,000,000","McKinley Hill, Taguig",200000000
4,‚Ç± 1,"McKinley Hill, Taguig",1
5,‚Ç± 1,"McKinley Hill, Taguig",1
6,‚Ç± 1,"Fort Bonifacio, Taguig",1
7,"‚Ç± 50,000,000","Bicutan, Taguig",50000000
8,"‚Ç± 198,000,000","Fort Bonifacio, Taguig",198000000
9,"‚Ç± 196,000,000","Fort Bonifacio, Taguig",196000000


Scraping Numbeo

In [None]:
import json

# Hard-coded two-item excerpt standing in for a real price payload
sample_data = {
    "city_id": 6512,
    "prices": [
        {
            "data_points": 70,
            "item_id": 14,
            "lowest_price": 6,
            "average_price": 8,
            "highest_price": 10,
            "item_name": "Bottle of Wine (Mid-Range), Markets",
        },
        {
            "data_points": 52,
            "item_id": 15,
            "lowest_price": 1,
            "average_price": 1.9053921568627454,
            "highest_price": 4,
            "item_name": "Domestic Beer (0.5 liter bottle), Markets",
        },
    ],
}

# Stand-in for the value fetch_numbeo_prices() would return
data = sample_data

# One summary line per priced item: name, average, then the low/high range
for entry in data.get("prices", []):
    name, average = entry["item_name"], entry["average_price"]
    low, high = entry["lowest_price"], entry["highest_price"]
    print(f"{name}: avg {average} (low {low} ‚Äì high {high})")
