# Melbourne Housing Price Prediction - Task 8.2
## 1. Data Acquisition and Database Setup 
This section collects housing data from realestate.com.au and stores it in a SQLite database for further analysis.

In [6]:
import sqlite3
from pathlib import Path

# Location of the project data folder and of the SQLite file created inside it.
DATA_DIR = Path("../data")
DB_PATH = DATA_DIR / "melbourne_housing.db"
# parents=True lets the cell succeed even when an intermediate folder is missing;
# exist_ok=True keeps it re-runnable.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# One row per sold listing. `source` and `scraped_at` are filled by defaults.
CREATE_SALES_TABLE_SQL = """
    CREATE TABLE IF NOT EXISTS sales (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        suburb TEXT NOT NULL,
        postcode TEXT,
        address TEXT,
        property_type TEXT,
        bedrooms INTEGER,
        bathrooms INTEGER,
        car_spaces INTEGER,
        land_size_m2 REAL,
        building_size_m2 REAL,
        sold_price INTEGER,
        sold_date TEXT,
        source_url TEXT,
        source TEXT DEFAULT 'realestate.com.au',
        scraped_at TEXT DEFAULT (datetime('now'))
    );
    """

# Rejects a second insert of the same (suburb, address, sold_date, sold_price).
# NOTE: SQLite treats NULLs as distinct inside a UNIQUE index, so rows that are
# missing address, sold_date or sold_price are NOT de-duplicated by this index.
CREATE_UNIQUE_INDEX_SQL = """
    CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_listing
    ON sales(suburb, address, sold_date, sold_price);
    """

# `with conn:` only wraps a transaction (commit on success, rollback on error);
# it does not close the connection, so close it explicitly to release the file
# handle before later cells reopen the database.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.execute(CREATE_SALES_TABLE_SQL)
        conn.execute(CREATE_UNIQUE_INDEX_SQL)
finally:
    conn.close()

print("✅ Database created successfully at:", DB_PATH.resolve())


✅ Database created successfully at: /Users/baoan/Study/University/Deakin/Sem 3 - 2025/SIT720 - ML/Week tasks/Week8/8.2/Melbourne-housing-regression/data/melbourne_housing.db


# Create CSV Template

In [8]:
import pandas as pd 

# Header row for the manual-entry CSV. These are the `sales` table columns that
# have to be supplied by hand (id, source and scraped_at are filled by the DB).
template_cols = [
    "suburb",
    "postcode",
    "address",
    "property_type",
    "bedrooms",
    "bathrooms",
    "car_spaces",
    "land_size_m2",
    "building_size_m2",
    "sold_price",
    "sold_date",
    "source_url",
]

template_path = DATA_DIR / "sales_template.csv"

# A frame with columns but no rows is written out as a header-only CSV.
empty_template = pd.DataFrame(columns=template_cols)
empty_template.to_csv(template_path, index=False)

print(f"Template CSV file created at {template_path}")

Template CSV file created at ../data/sales_template.csv


# Locate and verify HTML files 

In [21]:
from pathlib import Path

# Folder holding the HTML files that the following cells parse.
HTML_DIR = DATA_DIR / "html"

# Sort so the processing order (and therefore the printed log) is stable.
files = list(HTML_DIR.glob("*.html"))
files.sort()

print("HTML files found:")
for file_name in (html_file.name for html_file in files):
    print("-", file_name)


HTML files found:
- Camberwell1.html
- Camberwell2.html
- Melbourne1.html
- Melbourne2.html
- Toorak1.html
- Toorak2.html


# Helper functions (infer suburb + extract links)

In [22]:
from bs4 import BeautifulSoup

def infer_suburb(filename: str) -> str:
    """Guess the suburb a file belongs to from its name.

    The match is a case-insensitive substring test. Keywords are tried in a
    fixed order (melbourne, toorak, camberwell) and the first hit wins.

    Args:
        filename: File name to inspect, e.g. ``"Toorak1.html"``.

    Returns:
        ``"Melbourne"``, ``"Toorak"`` or ``"Camberwell"``; ``"Unknown"`` when
        none of the keywords occurs in the name.
    """
    lowered = filename.lower()
    # Order matters: a name containing several keywords maps to the earliest one.
    keyword_to_suburb = (
        ("melbourne", "Melbourne"),
        ("toorak", "Toorak"),
        ("camberwell", "Camberwell"),
    )
    for keyword, suburb_label in keyword_to_suburb:
        if keyword in lowered:
            return suburb_label
    return "Unknown"

def extract_property_links_from_file(path: Path) -> list[str]:
    """Collect the realestate.com.au links found in one saved HTML file.

    An anchor is kept when its ``href`` contains ``"/property-"`` or
    ``"/sold/"``. Site-relative (``/...``) and protocol-relative (``//host/...``)
    hrefs are expanded to absolute ``https`` URLs, anything that does not end up
    under ``https://www.realestate.com.au/`` is dropped, and the query string
    and fragment are removed so that variants of one URL collapse together.

    Args:
        path: HTML file to parse. It is read as UTF-8 and undecodable bytes
            are skipped.

    Returns:
        The distinct normalised URLs, sorted alphabetically.
    """
    site_root = "https://www.realestate.com.au"

    html = path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "lxml")

    links = set()
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        # NOTE(review): "/sold/" presumably also matches search and pagination
        # pages, not only individual listings; confirm this filter is intended.
        if "/property-" not in href and "/sold/" not in href:
            continue

        if href.startswith("//"):
            # Protocol-relative link: it already carries a host, so only the
            # scheme is missing. Prepending the site root here would produce
            # "https://www.realestate.com.au//host/...".
            href = "https:" + href
        elif href.startswith("/"):
            href = site_root + href

        if href.startswith(site_root + "/"):
            # Strip the query string and the fragment so they cannot create
            # duplicate entries for the same page.
            links.add(href.split("?")[0].split("#")[0])

    return sorted(links)

In [24]:
# One bucket of listing URLs per suburb. A URL is stored only under the suburb
# of the first file it appears in, because `seen` is shared across all files.
results = {"Melbourne": [], "Toorak": [], "Camberwell": []}
seen = set()

for f in files:
    suburb = infer_suburb(f.name)
    if suburb == "Unknown":
        # No bucket exists for unrecognised file names.
        print("Skipping:", f.name)
        continue

    urls = extract_property_links_from_file(f)
    # `urls` holds no duplicates, so filtering against `seen` up front is enough.
    new_urls = [u for u in urls if u not in seen]
    seen.update(new_urls)
    results[suburb].extend(new_urls)

    print(f"{f.name}: found={len(urls)}, new={len(new_urls)}")

print("\nSummary:")
for s in results:
    print(f"{s}: {len(results[s])}")
total = sum(len(bucket) for bucket in results.values())

print("TOTAL links:", total)


Camberwell1.html: found=55, new=55
Camberwell2.html: found=55, new=0
Melbourne1.html: found=55, new=53
Melbourne2.html: found=51, new=25
Toorak1.html: found=59, new=54
Toorak2.html: found=55, new=25

Summary:
Melbourne: 78
Toorak: 79
Camberwell: 55
TOTAL links: 212
