In [12]:
import pandas as pd
import numpy as np

In [13]:



# 1. Load the raw data
df = pd.read_csv("propertypro_lagos_10k.csv")
print(f"Original shape: {df.shape}")

# 2. Drop Duplicates
df = df.drop_duplicates(subset=['Title', 'Price', 'Location'])
print(f"Shape after removing duplicates: {df.shape}")

# 3. Clean 'Price'
def clean_price(price):
    if isinstance(price, str):
        # Keep only digits
        clean_str = ''.join(filter(str.isdigit, price))
        if clean_str:
            return float(clean_str)
    return np.nan

df['cleaned_price'] = df['Price'].apply(clean_price)

# 4. Filter Outliers (The "Lagos Factor")
df = df[(df['cleaned_price'] >= 1_000_000) & (df['cleaned_price'] <= 50_000_000_000)]

# 5. Clean 'Location'
def clean_location(loc):
    if not isinstance(loc, str):
        return "Unknown"
    loc = loc.replace("Lagos", "").strip()
    loc = loc.strip(", ")
    return loc

df['cleaned_location'] = df['Location'].apply(clean_location)

# 6. Clean Bedrooms/Bathrooms
df['Bedrooms'] = pd.to_numeric(df['Bedrooms'], errors='coerce').fillna(0)
df['Bathrooms'] = pd.to_numeric(df['Bathrooms'], errors='coerce').fillna(0)



Original shape: (5412, 6)
Shape after removing duplicates: (4323, 6)


In [14]:
df["Full_Location"] = df["Location"]
df = df.drop(columns=['URL','Location'])
df.head()

Unnamed: 0,Title,Price,Bedrooms,Bathrooms,cleaned_price,cleaned_location,Full_Location
0,2 Bedroom Apartment,"₦ 250,000,000",2,0,250000000.0,Lekki Phase 1 Lekki,Lekki Phase 1 Lekki Lagos
1,"Promo Sales On Landview City Eleko, Lagos","₦ 45,000,000",0,0,45000000.0,"Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...","Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ..."
2,5 Bedroom Detached Duplex With Bq,"₦ 680,000,000",5,5,680000000.0,2nd Toll Gate Lekki,2nd Toll Gate Lekki Lagos
3,5 Bedroom Detached Duplex,"₦ 600,000,000",5,5,600000000.0,"Lekki County, Megamound Estate Ikota Lekki","Lekki County, Megamound Estate Ikota Lekki Lagos"
4,5 Bedroom Detached Duplex With Bq,"₦ 300,000,000",5,5,300000000.0,Ajah,Ajah Lagos


In [None]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time

# 1. Load the data
print(f"Original unique locations: {df['cleaned_location'].nunique()}")

# 2. Define the "Major Zones" of Lagos
zone_mapping = {
    'banana': 'Banana Island',
    'ikoyi': 'Ikoyi',
    'victoria island': 'Victoria Island',
    'vi ': 'Victoria Island',
    'lekki': 'Lekki',
    'vgc': 'VGC',
    'chevron': 'Chevron Drive',
    'ajah': 'Ajah',
    'sangotedo': 'Sangotedo',
    'badore': 'Badore',
    'ibeju': 'Ibeju Lekki',
    'eats': 'Epe', 
    'epe': 'Epe',
    
    'ikeja': 'Ikeja',
    'magodo': 'Magodo',
    'ogba': 'Ogba',
    'maryland': 'Maryland',
    'gbagada': 'Gbagada',
    'surulere': 'Surulere',
    'yaba': 'Yaba',
    'festac': 'Festac',
    'apapa': 'Apapa',
    'ikorodu': 'Ikorodu',
    'alimosho': 'Alimosho',
    'agege': 'Agege',
    'ipaja': 'Ipaja'
}

def standardize_location(text):
    text = str(text).lower()
    for keyword, standard_name in zone_mapping.items():
        if keyword in text:
            return standard_name
    return "Other" # If it doesn't match our main zones, mark as Other

# 3. Apply the Standardization
print("Standardizing locations...")
df['Standard_Location'] = df['cleaned_location'].apply(standardize_location)

# Remove "Other"
df = df[df['Standard_Location'] != "Other"]

print(f"Reduced to {df['Standard_Location'].nunique()} clean zones.")
print(df['Standard_Location'].value_counts())

# 4. Geocode ONLY the Standard Zones
unique_zones = df['Standard_Location'].unique()
geolocator = Nominatim(user_agent="lagos_house_project_fix_v2")

zone_coords = {}
lekki_toll = (6.4490, 3.4623)
ikeja_mall = (6.6018, 3.3515)

print("\nGeocoding the ~20 major zones...")

for zone in unique_zones:
    try:
        location = geolocator.geocode(f"{zone}, Lagos, Nigeria", timeout=10)
        if location:
            coords = (location.latitude, location.longitude)
            zone_coords[zone] = {
                'lat': location.latitude,
                'lon': location.longitude,
                'dist_to_lekki': geodesic(coords, lekki_toll).km,
                'dist_to_ikeja': geodesic(coords, ikeja_mall).km
            }
            print(f"{zone}: Found")
        else:
            print(f"{zone}: Not Found")
            # If not found, use a default or drop
            zone_coords[zone] = {'lat': None, 'lon': None, 'dist_to_lekki': None, 'dist_to_ikeja': None}
            
        time.sleep(1) 
    except Exception as e:
        print(f"Error {zone}: {e}")

# 5. Map back to DataFrame
df['lat'] = df['Standard_Location'].map(lambda x: zone_coords.get(x, {}).get('lat'))
df['lon'] = df['Standard_Location'].map(lambda x: zone_coords.get(x, {}).get('lon'))
df['dist_to_lekki'] = df['Standard_Location'].map(lambda x: zone_coords.get(x, {}).get('dist_to_lekki'))
df['dist_to_ikeja'] = df['Standard_Location'].map(lambda x: zone_coords.get(x, {}).get('dist_to_ikeja'))

# Drop failed geocodes
df = df.dropna(subset=['dist_to_lekki'])


In [16]:
df.head()

Unnamed: 0,Title,Price,Bedrooms,Bathrooms,cleaned_price,cleaned_location,Full_Location,Standard_Location,lat,lon,dist_to_lekki,dist_to_ikeja
0,2 Bedroom Apartment,"₦ 250,000,000",2,0,250000000.0,Lekki Phase 1 Lekki,Lekki Phase 1 Lekki Lagos,Lekki,6.471125,3.81475,39.063833,53.234785
1,"Promo Sales On Landview City Eleko, Lagos","₦ 45,000,000",0,0,45000000.0,"Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...","Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...",Lekki,6.471125,3.81475,39.063833,53.234785
2,5 Bedroom Detached Duplex With Bq,"₦ 680,000,000",5,5,680000000.0,2nd Toll Gate Lekki,2nd Toll Gate Lekki Lagos,Lekki,6.471125,3.81475,39.063833,53.234785
3,5 Bedroom Detached Duplex,"₦ 600,000,000",5,5,600000000.0,"Lekki County, Megamound Estate Ikota Lekki","Lekki County, Megamound Estate Ikota Lekki Lagos",Lekki,6.471125,3.81475,39.063833,53.234785
4,5 Bedroom Detached Duplex With Bq,"₦ 300,000,000",5,5,300000000.0,Ajah,Ajah Lagos,Ajah,6.469472,3.562386,11.300377,27.534919


In [17]:
df["Location"] = df["Standard_Location"]
df.head()

Unnamed: 0,Title,Price,Bedrooms,Bathrooms,cleaned_price,cleaned_location,Full_Location,Standard_Location,lat,lon,dist_to_lekki,dist_to_ikeja,Location
0,2 Bedroom Apartment,"₦ 250,000,000",2,0,250000000.0,Lekki Phase 1 Lekki,Lekki Phase 1 Lekki Lagos,Lekki,6.471125,3.81475,39.063833,53.234785,Lekki
1,"Promo Sales On Landview City Eleko, Lagos","₦ 45,000,000",0,0,45000000.0,"Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...","Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...",Lekki,6.471125,3.81475,39.063833,53.234785,Lekki
2,5 Bedroom Detached Duplex With Bq,"₦ 680,000,000",5,5,680000000.0,2nd Toll Gate Lekki,2nd Toll Gate Lekki Lagos,Lekki,6.471125,3.81475,39.063833,53.234785,Lekki
3,5 Bedroom Detached Duplex,"₦ 600,000,000",5,5,600000000.0,"Lekki County, Megamound Estate Ikota Lekki","Lekki County, Megamound Estate Ikota Lekki Lagos",Lekki,6.471125,3.81475,39.063833,53.234785,Lekki
4,5 Bedroom Detached Duplex With Bq,"₦ 300,000,000",5,5,300000000.0,Ajah,Ajah Lagos,Ajah,6.469472,3.562386,11.300377,27.534919,Ajah


In [18]:
df = df.drop(columns=['cleaned_location', 'Standard_Location', 'Price'])
df.head()

Unnamed: 0,Title,Bedrooms,Bathrooms,cleaned_price,Full_Location,lat,lon,dist_to_lekki,dist_to_ikeja,Location
0,2 Bedroom Apartment,2,0,250000000.0,Lekki Phase 1 Lekki Lagos,6.471125,3.81475,39.063833,53.234785,Lekki
1,"Promo Sales On Landview City Eleko, Lagos",0,0,45000000.0,"Amen Estate 1 & 2, Jenifa Gardens Eleko Ibeju ...",6.471125,3.81475,39.063833,53.234785,Lekki
2,5 Bedroom Detached Duplex With Bq,5,5,680000000.0,2nd Toll Gate Lekki Lagos,6.471125,3.81475,39.063833,53.234785,Lekki
3,5 Bedroom Detached Duplex,5,5,600000000.0,"Lekki County, Megamound Estate Ikota Lekki Lagos",6.471125,3.81475,39.063833,53.234785,Lekki
4,5 Bedroom Detached Duplex With Bq,5,5,300000000.0,Ajah Lagos,6.469472,3.562386,11.300377,27.534919,Ajah


In [None]:

# 1. Define the Island Locations (The "High Value" Zone)
island_neighborhoods = [
    'Ikoyi', 'Victoria Island', 'Lekki', 'Ajah', 'Banana Island', 
    'VGC', 'Chevron Drive', 'Sangotedo', 'Badore', 'Ibeju Lekki', 
    'Epe', 'Oniru'
]

# 2. Create the function
def categorize_zone(location):
    # Check if the location string matches any island neighborhood
    if location in island_neighborhoods:
        return 1 # 1 = Island
    else:
        return 0 # 0 = Mainland

# 4. Apply it
df['Is_Island'] = df['Location'].apply(categorize_zone)



Average Price on Mainland vs Island:
Is_Island
0    ₦685,906,641
1    ₦603,934,041
Name: cleaned_price, dtype: object


In [20]:

# --- INTERNAL FEATURES ---

# A. Property Type Extraction
def get_property_type(title):
    title = str(title).lower()
    if 'land' in title: return 'Land'
    if 'detached' in title: return 'Detached Duplex'
    if 'semi' in title and 'detached' in title: return 'Semi-Detached'
    if 'terrace' in title: return 'Terrace'
    if 'flat' in title or 'apartment' in title: return 'Flat'
    if 'bungalow' in title: return 'Bungalow'
    if 'duplex' in title: return 'Duplex' 
    return 'Other'

df['Prop_Type'] = df['Title'].apply(get_property_type)

# B. Luxury Flags (Binary: 1 or 0)
# We look for keywords that scream "Money"
df['Has_Pool'] = df['Title'].str.contains('pool|swimming', case=False, regex=True).astype(int)
df['Has_BQ'] = df['Title'].str.contains('bq|boys quarter', case=False, regex=True).astype(int)
df['New_Build'] = df['Title'].str.contains('newly built|brand new', case=False, regex=True).astype(int)

# C. Convenience Stats
# Avoid division by zero
df['Bathroom_per_Bedroom'] = df['Bathrooms'] / df['Bedrooms'].replace(0, 1)


# --- EXTERNAL FEATURES ---

# D. Estate Status (Security Premium)
df['In_Estate'] = df['Title'].str.contains('Estate', case=False, regex=True).astype(int)

# E. Serviced Apartment (Power Premium)
df['Is_Serviced'] = df['Title'].str.contains('serviced', case=False, regex=True).astype(int)

# --- CLEANUP ---
df = df[df['Prop_Type'] != 'Land']


In [21]:
# Look inside the 'Title'
# We also look for specific estate keywords like "Gated", "Secure", "Estate"
estate_keywords = 'estate|gated|secure|serviced|vgc|chevron|banana'
df['In_Estate'] = df['Title'].str.contains(estate_keywords, case=False, regex=True).astype(int)

# --- NEW: NEIGHBORHOOD TIERS---
# Calculate the average price per location
loc_prices = df.groupby('Location')['cleaned_price'].median().sort_values()

# Create 5 Tiers based on price quantiles
def get_tier(price):
    if price < 40_000_000: return 1  # Budget
    if price < 80_000_000: return 2  # Mid-Range
    if price < 150_000_000: return 3 # Premium Mainland / Outer Island
    if price < 400_000_000: return 4 # High-End Island
    return 5                         # Ultra Luxury (Ikoyi/Banana)

# Map the median price of the neighborhood to a Tier
loc_tier_map = loc_prices.apply(get_tier).to_dict()
df['Neighborhood_Tier'] = df['Location'].map(loc_tier_map)

# --- NEW: INTERACTION FEATURES ---
df['Island_x_Beds'] = df['Is_Island'] * df['Bedrooms']

# "Luxury Score" - Combining Pool + Estate + New Build
df['Luxury_Score'] = df['Has_Pool'] + df['In_Estate'] + df['New_Build']

# --- FINAL CLEANUP ---
cols_to_drop = ['Standard_Location', 'Raw_Features', 'Is_Serviced']
# Only drop if they exist
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

# Save the "Gold Standard" dataset
df.to_csv("lagos_houses_prices.csv", index=False)


In [22]:
df['Luxury_Score'].value_counts()

Luxury_Score
0    3001
1     637
2      19
Name: count, dtype: int64