# Advanced Spatial Econometrics and Geospatial Modeling Project
This notebook contains all components of the project combined into a single workflow.

## data_loader.py

In [19]:
import pandas as pd
import numpy as np

def convert_sqft(x):
    """Convert a raw ``total_sqft`` cell to a float.

    Accepted forms:
      * a plain number, given as text or as an int/float
        (``"1200"``, ``1200``, ``1200.0``);
      * a range written ``"low - high"``, replaced by its midpoint.

    Returns ``None`` for anything that cannot be parsed (unit-suffixed text,
    missing values, NaN, non-string objects) so the caller can drop the row.
    """
    # Numeric cells show up when pandas infers a numeric dtype for the column;
    # bool is excluded because it is an int subclass but never an area.
    if isinstance(x, (int, float)) and not isinstance(x, bool):
        # NaN is the only float that is not equal to itself.
        return None if x != x else float(x)
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    try:
        if len(tokens) == 2:
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Only a failed float() conversion is expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

def load_and_clean_property_data(filepath):
    """Read the property CSV at ``filepath`` and return a cleaned DataFrame.

    Steps performed:
      1. ``total_sqft`` is parsed with ``convert_sqft`` (unparseable -> missing).
      2. ``bhk`` is the leading integer of the ``size`` text (first
         space-separated token); unparseable or non-text values -> missing.
      3. ``price_per_sqft`` is ``price * 100000 / total_sqft``
         (NOTE(review): the 100000 factor presumably converts a price quoted
         in lakhs to rupees; confirm against the dataset).
      4. Rows missing any of ``total_sqft``, ``bhk``, ``bath``, ``price`` or
         ``location`` are dropped, as are rows with ``total_sqft >= 10000``.
      5. ``location`` is stripped of surrounding whitespace.

    Returns an independent frame (not a view of the filtered intermediate),
    so later column assignments do not trigger SettingWithCopyWarning.
    """
    def _parse_bhk(size):
        # A single malformed cell becomes missing instead of aborting the load.
        if not isinstance(size, str):
            return None
        try:
            return int(size.split(' ')[0])
        except ValueError:
            return None

    df = pd.read_csv(filepath)
    df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
    df['bhk'] = df['size'].apply(_parse_bhk)
    df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
    df = df.dropna(subset=['total_sqft', 'bhk', 'bath', 'price', 'location'])
    # .copy() detaches the result from the filtered slice before we write to it.
    df = df[df['total_sqft'] < 10000].copy()
    df['location'] = df['location'].astype(str).str.strip()
    return df

def load_metro_stations(green_path, purple_path):
    """Load both metro-line CSVs and return their stations as coordinate pairs.

    The two files are read with pandas and stacked (green rows first, then
    purple). Each returned tuple is ``(Y, X)`` taken from the columns of the
    same names, in file order.
    """
    line_frames = [pd.read_csv(path) for path in (green_path, purple_path)]
    all_stations = pd.concat(line_frames, ignore_index=True)
    y_values = all_stations['Y']
    x_values = all_stations['X']
    return [(y, x) for y, x in zip(y_values, x_values)]


## geospatial.py

In [20]:
import pandas as pd
import numpy as np
import os
from math import radians, cos, sin, asin, sqrt
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

# CSV file (relative to the working directory) where resolved coordinates
# are persisted between runs.
CACHE_FILE = 'loc_coords_cache.csv'

# --- MANUAL DATABASE OF FREQUENT LOCATIONS (42 entries) ---
# Hard-coded (latitude, longitude) pairs so the project still works when the
# geocoding API blocks us. Keys must match the dataset's `location` text
# exactly after stripping.
# NOTE(review): 'Sarjapur  Road' contains two spaces, presumably to match the
# raw dataset spelling; verify before "fixing" it.
KNOWN_LOCATIONS = {
    'Whitefield': (12.9698, 77.7500),
    'Sarjapur  Road': (12.9166, 77.6736),
    'Electronic City': (12.8452, 77.6602),
    'Kanakpura Road': (12.8654, 77.5336),
    'Thanisandra': (13.0547, 77.6339),
    'Yelahanka': (13.1007, 77.5963),
    'Uttarahalli': (12.9055, 77.5512),
    'Hebbal': (13.0359, 77.5970),
    'Marathahalli': (12.9591, 77.6974),
    'Raja Rajeshwari Nagar': (12.9274, 77.5155),
    'Bannerghatta Road': (12.8766, 77.5990),
    'Hennur Road': (13.0603, 77.6423),
    '7th Phase JP Nagar': (12.9024, 77.5802),
    'Haralur Road': (12.9081, 77.6476),
    'Electronic City Phase II': (12.8469, 77.6773),
    'Rajaji Nagar': (12.9982, 77.5530),
    'Chandapura': (12.7968, 77.6944),
    'Bellandur': (12.9304, 77.6684),
    'Hoodi': (12.9920, 77.7130),
    'KR Puram': (13.0075, 77.6959),
    'Yeshwanthpur': (13.0238, 77.5529),
    'Kothanur': (13.0559, 77.6322),
    'Koramangala': (12.9352, 77.6245),
    'Indira Nagar': (12.9784, 77.6408),
    'Malleshwaram': (13.0031, 77.5643),
    'Jayanagar': (12.9308, 77.5802),
    'Banashankari': (12.9155, 77.5736),
    'JP Nagar': (12.9076, 77.5736),
    'Peenya': (13.0285, 77.5460),
    'Ulsoor': (12.9733, 77.6204),
    'BTM Layout': (12.9166, 77.6101),
    'Ramamurthy Nagar': (13.0165, 77.6777),
    'Basaveshwara Nagar': (12.9847, 77.5491),
    'Chamarajpet': (12.9569, 77.5635),
    'Varthur': (12.9389, 77.7412),
    'Madiwala': (12.9226, 77.6174),
    'Basavanagudi': (12.9421, 77.5658),
    'RT Nagar': (13.0108, 77.5763),
    'Domlur': (12.9606, 77.6416),
    'Frazer Town': (12.9990, 77.6124),
    'HSR Layout': (12.9121, 77.6446),
    'Banaswadi': (13.0104, 77.6482),
}

def _load_coord_cache(cache_path):
    """Return ``{location: (lat, lon)}`` read from ``cache_path``, or ``{}``.

    The cache is best-effort: a missing, unreadable or malformed file is
    reported and ignored rather than aborting the run.
    """
    if not os.path.exists(cache_path):
        return {}
    try:
        coords_df = pd.read_csv(cache_path, index_col=0)
        return {loc: (lat, lon) for loc, lat, lon in coords_df.itertuples()}
    except (OSError, ValueError) as exc:
        # ValueError covers pandas parser/empty-file errors and a cache whose
        # column count does not unpack into (index, lat, lon).
        print(f"[Geo] Ignoring unreadable cache '{cache_path}': {exc}")
        return {}


def _geocode_missing(missing_locs, loc_coords_dict, max_errors=3):
    """Geocode ``missing_locs`` into ``loc_coords_dict`` (updated in place).

    Stops after ``max_errors`` consecutive failed requests so an unreachable
    API cannot hang the run. Locations the service answers for but cannot
    match are stored as ``(None, None)``.
    """
    # Increased timeout significantly
    geolocator = Nominatim(user_agent="bengaluru_student_project_final", timeout=10)
    # swallow_exceptions=False: by default RateLimiter converts service errors
    # into a None result, which would make the error handling below (and the
    # consecutive-error cut-off) unreachable.
    geocode = RateLimiter(
        geolocator.geocode,
        min_delay_seconds=2.0,
        swallow_exceptions=False,
    )

    error_count = 0
    for loc in missing_locs:
        if error_count >= max_errors:
            print("!!! Too many connection errors. Stopping live fetch to ensure completion.")
            break

        try:
            location = geocode(f"{loc}, Bengaluru")
        except (GeocoderTimedOut, GeocoderUnavailable):
            print(f"   -> Network Error on '{loc}'. Skipping.")
            error_count += 1
            continue
        except Exception as e:
            print(f"   -> Error: {e}")
            error_count += 1
            continue

        # Any answered request, match or not, proves the service is
        # reachable, so the consecutive-error streak ends here.
        error_count = 0
        if location:
            loc_coords_dict[loc] = (location.latitude, location.longitude)
        else:
            loc_coords_dict[loc] = (None, None)


def map_locality_coords(df):
    """
    Attach ``lat``/``lon`` columns to the properties and drop unmapped rows.

    Coordinates are resolved per distinct ``location`` value in this order:
    the manual ``KNOWN_LOCATIONS`` table, then the on-disk cache
    (``CACHE_FILE``, which overrides the manual table), then a live Nominatim
    lookup of "<location>, Bengaluru" for whatever is still missing. The live
    lookup stops gracefully after repeated errors. All resolved coordinates
    are written back to ``CACHE_FILE``.

    Parameters:
        df: DataFrame with a ``location`` column. It is not modified.

    Returns:
        A new DataFrame with ``lat`` and ``lon`` added, containing only the
        rows whose location could be resolved.
    """
    unique_locs = df['location'].unique()

    # 1. Start with our Manual Database
    loc_coords_dict = KNOWN_LOCATIONS.copy()

    # 2. Overlay the cache if one exists
    loc_coords_dict.update(_load_coord_cache(CACHE_FILE))

    # 3. Identify missing locations (non-text values such as NaN cannot be
    #    geocoded meaningfully and are simply left unmapped)
    missing_locs = [
        loc for loc in unique_locs
        if isinstance(loc, str) and loc not in loc_coords_dict
    ]

    # 4. Attempt to fetch others (WITH FAIL-SAFE)
    if missing_locs:
        print(f"[Geo] Processing {len(missing_locs)} remaining locations...")
        print("[Geo] NOTE: If API fails, we will proceed with known locations only.")
        _geocode_missing(missing_locs, loc_coords_dict)

    # 5. Save what we have (failed lookups are not cached, so they are retried
    #    on the next run)
    clean_dict = {k: v for k, v in loc_coords_dict.items() if v[0] is not None}
    try:
        pd.DataFrame.from_dict(clean_dict, orient='index', columns=['lat', 'lon']).to_csv(CACHE_FILE)
    except OSError as exc:
        # The cache is an optimisation; failing to write it must not lose
        # the coordinates already resolved in memory.
        print(f"[Geo] Could not write cache '{CACHE_FILE}': {exc}")

    # 6. Map and Drop Missing. Work on a copy so the caller's frame (often a
    #    filtered slice) is neither mutated nor the source of a
    #    SettingWithCopyWarning.
    df = df.copy()
    df['lat'] = df['location'].map(lambda x: loc_coords_dict.get(x, (None, None))[0])
    df['lon'] = df['location'].map(lambda x: loc_coords_dict.get(x, (None, None))[1])

    original_len = len(df)
    df = df.dropna(subset=['lat', 'lon'])
    print(f"[Geo] Mapped {len(df)} properties successfully (Dropped {original_len - len(df)} unknown).")

    return df

def calculate_metro_distance(df, station_coords):
    """Add ``dist_to_metro``: great-circle distance to the nearest station.

    Distances use the haversine formula with an Earth radius of 6371, so the
    result is in kilometres.

    Parameters:
        df: DataFrame with ``lat`` and ``lon`` columns in degrees. The new
            column is written into this frame.
        station_coords: iterable of ``(lat, lon)`` pairs in degrees.

    Returns:
        The same ``df`` with the ``dist_to_metro`` column set. A row gets
        ``inf`` when no distance can be computed for it (no stations given,
        or NaN coordinates).

    The computation builds a (rows x stations) array, so memory grows with
    the product of the two sizes.
    """
    print("[Geo] Calculating distances to metro stations...")
    earth_radius_km = 6371

    stations = np.asarray(list(station_coords), dtype=float).reshape(-1, 2)
    station_lat = np.radians(stations[:, 0])[None, :]
    station_lon = np.radians(stations[:, 1])[None, :]
    prop_lat = np.radians(df['lat'].to_numpy(dtype=float))[:, None]
    prop_lon = np.radians(df['lon'].to_numpy(dtype=float))[:, None]

    half_dlat = (station_lat - prop_lat) / 2
    half_dlon = (station_lon - prop_lon) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(prop_lat) * np.cos(station_lat) * np.sin(half_dlon) ** 2
    # Round-off can push `a` marginally outside [0, 1], which would make
    # arcsin/sqrt return NaN (or raise in the scalar math version).
    a = np.clip(a, 0.0, 1.0)
    distances = 2 * earth_radius_km * np.arcsin(np.sqrt(a))

    # fmin skips NaN entries and `initial=inf` supplies the value for rows
    # with nothing to compare against, matching a "start at inf, keep the
    # smaller" scan.
    df['dist_to_metro'] = np.fmin.reduce(distances, axis=1, initial=np.inf)
    return df

def generate_spatial_clusters(df, n_clusters=5):
    """Label every row with a K-means cluster computed from ``lat``/``lon``.

    A ``KMeans`` model with ``n_clusters`` clusters and a fixed seed (42) is
    fitted on the two coordinate columns; the resulting labels are written to
    a new ``geo_cluster`` column of ``df``, which is then returned.
    """
    print(f"[Geo] Generating {n_clusters} spatial clusters...")
    coordinates = df[['lat', 'lon']]
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(coordinates)
    df['geo_cluster'] = cluster_labels
    return df

## models.py

In [21]:
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

def run_ols_regression(df):
    """Fit and print an OLS model of log price on distance and size features.

    Two columns are added to ``df`` in place: ``dist_x_sqft`` (the product of
    ``dist_to_metro`` and ``total_sqft``) and ``log_price`` (natural log of
    ``price``). The regressors are ``dist_to_metro``, ``total_sqft``, ``bhk``,
    ``bath`` and the interaction term, plus a constant.

    Returns the fitted statsmodels results object.
    """
    df['dist_x_sqft'] = df['dist_to_metro'] * df['total_sqft']
    df['log_price'] = np.log(df['price'])

    regressor_names = ['dist_to_metro', 'total_sqft', 'bhk', 'bath', 'dist_x_sqft']
    design_matrix = sm.add_constant(df[regressor_names])
    target = df['log_price']

    fitted = sm.OLS(target, design_matrix).fit()
    print(fitted.summary())
    return fitted

def train_gbm(df):
    """Train a gradient-boosting regressor on log price and report test R2.

    ``log_price`` (natural log of ``price``) is written to ``df``. The data is
    split 80/20 with a fixed seed, the model is fitted on the training part
    and scored on the held-out part.

    Returns ``(gbm, X_train, X_test, y_train, y_test, y_pred)`` where
    ``y_pred`` holds the predictions for ``X_test``.
    """
    df['log_price'] = np.log(df['price'])

    feature_names = ['dist_to_metro', 'total_sqft', 'bhk', 'bath', 'lat', 'lon']
    features = df[feature_names]
    target = df['log_price']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    gbm = GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        random_state=42,
    )
    gbm.fit(X_train, y_train)

    y_pred = gbm.predict(X_test)
    print("GBM R2:", r2_score(y_test, y_pred))
    return gbm, X_train, X_test, y_train, y_test, y_pred


## visualizer.py

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.inspection import PartialDependenceDisplay

def plot_bid_rent_curve(df):
    """Save a scatter of price per sqft against metro distance with a fit line.

    The fit drawn by seaborn is a second-order polynomial (``order=2``) shown
    in red over semi-transparent points. The figure is written to
    ``bid_rent_curve.png`` and then closed.
    """
    plt.figure(figsize=(10, 6))
    sns.regplot(
        data=df,
        x='dist_to_metro',
        y='price_per_sqft',
        order=2,
        scatter_kws={'alpha': 0.3},
        line_kws={'color': 'red'},
    )
    plt.savefig('bid_rent_curve.png')
    plt.close()

def plot_pdp(model, X_train):
    """Save the partial-dependence plot of ``dist_to_metro`` for ``model``.

    The averaged partial dependence is evaluated on ``X_train`` over a
    100-point grid, drawn on a 10x6 axes, written to ``pdp_distance.png`` and
    the figure is closed.
    """
    _figure, axes = plt.subplots(figsize=(10, 6))
    display_options = {
        'ax': axes,
        'kind': 'average',
        'subsample': 1000,
        'grid_resolution': 100,
    }
    PartialDependenceDisplay.from_estimator(
        model, X_train, ['dist_to_metro'], **display_options
    )
    plt.savefig('pdp_distance.png')
    plt.close()

def plot_spatial_residuals(df, y_test, y_pred):
    """Save a map-style scatter of prediction errors for the test rows.

    Rows of ``df`` are selected by the index of ``y_test``. ``y_test`` and
    ``y_pred`` are exponentiated before differencing, so the plotted error is
    ``exp(y_test) - exp(y_pred)``. Points are placed at (``lon``, ``lat``) and
    coloured by that error; the figure goes to ``spatial_residuals.png``.
    """
    held_out = df.loc[y_test.index].copy()
    held_out['pred'] = np.exp(y_pred)
    held_out['actual'] = np.exp(y_test)
    held_out['err'] = held_out['actual'] - held_out['pred']

    plt.figure(figsize=(10, 8))
    points = plt.scatter(
        held_out['lon'],
        held_out['lat'],
        c=held_out['err'],
        cmap='coolwarm',
        s=80,
    )
    plt.colorbar(points)
    plt.savefig('spatial_residuals.png')
    plt.close()


## spatial_econometrics.py

In [23]:
import numpy as np
import pandas as pd

def build_knn_weights(df, k=8):
    """Build k-nearest-neighbour spatial weights from ``lat``/``lon``.

    The coordinate columns are passed to libpysal's ``KNN.from_array`` and the
    weights' ``transform`` is set to ``'r'``.

    Returns ``(w, coords)``: the weights object and the coordinate array it
    was built from.
    """
    from libpysal.weights import KNN

    coords = df[['lat', 'lon']].to_numpy()
    weights = KNN.from_array(coords, k=k)
    weights.transform = 'r'
    return weights, coords

def morans_i(df, w, variable='price_per_sqft'):
    """Compute Moran's I for one column of ``df`` under the weights ``w``.

    Prints the statistic (``I``) and its simulated p-value (``p_sim``) and
    returns the esda ``Moran`` result object.
    """
    from esda.moran import Moran

    observed = df[variable].values
    result = Moran(observed, w)
    print("Moran's I:", result.I, " p:", result.p_sim)
    return result

def add_spatial_lag_feature(df, w, var='log_price', new_col='W_log_price'):
    """Store the spatial lag of ``var`` in ``df[new_col]`` and return ``df``.

    The lag is the product of the sparse weights matrix ``w.sparse`` with the
    values of ``df[var]``, taken in row order.
    """
    values = df[var].to_numpy()
    df[new_col] = w.sparse.dot(values)
    return df

def run_sar_sem(df, feature_cols, dependent, w):
    """Fit and print maximum-likelihood spatial lag and spatial error models.

    ``feature_cols`` are the regressors and ``dependent`` the response; both
    are cast to float, and the response is reshaped to a column vector. The
    same weights ``w`` and variable names are passed to spreg's ``ML_Lag``
    and ``ML_Error``.

    Returns ``(sar, sem)``: the lag model and the error model.
    """
    from spreg import ML_Lag, ML_Error

    design = df[feature_cols].astype(float).values
    response = df[dependent].astype(float).values.reshape(-1, 1)
    shared_args = {'w': w, 'name_y': dependent, 'name_x': feature_cols}

    sar = ML_Lag(response, design, **shared_args)
    sem = ML_Error(response, design, **shared_args)

    print(sar.summary)
    print(sem.summary)
    return sar, sem


## main.py

In [24]:
# MAIN WORKFLOW
# End-to-end run: load data, geocode, engineer spatial features, filter
# outliers, then fit OLS, GBM and spatial (SAR/SEM) models and save plots.
import pandas as pd
import numpy as np

df = load_and_clean_property_data('Bengaluru_House_Data.csv')
stations = load_metro_stations('green.csv','purple.csv')

df = map_locality_coords(df)
df = calculate_metro_distance(df, stations)
df = generate_spatial_clusters(df,5)

# Outlier filters, applied together:
#   - unrealistic prices per sqft
#   - impossible metro distances (metro within city limits)
#   - sqft errors (below the typical size for apartments)
plausible_rows = (
    (df['price_per_sqft'] < 20000)
    & (df['dist_to_metro'] < 30)
    & (df['total_sqft'] > 200)
)
# .copy() gives the modelling functions below an independent frame to add
# columns to, instead of a slice (avoids SettingWithCopyWarning).
df = df[plausible_rows].copy()


# OLS
ols_model = run_ols_regression(df)

# GBM
gbm,X_train,X_test,y_train,y_test,y_pred = train_gbm(df)

# Visuals
plot_bid_rent_curve(df)
plot_pdp(gbm,X_train)
plot_spatial_residuals(df,y_test,y_pred)

# Spatial Econometrics
print("[Stats] Calculating Spatial Weights...")
# Recomputed here so this section does not depend on the model functions
# above having added the column.
df['log_price'] = np.log(df['price'])

# Build Weights
w, coords = build_knn_weights(df, k=8)

# Moran's I on log_price, the same response the regressions use
moran = morans_i(df, w, variable='log_price')

df = add_spatial_lag_feature(df, w, var='log_price', new_col='W_log_price')

# 'W_log_price' is deliberately left out of the regressors: it is the lag of
# the dependent variable itself.
print("[Stats] Running Spatial Regression...")
sar, sem = run_sar_sem(df,
                       feature_cols=['dist_to_metro', 'total_sqft', 'bhk', 'bath'],
                       dependent='log_price',
                       w=w)

print("DONE")

[Geo] Processing 354 remaining locations...
[Geo] NOTE: If API fails, we will proceed with known locations only.
[Geo] Mapped 11932 properties successfully (Dropped 1241 unknown).
[Geo] Calculating distances to metro stations...
[Geo] Generating 5 spatial clusters...
                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.662
Model:                            OLS   Adj. R-squared:                  0.662
Method:                 Least Squares   F-statistic:                     4571.
Date:                Tue, 18 Nov 2025   Prob (F-statistic):               0.00
Time:                        01:44:40   Log-Likelihood:                -5590.1
No. Observations:               11656   AIC:                         1.119e+04
Df Residuals:                   11650   BIC:                         1.124e+04
Df Model:                           5                                         
Covariance Type:     

 There are 186 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


Moran's I: 0.2913779902303134  p: 0.001
[Stats] Running Spatial Regression...


  res = minimize_scalar(


REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG (METHOD = FULL)
-----------------------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :   log_price                Number of Observations:       11656
Mean dependent var  :      4.3552                Number of Variables   :           6
S.D. dependent var  :      0.6727                Degrees of Freedom    :       11650
Pseudo R-squared    :      0.7258
Spatial Pseudo R-squared:  0.6728
Log likelihood      :  -4431.7536
Sigma-square ML     :      0.1241                Akaike info criterion :    8875.507
S.E of regression   :      0.3523                Schwarz criterion     :    8919.689

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
---------------------------------------------------------------