In [40]:
import pandas as pd
import numpy as np
import math
import re
import json
import time
import os
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import seaborn as sns  # Assuming user has seaborn; if not, remove sns.set_style
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from keras.models import Model
from keras.layers import Input, Dense, LeakyReLU
from keras.callbacks import EarlyStopping


import math
import numpy as np

def vincenty(lat1, lon1, lat2, lon2, max_iterations=100, tolerance=1e-12):
    """
    Calculate the geodesic distance between two points on the Earth using Vincenty's
    inverse formula on the WGS-84 ellipsoid (in km).

    Parameters:
        lat1, lon1: Latitude and longitude of the first point (in degrees).
        lat2, lon2: Latitude and longitude of the second point (in degrees).
        max_iterations: Maximum number of iterations of the lambda refinement loop.
        tolerance: The loop stops once lambda changes by less than this (radians).
    Returns:
        Distance in kilometers; 0.0 when the two points coincide.
    Raises:
        ValueError: if the iteration does not converge within max_iterations
            (this happens for nearly antipodal points).
    """
    # WGS-84 ellipsoid parameters
    a = 6378137.0  # semi-major axis in meters
    f = 1 / 298.257223563  # flattening
    b = a * (1 - f)  # semi-minor axis

    # Convert degrees to radians
    phi1 = math.radians(lat1)
    lambda1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lambda2 = math.radians(lon2)

    # Reduced latitudes
    U1 = math.atan((1 - f) * math.tan(phi1))
    U2 = math.atan((1 - f) * math.tan(phi2))

    L = lambda2 - lambda1  # longitude difference
    lam = L  # initial approximation of lambda

    sinU1 = math.sin(U1)
    cosU1 = math.cos(U1)
    sinU2 = math.sin(U2)
    cosU2 = math.cos(U2)

    for _ in range(max_iterations):
        sin_lam = math.sin(lam)
        cos_lam = math.cos(lam)

        sin_sigma = math.sqrt((cosU2 * sin_lam)**2 + (cosU1 * sinU2 - sinU1 * cosU2 * cos_lam)**2)
        if sin_sigma == 0:
            return 0.0  # coincident points

        cos_sigma = sinU1 * sinU2 + cosU1 * cosU2 * cos_lam
        sigma = math.atan2(sin_sigma, cos_sigma)

        sin_alpha = (cosU1 * cosU2 * sin_lam) / sin_sigma
        cos_sq_alpha = 1 - sin_alpha**2

        if cos_sq_alpha == 0:
            # Both points lie on the equator: the geodesic is the equatorial line and
            # the 2*sigma_m term vanishes. (Returning early here would report half the
            # Earth's circumference for every pair of equatorial points.)
            cos_2sigma_m = 0.0
        else:
            cos_2sigma_m = cos_sigma - (2 * sinU1 * sinU2) / cos_sq_alpha

        C = f / 16 * cos_sq_alpha * (4 + f * (4 - 3 * cos_sq_alpha))

        lam_prev = lam
        lam = L + (1 - C) * f * sin_alpha * (sigma + C * sin_sigma * (cos_2sigma_m + C * cos_sigma * (-1 + 2 * cos_2sigma_m**2)))

        if abs(lam - lam_prev) < tolerance:
            break
    else:
        # for/else: reached only when the loop ran out of iterations without a break
        raise ValueError(f"Vincenty formula failed to converge after {max_iterations} iterations")

    u_sq = cos_sq_alpha * (a**2 - b**2) / (b**2)
    A = 1 + u_sq / 16384 * (4096 + u_sq * (-768 + u_sq * (320 - 175 * u_sq)))
    B = u_sq / 1024 * (256 + u_sq * (-128 + u_sq * (74 - 47 * u_sq)))

    delta_sigma = B * sin_sigma * (cos_2sigma_m + B / 4 * (cos_sigma * (-1 + 2 * cos_2sigma_m**2) - B / 6 * cos_2sigma_m * (-3 + 4 * sin_sigma**2) * (-3 + 4 * cos_2sigma_m**2)))

    s = b * A * (sigma - delta_sigma) / 1000  # distance in km

    return s

def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points given in degrees, in kilometers.

    Uses the haversine formula on a sphere of radius 6371 km. Inputs are passed
    through numpy functions, so scalars and arrays are both accepted.
    """
    earth_radius_km = 6371.0

    # Degrees -> radians for all four coordinates
    phi_a, lam_a, phi_b, lam_b = (np.radians(value) for value in (lat1, lon1, lat2, lon2))

    # Haversine term: squared half-chord length between the two points
    half_chord_sq = (
        np.sin((phi_b - phi_a) / 2)**2
        + np.cos(phi_a) * np.cos(phi_b) * np.sin((lam_b - lam_a) / 2)**2
    )

    # Central angle via atan2, then scale by the sphere radius
    central_angle = 2 * np.arctan2(np.sqrt(half_chord_sq), np.sqrt(1 - half_chord_sq))
    return earth_radius_km * central_angle

def get_or_create_geo_mapping(df):
    """
    Loads geocoding from a file or creates it by calling an API.

    If 'geo_mapping.json' exists in the working directory its parsed content is
    returned as-is and `df` is not touched. Otherwise every unique string in
    df['Location'] that contains a '-' is geocoded with Nominatim, and the result is
    written to 'geo_mapping.json' and returned.

    Parameters:
        df: DataFrame with a 'Location' column (only read when no cache file exists).
    Returns:
        dict mapping the original location string to its coordinates. Freshly
        geocoded values are (latitude, longitude) tuples; values reloaded from the
        JSON cache come back as two-element lists.
    """
    json_path = 'geo_mapping.json'
    if os.path.exists(json_path):
        print(f"✅ Found existing geocoding file at '{json_path}'. Loading...")
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    print(f"⚠️ No geocoding file found. Creating a new one using geopy...")
    print("This will take several minutes as it respects API rate limits.")

    geolocator = Nominatim(user_agent="property_matcher_app_v6")
    unique_locations = df['Location'].dropna().unique()
    geo_mapping = {}

    # The text before the first '-' selects the city; unknown prefixes fall back to Ahmedabad.
    city_prefix_map = {
        'A': 'Ahmedabad', 'P': 'Pune', 'G': 'Gandhinagar',
        'S': 'Surat', 'V': 'Vadodara', 'B': 'Vadodara'
    }

    for location_str in unique_locations:
        if not isinstance(location_str, str) or '-' not in location_str:
            continue

        prefix, area = (part.strip() for part in location_str.split('-', 1))
        city = city_prefix_map.get(prefix, 'Ahmedabad')

        query = f"{area}, {city}, India"
        try:
            location_data = geolocator.geocode(query, timeout=10)
            if location_data:
                geo_mapping[location_str] = (location_data.latitude, location_data.longitude)
                print(f"✅ Found: {query} -> ({location_data.latitude:.4f}, {location_data.longitude:.4f})")
            else:
                print(f"⚠️ Not Found: {query}")
        except Exception as e:
            # Best effort: a failed lookup is reported and the location is left unmapped.
            print(f"❌ Error for '{query}': {e}")
        finally:
            # Pause after every request, including failed ones, so that an error
            # (e.g. a timeout) is not followed immediately by the next API call.
            time.sleep(1)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(geo_mapping, f, indent=4)
    print(f"✅ Geocoding complete. Saved to '{json_path}'.")
    return geo_mapping

def run_data_pipeline(csv_path='PropertyData.csv', reference_date='2025-08-23'):
    """
    Loads, cleans, and preprocesses the property data, including geocoding.
    Uses latitude and longitude directly as numerical features for better geographic similarity.
    Adds a derived 'Price_Per_SqFt' feature.

    Parameters:
        csv_path: Path of the CSV file to load.
        reference_date: Date against which 'Service_Expiry_Date' is compared to mark
            a listing as 'Expired' (anything accepted by pd.to_datetime).
    Returns:
        (df, numerical_cols, categorical_cols, preprocessor) where `df` is the cleaned
        frame and `preprocessor` is an unfitted ColumnTransformer (MinMaxScaler on the
        numerical columns, OneHotEncoder on the categorical columns except
        'Calculated_Status'). Returns (None, None, None, None) when the CSV is missing.
    """
    try:
        property_df = pd.read_csv(csv_path, low_memory=False)
        print(f"✅ Successfully loaded {csv_path}")
    except FileNotFoundError:
        print(f"❌ Error: '{csv_path}' not found.")
        return None, None, None, None

    # Ensure necessary columns are included
    features_to_use = [
        'BHK', 'Property-Price', 'City1', 'Property-On-Floor', 'Property-Facing',
        'Age-Of-Property', 'Super-Built-up-Construction-Area', 'Carpet-Construction-Area',
        'Bathroom', 'Furniture-Details', 'Property-Status', 'Current-Status', 'Location', 'Parking-Details',
        'No-Of-Lift-Per-Block', 'Service_Expiry_Date', 'Tag'
    ]
    # Explicit copy: all column assignments below act on an independent frame.
    df = property_df[features_to_use].copy()
    print(f"✅ Selected {len(features_to_use)} features for modeling.")

    # --- Three-State Status Calculation ---
    print("\n🗓️ Calculating property status (Active/Expired/Sold)...")

    # Define what constitutes a 'Sold' property
    sold_statuses = ['Sold-CD', 'Sold-Others', 'Rented-CD']

    # Reference date for checking expiry
    current_date = pd.to_datetime(reference_date)

    # Convert Service_Expiry_Date to datetime objects (unparseable values become NaT)
    df['Service_Expiry_Date'] = pd.to_datetime(df['Service_Expiry_Date'], errors='coerce', dayfirst=True)

    # np.select takes the first matching condition, so 'Sold' wins over 'Expired';
    # rows matching neither (including rows with no expiry date) are 'Active'.
    conditions = [
        df['Property-Status'].isin(sold_statuses),
        df['Service_Expiry_Date'] < current_date
    ]
    choices = ['Sold', 'Expired']
    df['Calculated_Status'] = np.select(conditions, choices, default='Active')

    print(f"✅ Status calculation complete. Status distribution:")
    print(df['Calculated_Status'].value_counts())

    geo_mapping = get_or_create_geo_mapping(df)
    # Median coordinate of all mapped locations, used for locations missing from the mapping
    median_lat = np.median([val[0] for val in geo_mapping.values() if val and val[0] is not None])
    median_lon = np.median([val[1] for val in geo_mapping.values() if val and val[1] is not None])

    df[['Latitude', 'Longitude']] = df['Location'].apply(
        lambda x: pd.Series(geo_mapping.get(str(x), (median_lat, median_lon)))
    )

    print("\n⚙️ Starting data cleaning process...")
    def clean_price(price):
        """First number in the string; multiplied by 100 when the text contains 'cr'
        (presumably crore -> lakh — confirm against the data)."""
        if not isinstance(price, str): return np.nan
        price_str = price.lower()
        try:
            numbers = re.findall(r'[\d\.]+', price_str)
            if not numbers: return np.nan
            value = float(numbers[0])
            if 'cr' in price_str: return value * 100
            return value
        except (ValueError, IndexError): return np.nan

    def clean_area(area):
        """First number in the string; multiplied by 9 when the text contains 'yard'
        (presumably square yards -> square feet — confirm against the data)."""
        if not isinstance(area, str): return np.nan
        area_str = area.lower()
        try:
            numbers = re.findall(r'[\d\.]+', area_str)
            if not numbers: return np.nan
            value = float(numbers[0])
            if 'yard' in area_str: return value * 9
            return value
        except (ValueError, IndexError): return np.nan

    def clean_floor(floor):
        """First integer in the string after replacing every 'g' with '0'
        (so a leading 'G'/'Ground' reads as floor 0)."""
        if not isinstance(floor, str): return np.nan
        floor_str = floor.lower().replace('g', '0')
        try:
            numbers = re.findall(r'\d+', floor_str)
            if numbers: return int(numbers[0])
            return np.nan
        except (ValueError, IndexError): return np.nan

    def clean_age(age):
        """0 for 'new'/'under...' values, otherwise the mean of all integers in the string."""
        if not isinstance(age, str): return np.nan
        age_str = age.lower()
        if 'new' in age_str or 'under' in age_str: return 0
        try:
            numbers = [int(s) for s in re.findall(r'\d+', age_str)]
            if numbers: return sum(numbers) / len(numbers)
            return np.nan
        except (ValueError, IndexError): return np.nan

    # A bare '-' is the dataset's placeholder for a missing value
    df = df.replace('-', np.nan)
    df['Property-Price'] = df['Property-Price'].apply(clean_price)
    df['Super-Built-up-Construction-Area'] = df['Super-Built-up-Construction-Area'].apply(clean_area)
    df['Carpet-Construction-Area'] = df['Carpet-Construction-Area'].apply(clean_area)
    df['Property-On-Floor'] = df['Property-On-Floor'].apply(clean_floor)
    df['Age-Of-Property'] = df['Age-Of-Property'].apply(clean_age)
    df['BHK'] = pd.to_numeric(df['BHK'].astype(str).str.extract(r'(\d+)', expand=False), errors='coerce')
    df['Bathroom'] = pd.to_numeric(df['Bathroom'], errors='coerce')
    df['No-Of-Lift-Per-Block'] = pd.to_numeric(df['No-Of-Lift-Per-Block'], errors='coerce')
    # Bathroom counts above 20 are treated as data-entry errors
    df.loc[df['Bathroom'] > 20, 'Bathroom'] = np.nan

    # Derived feature: price divided by carpet area. Areas below 1 are clipped up to 1
    # so the division never hits zero; rows with a missing price or area stay NaN here
    # and receive the column median in the imputation loop below.
    df['Price_Per_SqFt'] = df['Property-Price'] / df['Carpet-Construction-Area'].clip(lower=1)

    numerical_cols = ['Property-Price', 'Super-Built-up-Construction-Area', 'Carpet-Construction-Area',
                      'Property-On-Floor', 'Age-Of-Property', 'BHK', 'Bathroom', 'No-Of-Lift-Per-Block',
                      'Latitude', 'Longitude', 'Price_Per_SqFt']
    for col in numerical_cols:
        # Assign the result back instead of a chained in-place fillna, which is
        # deprecated and has no effect under pandas Copy-on-Write.
        df[col] = df[col].fillna(df[col].median())

    categorical_cols = ['City1', 'Property-Facing', 'Furniture-Details', 'Property-Status',
                        'Current-Status', 'Parking-Details', 'Calculated_Status']
    for col in categorical_cols:
        # Record which rows are missing BEFORE casting to str: the cast turns NaN into
        # the literal text 'nan', after which fillna can no longer find anything to fill.
        missing = df[col].isna()
        as_text = df[col].astype(str)
        observed_modes = as_text[~missing].mode()
        if missing.any() and not observed_modes.empty:
            as_text = as_text.mask(missing, observed_modes.iloc[0])
        df[col] = as_text

    print("✅ Data cleaning and imputation complete.")

    # Create Preprocessor for use in later steps
    numerical_features = list(numerical_cols)
    # We exclude our new Calculated_Status from the training features
    categorical_features = [col for col in categorical_cols if col != 'Calculated_Status']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='drop'
    )

    return df, numerical_cols, categorical_cols, preprocessor

    
# --- Block 2: Exploratory Data Analysis (EDA) ---
def perform_eda(df_cleaned):
    """
    Print descriptive statistics for the cleaned frame and show a histogram grid.

    Parameters:
        df_cleaned: DataFrame to summarize (describe()) and plot (hist()).
    Returns:
        None. Output goes to stdout and to a matplotlib figure.
    """
    for heading in ("\n\n--- 🔬 Starting Exploratory Data Analysis ---",
                    "\n--- 📊 Descriptive Statistics ---"):
        print(heading)
    summary_stats = df_cleaned.describe()
    print(summary_stats)

    sns.set_style("whitegrid")
    print("\n--- 📈 Visualizing Numerical Distributions ---")

    # 4 x 3 grid of 30-bin histograms on one large figure
    histogram_options = {'bins': 30, 'figsize': (20, 15), 'layout': (4, 3)}
    df_cleaned.hist(**histogram_options)
    plt.suptitle('Distribution of Numerical Features', size=20, y=1.02)
    plt.tight_layout()
    plt.show()

# Block 3- Training the model
def train_autoencoder(df_cleaned, preprocessor):
    """
    Fit the preprocessor on the cleaned frame, then build and train a dense autoencoder
    on the transformed features with early stopping on the validation loss.

    Parameters:
        df_cleaned: Cleaned property DataFrame.
        preprocessor: Transformer exposing fit_transform (the one built by the data pipeline).
    Returns:
        (encoder_model, autoencoder, X_processed): the input->bottleneck model, the full
        autoencoder, and the transformed feature matrix used for training.
    """
    print("\n\n--- 🤖 Preparing data for the model ---")

    # The preprocessor is fitted here, on the same data the model trains on
    X_processed = preprocessor.fit_transform(df_cleaned)
    print(f"✅ Preprocessing complete. Shape of model input data: {X_processed.shape}")

    n_features = X_processed.shape[1]
    latent_units = 64

    # --- Model Architecture: 256 -> 128 -> 64 -> 128 -> 256, LeakyReLU after each Dense ---
    model_input = Input(shape=(n_features,))

    hidden = model_input
    for units in (256, 128):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU()(hidden)

    latent = Dense(latent_units)(hidden)
    latent = LeakyReLU()(latent)

    hidden = latent
    for units in (128, 256):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU()(hidden)

    # Sigmoid output, reconstructing the preprocessed input
    reconstruction = Dense(n_features, activation='sigmoid')(hidden)

    autoencoder = Model(inputs=model_input, outputs=reconstruction)
    encoder_model = Model(inputs=model_input, outputs=latent)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    print("\n✅ Tuned autoencoder model built.")
    autoencoder.summary()

    print("\n--- 🏋️‍♂️ Training the Autoencoder ---")
    stop_when_stalled = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    fit_options = {
        'epochs': 200,  # upper bound; early stopping ends training sooner
        'batch_size': 32,
        'shuffle': True,
        'validation_split': 0.1,
        'verbose': 1,
        'callbacks': [stop_when_stalled],
    }
    autoencoder.fit(X_processed, X_processed, **fit_options)
    print("✅ Model training complete.")
    return encoder_model, autoencoder, X_processed
    
# BLOCK - 4 
def find_similar_properties(encoder_model, autoencoder, X_processed, df_cleaned):
    """
    Uses the trained encoder to find and display similar properties.

    One randomly chosen 'Active' property is used as the query. Its nearest neighbours
    (cosine distance in embedding space) are looked up among the 'Expired' properties,
    then filtered to those within +/-30% of the query price and within 5 km
    (Vincenty distance), and printed in descending order of similarity.

    Rows of `X_processed` are assumed to be in the same order as the rows of
    `df_cleaned`; rows are addressed by position, so `df_cleaned` may carry any index.

    Parameters:
        encoder_model: Model whose predict() maps X_processed to embeddings.
        autoencoder: Model whose evaluate() reports the reconstruction loss.
        X_processed: Preprocessed feature matrix.
        df_cleaned: Cleaned DataFrame with 'Calculated_Status', 'Property-Price',
            'Latitude' and 'Longitude' columns.
    Returns:
        The embeddings of all rows, for further analysis.
    """
    print("\n\n--- 🎯 Finding Similar Properties ---")

    loss = autoencoder.evaluate(X_processed, X_processed, verbose=0)
    print(f"\n--- 📈 Model Evaluation ---")
    print(f"Final Mean Squared Error (Reconstruction Loss): {loss:.6f}")

    all_embeddings = encoder_model.predict(X_processed)

    # Row POSITIONS (not index labels): the embeddings array can only be addressed
    # positionally, and labels differ from positions whenever rows were filtered.
    query_positions = np.flatnonzero((df_cleaned['Calculated_Status'] == 'Active').to_numpy())
    database_positions = np.flatnonzero((df_cleaned['Calculated_Status'] == 'Expired').to_numpy())

    database_embeddings = all_embeddings[database_positions]

    print(f"\nFound {len(database_positions)} expired properties to match against.")
    print(f"Found {len(query_positions)} active properties to find matches for.")

    if len(database_positions) > 0 and len(query_positions) > 0:
        # Never ask for more neighbours than there are candidates (kneighbors would raise)
        K = min(10, len(database_positions))
        knn = NearestNeighbors(n_neighbors=K, metric='cosine')
        knn.fit(database_embeddings)

        query_position = int(np.random.choice(query_positions))
        query_embedding = all_embeddings[query_position].reshape(1, -1)

        print("\n--- Example Match ---")
        print(f"Finding similar properties for ACTIVE property at index: {df_cleaned.index[query_position]}")
        print("Query Property Details:")
        query_property = df_cleaned.iloc[query_position]
        print(query_property)
        print("-" * 20)

        distances, indices = knn.kneighbors(query_embedding)

        print(f"Top {K} initial matches found. Filtering for price and geographic relevance...")

        query_price = query_property['Property-Price']
        price_tolerance = 0.30
        lower_bound = query_price * (1 - price_tolerance)
        upper_bound = query_price * (1 + price_tolerance)

        query_lat = query_property['Latitude']
        query_lon = query_property['Longitude']
        max_distance_km = 5.0  # Maximum allowed distance in km for a match

        filtered_matches = []
        for i, index in enumerate(indices[0]):
            # `index` points into the expired subset; map it back to a row position
            match_position = database_positions[index]
            match_row = df_cleaned.iloc[match_position]
            match_price = match_row['Property-Price']
            try:
                dist_km = vincenty(query_lat, query_lon, match_row['Latitude'], match_row['Longitude'])
            except ValueError as exc:
                # A single unmeasurable pair should not abort the whole search
                print(f"⚠️ Skipping property at index {df_cleaned.index[match_position]}: {exc}")
                continue

            if lower_bound <= match_price <= upper_bound and dist_km <= max_distance_km:
                similarity_score = 1 - distances[0][i]  # cosine similarity
                filtered_matches.append((match_position, similarity_score, dist_km))

        # Sort filtered matches by similarity score descending
        filtered_matches.sort(key=lambda x: x[1], reverse=True)

        print(f"\n✅ Found {len(filtered_matches)} matches within {price_tolerance*100}% price range and {max_distance_km} km.")
        print(f"--- Top {min(5, len(filtered_matches))} Financially and Geographically Relevant EXPIRED Properties ---")

        if not filtered_matches:
            print("No expired properties found within the price and distance constraints.")
        else:
            for i, (match_position, similarity_score, dist_km) in enumerate(filtered_matches[:5]):
                print(f"\nRank {i+1}: Property at index {df_cleaned.index[match_position]} (Similarity: {similarity_score:.4f}, Distance: {dist_km:.2f} km)")
                print(df_cleaned.iloc[match_position])
    else:
        print("\nCould not perform matching: no active or expired properties found to process.")

    return all_embeddings


In [41]:
# Run the full data pipeline. All four results are None when PropertyData.csv cannot be found.
df_cleaned, numerical_cols, categorical_cols, preprocessor = run_data_pipeline()

✅ Successfully loaded PropertyData.csv
✅ Selected 17 features for modeling.

🗓️ Calculating property status (Active/Expired/Sold)...
✅ Status calculation complete. Status distribution:
Calculated_Status
Expired    1055
Active      720
Sold         85
Name: count, dtype: int64
✅ Found existing geocoding file at 'geo_mapping.json'. Loading...

⚙️ Starting data cleaning process...
✅ Data cleaning and imputation complete.


In [23]:
# Train on the cleaned frame; yields the encoder, the full autoencoder and the
# preprocessed feature matrix that the matching step needs.
encoder_model, autoencoder, X_processed = train_autoencoder(df_cleaned, preprocessor)



--- 🤖 Preparing data for the model ---
✅ Preprocessing complete. Shape of model input data: (1860, 46)

✅ Tuned autoencoder model built.



--- 🏋️‍♂️ Training the Autoencoder ---
Epoch 1/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.1183 - val_loss: 0.0368
Epoch 2/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0283 - val_loss: 0.0227
Epoch 3/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0189 - val_loss: 0.0179
Epoch 4/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0146 - val_loss: 0.0123
Epoch 5/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0094 - val_loss: 0.0103
Epoch 6/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0075 - val_loss: 0.0084
Epoch 7/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0062 - val_loss: 0.0062
Epoch 8/200
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0049 - val_loss: 0.0052
Epoch 9/

In [33]:
# Run the matching demo only when every input it needs already exists in the kernel,
# say so when it is skipped, and keep the returned embeddings for later analysis.
_required_names = ('encoder_model', 'autoencoder', 'X_processed', 'df_cleaned')
if all(name in globals() for name in _required_names):
    all_embeddings = find_similar_properties(encoder_model, autoencoder, X_processed, df_cleaned)
else:
    print("⚠️ Skipping property matching: run the data pipeline and training cells first.")



--- 🎯 Finding Similar Properties ---

--- 📈 Model Evaluation ---
Final Mean Squared Error (Reconstruction Loss): 0.000509
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Found 955 expired properties to match against.
Found 820 active properties to find matches for.

--- Example Match ---
Finding similar properties for ACTIVE property at index: 477
Query Property Details:
BHK                                                         3.0
Property-Price                                             58.0
City1                                                 Ahmedabad
Property-On-Floor                                           6.0
Property-Facing                                            East
Age-Of-Property                                             2.0
Super-Built-up-Construction-Area                          768.0
Carpet-Construction-Area                                  461.0
Bathroom                                                    1.0
Furniture-Details     

In [None]:
import pandas as pd
import numpy as np
import re
import json
import time
import os
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from keras.models import Model
from keras.layers import Input, Dense, LeakyReLU
from keras.callbacks import EarlyStopping
import math


import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points on Earth via the haversine formula.

    Parameters:
        lat1, lon1: Latitude and longitude of the first point (in degrees).
        lat2, lon2: Latitude and longitude of the second point (in degrees).
    Returns:
        Distance in kilometers (sphere radius 6371 km).
    """
    earth_radius_km = 6371

    # Work in radians
    phi_start = np.radians(lat1)
    lam_start = np.radians(lon1)
    phi_end = np.radians(lat2)
    lam_end = np.radians(lon2)

    sin_half_dphi = np.sin((phi_end - phi_start) / 2)
    sin_half_dlam = np.sin((lam_end - lam_start) / 2)

    # Haversine term, then the central angle via arcsin
    hav = sin_half_dphi**2 + np.cos(phi_start) * np.cos(phi_end) * sin_half_dlam**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

def vincenty(lat1, lon1, lat2, lon2, max_iterations=100, tolerance=1e-12):
    """
    Calculate the geodesic distance between two points on the Earth using Vincenty's
    inverse formula on the WGS-84 ellipsoid (in km).

    Parameters:
        lat1, lon1: Latitude and longitude of the first point (in degrees).
        lat2, lon2: Latitude and longitude of the second point (in degrees).
        max_iterations: Maximum number of iterations of the lambda refinement loop.
        tolerance: The loop stops once lambda changes by less than this (radians).
    Returns:
        Distance in kilometers; 0.0 when the two points coincide.
    Raises:
        ValueError: if the iteration does not converge within max_iterations
            (this happens for nearly antipodal points).
    """
    # WGS-84 ellipsoid parameters
    a = 6378137.0  # semi-major axis in meters
    f = 1 / 298.257223563  # flattening
    b = a * (1 - f)  # semi-minor axis

    # Convert degrees to radians
    phi1 = math.radians(lat1)
    lambda1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lambda2 = math.radians(lon2)

    # Reduced latitudes
    U1 = math.atan((1 - f) * math.tan(phi1))
    U2 = math.atan((1 - f) * math.tan(phi2))

    L = lambda2 - lambda1  # longitude difference
    lam = L  # initial approximation of lambda

    sinU1 = math.sin(U1)
    cosU1 = math.cos(U1)
    sinU2 = math.sin(U2)
    cosU2 = math.cos(U2)

    for _ in range(max_iterations):
        sin_lam = math.sin(lam)
        cos_lam = math.cos(lam)

        sin_sigma = math.sqrt((cosU2 * sin_lam)**2 + (cosU1 * sinU2 - sinU1 * cosU2 * cos_lam)**2)
        if sin_sigma == 0:
            return 0.0  # coincident points

        cos_sigma = sinU1 * sinU2 + cosU1 * cosU2 * cos_lam
        sigma = math.atan2(sin_sigma, cos_sigma)

        sin_alpha = (cosU1 * cosU2 * sin_lam) / sin_sigma
        cos_sq_alpha = 1 - sin_alpha**2

        if cos_sq_alpha == 0:
            # Both points lie on the equator: the geodesic is the equatorial line and
            # the 2*sigma_m term vanishes. (Returning early here would report half the
            # Earth's circumference for every pair of equatorial points.)
            cos_2sigma_m = 0.0
        else:
            cos_2sigma_m = cos_sigma - (2 * sinU1 * sinU2) / cos_sq_alpha

        C = f / 16 * cos_sq_alpha * (4 + f * (4 - 3 * cos_sq_alpha))

        lam_prev = lam
        lam = L + (1 - C) * f * sin_alpha * (sigma + C * sin_sigma * (cos_2sigma_m + C * cos_sigma * (-1 + 2 * cos_2sigma_m**2)))

        if abs(lam - lam_prev) < tolerance:
            break
    else:
        # for/else: reached only when the loop ran out of iterations without a break
        raise ValueError(f"Vincenty formula failed to converge after {max_iterations} iterations")

    u_sq = cos_sq_alpha * (a**2 - b**2) / (b**2)
    A = 1 + u_sq / 16384 * (4096 + u_sq * (-768 + u_sq * (320 - 175 * u_sq)))
    B = u_sq / 1024 * (256 + u_sq * (-128 + u_sq * (74 - 47 * u_sq)))

    delta_sigma = B * sin_sigma * (cos_2sigma_m + B / 4 * (cos_sigma * (-1 + 2 * cos_2sigma_m**2) - B / 6 * cos_2sigma_m * (-3 + 4 * sin_sigma**2) * (-3 + 4 * cos_2sigma_m**2)))

    s = b * A * (sigma - delta_sigma) / 1000  # distance in km

    return s

def get_or_create_geo_mapping(df):
    """
    Loads geocoding from a file or creates it by calling an API.

    If 'geo_mapping.json' exists in the working directory its parsed content is
    returned as-is and `df` is not touched. Otherwise every unique string in
    df['Location'] that contains a '-' is geocoded with Nominatim, and the result is
    written to 'geo_mapping.json' and returned.

    Parameters:
        df: DataFrame with a 'Location' column (only read when no cache file exists).
    Returns:
        dict mapping the original location string to its coordinates. Freshly
        geocoded values are (latitude, longitude) tuples; values reloaded from the
        JSON cache come back as two-element lists.
    """
    json_path = 'geo_mapping.json'
    if os.path.exists(json_path):
        print(f"✅ Found existing geocoding file at '{json_path}'. Loading...")
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    print(f"⚠️ No geocoding file found. Creating a new one using geopy...")
    print("This will take several minutes as it respects API rate limits.")

    geolocator = Nominatim(user_agent="property_matcher_app_v6")
    unique_locations = df['Location'].dropna().unique()
    geo_mapping = {}

    # The text before the first '-' selects the city; unknown prefixes fall back to Ahmedabad.
    city_prefix_map = {
        'A': 'Ahmedabad', 'P': 'Pune', 'G': 'Gandhinagar',
        'S': 'Surat', 'V': 'Vadodara', 'B': 'Vadodara'
    }

    for location_str in unique_locations:
        if not isinstance(location_str, str) or '-' not in location_str:
            continue

        prefix, area = (part.strip() for part in location_str.split('-', 1))
        city = city_prefix_map.get(prefix, 'Ahmedabad')

        query = f"{area}, {city}, India"
        try:
            location_data = geolocator.geocode(query, timeout=10)
            if location_data:
                geo_mapping[location_str] = (location_data.latitude, location_data.longitude)
                print(f"✅ Found: {query} -> ({location_data.latitude:.4f}, {location_data.longitude:.4f})")
            else:
                print(f"⚠️ Not Found: {query}")
        except Exception as e:
            # Best effort: a failed lookup is reported and the location is left unmapped.
            print(f"❌ Error for '{query}': {e}")
        finally:
            # Pause after every request, including failed ones, so that an error
            # (e.g. a timeout) is not followed immediately by the next API call.
            time.sleep(1)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(geo_mapping, f, indent=4)
    print(f"✅ Geocoding complete. Saved to '{json_path}'.")
    return geo_mapping

def run_data_pipeline(csv_path='PropertyData.csv', reference_date='2025-08-23',
                      geo_mapping_path='geo_mapping.json'):
    """
    Loads, cleans, and preprocesses the property data, with robust location normalization.

    Parameters:
        csv_path: Path of the CSV file to load.
        reference_date: Date against which 'Service_Expiry_Date' is compared to mark
            a listing as 'Expired' (anything accepted by pd.to_datetime).
        geo_mapping_path: Path of the JSON file mapping normalized location strings
            to [latitude, longitude].
    Returns:
        (df, numerical_cols, categorical_cols, preprocessor) where `df` is the cleaned
        frame and `preprocessor` is an unfitted ColumnTransformer (MinMaxScaler on the
        numerical columns, OneHotEncoder on the categorical columns except
        'Calculated_Status'). Returns (None, None, None, None) when the CSV or the
        geo mapping file is missing, or when any location is absent from the mapping.
    """
    try:
        property_df = pd.read_csv(csv_path, low_memory=False)
        print(f"✅ Successfully loaded {csv_path}")
    except FileNotFoundError:
        print(f"❌ Error: '{csv_path}' not found.")
        return None, None, None, None

    features_to_use = [
        'BHK', 'Property-Price', 'City1', 'Property-On-Floor', 'Property-Facing',
        'Age-Of-Property', 'Super-Built-up-Construction-Area', 'Carpet-Construction-Area',
        'Bathroom', 'Furniture-Details', 'Property-Status', 'Current-Status', 'Location',
        'Parking-Details', 'No-Of-Lift-Per-Block', 'Service_Expiry_Date', 'Tag',
        'Property_Type', 'Residential-Property', 'Commercial-Property-Type'
    ]
    # Explicit copy: all column assignments below act on an independent frame.
    df = property_df[features_to_use].copy()
    print(f"✅ Selected {len(features_to_use)} features for modeling.")

    # --- Robust Location String Normalization ---
    print("\n🧹 Normalizing location strings...")
    # Full city names that may appear instead of the one-letter prefix
    city_to_prefix = {
        'Pune-': 'P-', 'Ahmedabad-': 'A-', 'Gandhinagar-': 'G-',
        'Surat-': 'S-', 'Vadodara-': 'V-'
    }

    def normalize_location(loc_str):
        """Strip whitespace and rewrite the LEADING city marker to its canonical prefix."""
        if not isinstance(loc_str, str):
            return loc_str
        # 1. Strip leading/trailing whitespace
        cleaned_str = loc_str.strip()
        # 2. Correct full city names to prefixes (e.g., "Pune-" to "P-").
        #    Only the leading occurrence is rewritten; the area name after it may
        #    legitimately contain the same text and must stay untouched.
        for city, prefix in city_to_prefix.items():
            if cleaned_str.startswith(city):
                cleaned_str = prefix + cleaned_str[len(city):]
        # 3. Standardize the alternative Vadodara prefix 'B-' to 'V-' (leading only)
        if cleaned_str.startswith('B-'):
            cleaned_str = 'V-' + cleaned_str[len('B-'):]
        return cleaned_str

    df['Location'] = df['Location'].apply(normalize_location)

    # Status Calculation
    print("\n🗓️ Calculating property status (Active/Expired/Sold)...")
    sold_statuses = ['Sold-CD', 'Sold-Others', 'Rented-CD']
    current_date = pd.to_datetime(reference_date)
    # Unparseable expiry dates become NaT
    df['Service_Expiry_Date'] = pd.to_datetime(df['Service_Expiry_Date'], errors='coerce', dayfirst=True)

    # np.select takes the first matching condition, so 'Sold' wins over 'Expired';
    # rows matching neither (including rows with no expiry date) are 'Active'.
    conditions = [
        df['Property-Status'].isin(sold_statuses),
        df['Service_Expiry_Date'] < current_date
    ]
    choices = ['Sold', 'Expired']
    df['Calculated_Status'] = np.select(conditions, choices, default='Active')
    print(f"✅ Status calculation complete.")

    # Load the geo mapping file
    print("\n📍 Loading geo_mapping.json...")
    try:
        with open(geo_mapping_path, 'r', encoding='utf-8') as f:
            geo_mapping = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: '{geo_mapping_path}' not found.")
        return None, None, None, None

    # Validate locations AFTER normalization: every non-null location must have coordinates
    missing_locations = set(df['Location'].dropna().unique()) - set(geo_mapping.keys())
    if missing_locations:
        print(f"❌ Error: {len(missing_locations)} locations not in geo_mapping.json: {missing_locations}")
        print("Please update geo_mapping.json with these locations before proceeding.")
        return None, None, None, None

    # Rows with a null location get NaN coordinates here and the column median below
    df[['Latitude', 'Longitude']] = df['Location'].apply(
        lambda x: pd.Series(geo_mapping.get(x, (np.nan, np.nan)))
    )

    print("\n⚙️ Starting data cleaning process...")
    def clean_price(price):
        """First number in the string; multiplied by 100 when the text contains 'cr'
        (presumably crore -> lakh — confirm against the data)."""
        if not isinstance(price, str): return np.nan
        price_str = price.lower()
        try:
            numbers = re.findall(r'[\d\.]+', price_str)
            if not numbers: return np.nan
            value = float(numbers[0])
            if 'cr' in price_str: return value * 100
            return value
        except (ValueError, IndexError): return np.nan

    def clean_area(area):
        """First number in the string; multiplied by 9 when the text contains 'yard'
        (presumably square yards -> square feet — confirm against the data)."""
        if not isinstance(area, str): return np.nan
        area_str = area.lower()
        try:
            numbers = re.findall(r'[\d\.]+', area_str)
            if not numbers: return np.nan
            value = float(numbers[0])
            if 'yard' in area_str: return value * 9
            return value
        except (ValueError, IndexError): return np.nan

    # A bare '-' is the dataset's placeholder for a missing value
    df = df.replace('-', np.nan)
    df['Property-Price'] = df['Property-Price'].apply(clean_price)
    df['Super-Built-up-Construction-Area'] = df['Super-Built-up-Construction-Area'].apply(clean_area)
    df['Carpet-Construction-Area'] = df['Carpet-Construction-Area'].apply(clean_area)

    # NOTE(review): values carrying text (e.g. "3 BHK", "Ground") are coerced to NaN
    # here and then median-filled — confirm these columns are already numeric.
    for col in ['BHK', 'Bathroom', 'No-Of-Lift-Per-Block', 'Property-On-Floor', 'Age-Of-Property']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    numerical_cols = ['Property-Price', 'Super-Built-up-Construction-Area', 'Carpet-Construction-Area',
                      'Property-On-Floor', 'Age-Of-Property', 'BHK', 'Bathroom', 'No-Of-Lift-Per-Block',
                      'Latitude', 'Longitude']
    for col in numerical_cols:
        # Assign the result back instead of a chained in-place fillna, which is
        # deprecated and has no effect under pandas Copy-on-Write.
        df[col] = df[col].fillna(df[col].median())

    categorical_cols = ['City1', 'Property-Facing', 'Furniture-Details', 'Property-Status',
                       'Current-Status', 'Parking-Details', 'Calculated_Status',
                       'Property_Type', 'Residential-Property', 'Commercial-Property-Type']
    for col in categorical_cols:
        if col in df.columns:
            # astype(str) turns missing values into the text 'nan'; map those to an
            # explicit 'None' category (assigned back rather than mutated in place).
            df[col] = df[col].astype(str).replace('nan', 'None')

    print("✅ Data cleaning and imputation complete.")

    numerical_features = [col for col in numerical_cols if col in df.columns]
    # Calculated_Status is kept in the frame but excluded from the training features
    categorical_features = [col for col in categorical_cols if col != 'Calculated_Status' and col in df.columns]
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='drop'
    )

    return df, numerical_cols, categorical_cols, preprocessor


    
def perform_eda(df_cleaned):
    """
    Print summary statistics for the cleaned frame and plot its histograms.

    Parameters:
        df_cleaned: DataFrame to describe and plot.

    Side effects:
        Prints ``df_cleaned.describe()``, switches seaborn to the "whitegrid"
        style, and shows one figure produced by ``DataFrame.hist``.
    """
    print("\n\n--- 🔬 Starting Exploratory Data Analysis ---")

    # Tabular summary first.
    print("\n--- 📊 Descriptive Statistics ---")
    summary_table = df_cleaned.describe()
    print(summary_table)

    # The style must be set before the figure is created to take effect.
    sns.set_style("whitegrid")

    print("\n--- 📈 Visualizing Numerical Distributions ---")
    # NOTE(review): a 4x3 layout offers 12 subplot slots; confirm the frame
    # never has more plottable columns than that.
    histogram_options = {'bins': 30, 'figsize': (20, 15), 'layout': (4, 3)}
    df_cleaned.hist(**histogram_options)
    plt.suptitle('Distribution of Numerical Features', size=20, y=1.02)
    plt.tight_layout()
    plt.show()

# Block 3- Training the model
def train_autoencoder(df_cleaned, preprocessor):
    """
    Fit the preprocessor, then build and train a dense autoencoder on its output.

    Architecture (every hidden Dense layer is followed by LeakyReLU):
        input -> 256 -> 128 -> 64 (bottleneck) -> 128 -> 256 -> sigmoid output

    Parameters:
        df_cleaned: DataFrame passed to ``preprocessor.fit_transform``.
        preprocessor: transformer exposing ``fit_transform``; it is fitted here.

    Returns:
        (encoder_model, autoencoder, X_processed) where ``encoder_model`` maps the
        input to the activated bottleneck, ``autoencoder`` is the full trained
        model and ``X_processed`` is the matrix the model was trained on.
    """
    print("\n\n--- 🤖 Preparing data for the model ---")

    # Fit and apply the preprocessor handed over by the data pipeline.
    X_processed = preprocessor.fit_transform(df_cleaned)
    print(f"✅ Preprocessing complete. Shape of model input data: {X_processed.shape}")

    n_features = X_processed.shape[1]
    latent_units = 64

    def _dense_leaky(tensor, units):
        """Apply a linear Dense layer of `units` followed by LeakyReLU."""
        tensor = Dense(units)(tensor)
        return LeakyReLU()(tensor)

    # --- Model Architecture ---
    model_input = Input(shape=(n_features,))

    hidden = model_input
    for width in (256, 128):
        hidden = _dense_leaky(hidden, width)

    # The embedding exposed by encoder_model is taken after the activation.
    embedding = _dense_leaky(hidden, latent_units)

    hidden = embedding
    for width in (128, 256):
        hidden = _dense_leaky(hidden, width)
    reconstruction = Dense(n_features, activation='sigmoid')(hidden)

    autoencoder = Model(inputs=model_input, outputs=reconstruction)
    encoder_model = Model(inputs=model_input, outputs=embedding)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    print("\n✅ Tuned autoencoder model built.")
    autoencoder.summary()

    print("\n--- 🏋️‍♂️ Training the Autoencoder ---")
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights; the 200-epoch budget is only an upper bound.
    stop_on_plateau = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    fit_options = dict(
        epochs=200,
        batch_size=32,
        shuffle=True,
        validation_split=0.1,
        verbose=1,
        callbacks=[stop_on_plateau],
    )
    # The model reconstructs its own input, so features double as targets.
    autoencoder.fit(X_processed, X_processed, **fit_options)
    print("✅ Model training complete.")
    return encoder_model, autoencoder, X_processed


def find_similar_properties(encoder_model, autoencoder, X_processed, df_cleaned, *,
                            k_neighbors=20, min_similarity_threshold=0.6,
                            max_distance_km=5.0, price_tolerance=0.40, max_matches=10):
    """
    Map each active property to its most similar expired properties.

    Every row of ``X_processed`` is embedded with ``encoder_model``. For each row
    whose ``Calculated_Status`` is 'Active', the nearest 'Expired' rows (cosine
    distance between embeddings) are retrieved. A candidate is kept only if it
    passes all of the following checks:
      * cosine similarity (1 - cosine distance) >= ``min_similarity_threshold``
      * expired price within +/- ``price_tolerance`` of the active price
      * haversine distance <= ``max_distance_km``
      * same ``Property_Type`` (only when that column exists and the active
        value is truthy)

    Parameters:
        encoder_model: model whose ``predict`` returns one embedding per row.
        autoencoder: model whose ``evaluate`` returns the reconstruction loss.
        X_processed: model input matrix, row-aligned with ``df_cleaned``.
        df_cleaned: DataFrame with at least the columns Tag, Calculated_Status,
            Property-Price, Latitude, Longitude and Location.
        k_neighbors: number of nearest expired candidates examined per query
            (capped at the number of expired rows).
        min_similarity_threshold: minimum cosine similarity for a match.
        max_distance_km: maximum geographic distance for a match.
        price_tolerance: fractional half-width of the accepted price window.
        max_matches: maximum number of matches stored per active property.

    Returns:
        (all_embeddings, active_to_expired_mapping); the mapping is keyed by
        ``str(Tag)`` of the active property and holds a similarity-sorted list
        of match dicts.

    Raises:
        ValueError: if the Tag column contains duplicates or nulls.
        AssertionError: if row counts of inputs/embeddings and the frame differ.

    Side effects:
        Writes 'active_to_expired_mapping.json' and 'rejection_reasons.csv' to
        the working directory and prints progress plus one example match.

    NOTE(review): this function calls a module-level ``haversine``; confirm it
    is defined before this function runs on a fresh kernel.
    """
    print("\n\n--- 🎯 Finding Similar Properties ---")

    if df_cleaned['Tag'].duplicated().any():
        raise ValueError("Duplicate values found in Tag column; it must be unique.")
    if df_cleaned['Tag'].isna().any():
        raise ValueError("Null values found in Tag column; all properties must have a Tag.")

    assert len(X_processed) == len(df_cleaned), "Mismatch between X_processed and df_cleaned rows"

    loss = autoencoder.evaluate(X_processed, X_processed, verbose=0)
    print("\n--- 📈 Model Evaluation ---")
    print(f"Final Mean Squared Error (Reconstruction Loss): {loss:.6f}")

    all_embeddings = encoder_model.predict(X_processed)
    assert len(all_embeddings) == len(df_cleaned), "Embedding row count mismatch"

    query_mask = df_cleaned['Calculated_Status'] == 'Active'
    database_mask = df_cleaned['Calculated_Status'] == 'Expired'

    query_indices = df_cleaned[query_mask].index
    database_indices = df_cleaned[database_mask].index

    # Embeddings are addressed by row position, so translate index labels.
    query_positions = df_cleaned.index.get_indexer(query_indices)
    database_positions = df_cleaned.index.get_indexer(database_indices)

    active_embeddings = all_embeddings[query_positions]
    database_embeddings = all_embeddings[database_positions]

    print(f"\nFound {len(query_indices)} active properties to query.")
    print(f"Found {len(database_indices)} expired properties to match against.")

    active_to_expired_mapping = {}
    # NOTE: keyed by expired tag and filled on every failed (active, expired)
    # pairing, so a tag listed here may still be a match for another active.
    rejection_reasons = {}

    if len(database_indices) > 0 and len(query_indices) > 0:
        # Loop-invariant: evaluate the column check once instead of per candidate.
        has_property_type = 'Property_Type' in df_cleaned.columns

        knn = NearestNeighbors(
            n_neighbors=min(k_neighbors, len(database_embeddings)), metric='cosine'
        )
        knn.fit(database_embeddings)

        print("\n--- Computing KNN for Active Properties ---")
        distances, indices = knn.kneighbors(active_embeddings)

        print("\n--- Creating Mapping for All Active Properties ---")
        for i, active_idx in enumerate(query_indices):
            active_tag = str(df_cleaned.loc[active_idx, 'Tag'])
            active_price = df_cleaned.loc[active_idx, 'Property-Price']
            active_lat = df_cleaned.loc[active_idx, 'Latitude']
            active_lon = df_cleaned.loc[active_idx, 'Longitude']
            active_location = df_cleaned.loc[active_idx, 'Location']
            active_property_type = (
                df_cleaned.loc[active_idx, 'Property_Type'] if has_property_type else None
            )

            lower_bound = active_price * (1 - price_tolerance)
            upper_bound = active_price * (1 + price_tolerance)

            filtered_matches = []
            for candidate_position, cosine_distance in zip(indices[i], distances[i]):
                # KNN positions refer to the expired subset, not the full frame.
                expired_idx = database_indices[candidate_position]
                expired_tag = str(df_cleaned.loc[expired_idx, 'Tag'])
                expired_price = df_cleaned.loc[expired_idx, 'Property-Price']
                expired_lat = df_cleaned.loc[expired_idx, 'Latitude']
                expired_lon = df_cleaned.loc[expired_idx, 'Longitude']
                expired_location = df_cleaned.loc[expired_idx, 'Location']
                expired_property_type = (
                    df_cleaned.loc[expired_idx, 'Property_Type'] if has_property_type else None
                )

                dist_km = haversine(active_lat, active_lon, expired_lat, expired_lon)
                similarity = 1 - cosine_distance

                # Each criterion is evaluated exactly once; both the recorded
                # reasons and the accept/reject decision derive from these flags,
                # so a rejected candidate always has at least one reason.
                similarity_ok = similarity >= min_similarity_threshold
                price_ok = lower_bound <= expired_price <= upper_bound
                distance_ok = dist_km <= max_distance_km
                type_ok = (not active_property_type) or active_property_type == expired_property_type

                reasons = []
                if not similarity_ok:
                    reasons.append(f"Similarity {similarity:.4f} < {min_similarity_threshold}")
                if not price_ok:
                    reasons.append(f"Price {expired_price} outside [{lower_bound:.2f}, {upper_bound:.2f}]")
                if not distance_ok:
                    reasons.append(f"Distance {dist_km:.2f} km ({active_location} to {expired_location}) > {max_distance_km} km")
                if not type_ok:
                    reasons.append(f"Property_Type mismatch: {active_property_type} != {expired_property_type}")

                if reasons:
                    rejection_reasons.setdefault(expired_tag, []).extend(reasons)
                    continue

                filtered_matches.append({
                    'expired_tag': expired_tag,
                    'expired_index': int(expired_idx),
                    'similarity': float(similarity),
                    'distance_km': float(dist_km),
                    'expired_location': expired_location
                })

            filtered_matches.sort(key=lambda x: x['similarity'], reverse=True)
            active_to_expired_mapping[active_tag] = filtered_matches[:max_matches]

            # Show a single worked example (the first active property) for eyeballing.
            if i == 0:
                print(f"\n--- Example Match for Active Property Tag: {active_tag} ---")
                print("Query Property Details:")
                print(df_cleaned.loc[active_idx])
                print(f"\nFound {len(filtered_matches)} matches within {price_tolerance*100}% price range and {max_distance_km} km.")
                for rank, match in enumerate(filtered_matches[:max_matches], 1):
                    print(f"\nRank {rank}: Expired Property Tag {match['expired_tag']} (Index: {match['expired_index']}, Similarity: {match['similarity']:.4f}, Distance: {match['distance_km']:.2f} km, Location: {match['expired_location']})")
                    print(df_cleaned.loc[match['expired_index']])

    else:
        print("\nCould not perform matching: no active or expired properties found.")

    with open('active_to_expired_mapping.json', 'w') as f:
        json.dump(active_to_expired_mapping, f, indent=4)
    print("\n✅ Mapping saved to 'active_to_expired_mapping.json'")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the CSV is
    # identical from run to run (iteration order of a set of strings is not).
    rejection_data = [
        {
            'expired_tag': expired_tag,
            'rejection_reasons': '; '.join(dict.fromkeys(reasons))
        }
        for expired_tag, reasons in rejection_reasons.items()
    ]
    # Explicit columns keep the header row even when nothing was rejected.
    rejection_df = pd.DataFrame(rejection_data, columns=['expired_tag', 'rejection_reasons'])
    rejection_df.to_csv('rejection_reasons.csv', index=False)
    print("Rejection reasons for unmatched properties saved to 'rejection_reasons.csv'")

    return all_embeddings, active_to_expired_mapping

    

if __name__ == "__main__":
    # End-to-end run: clean data -> train autoencoder -> match active to expired
    # properties -> report which expired properties were never matched.
    df_cleaned, numerical_cols, categorical_cols, preprocessor = run_data_pipeline()
    if df_cleaned is not None:
        # perform_eda(df_cleaned)  # Uncomment if you want to run EDA
        encoder_model, autoencoder, X_processed = train_autoencoder(df_cleaned, preprocessor)
        all_embeddings, mapping = find_similar_properties(encoder_model, autoencoder, X_processed, df_cleaned)

        # --- Analysis of Unmatched Expired Properties ---
        print("\n\n--- 🔍 Analyzing Unmatched Expired Properties ---")

        # The mapping stores tags as strings, so every comparison below is done
        # on the stringified Tag column rather than on the raw values.
        expired_mask = df_cleaned['Calculated_Status'] == 'Expired'
        tags_as_str = df_cleaned['Tag'].astype(str)
        expired_tags = set(tags_as_str[expired_mask])
        total_expired = len(expired_tags)
        print(f"Total Expired Properties: {total_expired}")

        # Get all matched expired Tags from the mapping
        matched_expired_tags = set()
        for active_tag, matches in mapping.items():
            for match in matches:
                matched_expired_tags.add(match['expired_tag'])

        # Find unmatched expired properties
        unmatched_expired_tags = expired_tags - matched_expired_tags
        num_unmatched = len(unmatched_expired_tags)
        unmatched_percentage = (num_unmatched / total_expired * 100) if total_expired > 0 else 0

        print(f"Unmatched Expired Properties: {num_unmatched}")
        print(f"Percentage of Expired Properties Unmatched: {unmatched_percentage:.2f}%")

        # Save unmatched expired properties to CSV for review
        if num_unmatched > 0:
            # Filtering the raw Tag column against a set of strings selects no
            # rows when Tag is numeric; compare the string form instead, and
            # restrict to expired rows so only expired properties are exported.
            unmatched_rows = expired_mask & tags_as_str.isin(unmatched_expired_tags)
            unmatched_df = df_cleaned.loc[
                unmatched_rows,
                ['Tag', 'Property-Price', 'Location', 'BHK', 'Carpet-Construction-Area', 'Service_Expiry_Date']
            ]
            unmatched_df.to_csv('unmatched_expired_properties.csv', index=False)
            print("Details of unmatched expired properties saved to 'unmatched_expired_properties.csv'")
        else:
            print("All expired properties were matched to at least one active property.")

In [51]:
# Sanity check on the raw file: count the Commercial-Property-Type values that
# appear on rows whose Property_Type is 'Residential'.
df = pd.read_csv('PropertyData.csv')
print(df.loc[df['Property_Type'] == 'Residential', 'Commercial-Property-Type'].value_counts())

Commercial-Property-Type
-                  858
Others               6
Shops-Showrooms      5
Office-Space         4
Name: count, dtype: int64


In [53]:
import json
# Load geo_mapping.json and print the entries stored under four locality keys
# (a missing key prints None because .get is used).
with open('geo_mapping.json', 'r') as mapping_file:
    geo_mapping = json.load(mapping_file)

for label, mapping_key in (
    ("Kasindra:", 'A-Kasindra'),
    ("Vatva:", 'A-Vatva'),
    ("Chandkheda:", 'A-Chandkheda'),
    ("New Maninagar:", 'A-New Maninagar'),
):
    print(label, geo_mapping.get(mapping_key))

Kasindra: [22.8924, 72.4913]
Vatva: [22.9664, 72.6159]
Chandkheda: [23.1091, 72.5849]
New Maninagar: [22.9857, 72.6432]


In [56]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points on Earth using the Haversine formula.

    Inputs may be scalars or NumPy-broadcastable arrays; the computation is
    element-wise.

    Parameters:
        lat1, lon1: Latitude and longitude of the first point (in degrees).
        lat2, lon2: Latitude and longitude of the second point (in degrees).
    Returns:
        Distance in kilometers (spherical Earth with radius 6371 km).
    """
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points; arcsin(sqrt(a)) would then return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    R = 6371  # Earth's radius in km
    return c * R

In [57]:
# Spot-check haversine on four locality pairs, using literal (lat, lon) values.
for label, origin, destination in (
    ("Kasindra to Vatva:", (22.8924, 72.4913), (22.9664, 72.6159)),
    ("Chandkheda to New Maninagar:", (23.1091, 72.5849), (22.9857, 72.6432)),
    ("Kasindra to Chandkheda:", (22.8924, 72.4913), (23.1091, 72.5849)),
    ("Vatva to New Maninagar:", (22.9664, 72.6159), (22.9857, 72.6432)),
):
    print(label, haversine(*origin, *destination), "km")


Kasindra to Vatva: 15.183162248921903 km
Chandkheda to New Maninagar: 14.962025332545908 km
Kasindra to Chandkheda: 25.930649912521414 km
Vatva to New Maninagar: 3.523703832756868 km
