In [50]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import re


In [13]:
# --- 1. Load Data ---
try:
    # Load the dataset
    property_df = pd.read_csv('PropertyData.csv', low_memory=False)
    print("Successfully loaded PropertyData.csv")
except FileNotFoundError:
    print("Error: 'PropertyData.csv' not found. Please ensure the file is in the correct directory.")
    # Exit if the file isn't found to prevent further errors
    exit()

Successfully loaded PropertyData.csv


In [32]:
property_df.columns

Index(['Contact', 'Type', 'Company', 'Assignee', 'Source', 'Phone', 'Email',
       'Feedback', 'Date_of_Site_Visit', 'Time_of_Site_Visit', 'BHK',
       'Price_Range_Lacs_Rs', 'Property_Type', 'Residential-Property',
       'Commercial-Property-Type', 'Location', 'City1',
       'Time-Duration-of-Purchase', 'Did-i-ask-for-sell-lead', 'Sell-Rent',
       'FEST-Details', 'Welcome Call Date', 'Service_Expiry_Date',
       'Payment-Package', 'Form-Number', 'Property-Price', 'Payment-Mode',
       'Receipt-Number', 'Service-Validity', 'Sales-Executive',
       'Package-Amount', 'Net-Amount', 'Due-Amount', 'Total-Package-Amount',
       'No-of-Inventory', 'Sales-Executive-Revenue-Amount',
       'Property-Register-Date', 'Relationship-Manager', 'JRM-Mobile-Number',
       'GST', 'Property-Address', 'Current-Status', 'Property-On-Floor',
       'Property-Facing', 'No-Of-Lift-Per-Block', 'Furniture-Details',
       'Age-Of-Property', 'Parking-Details', 'Super-Built-up-Plot-Space',
       'Sup

In [47]:
# Create a copy to work with, preserving the original dataframe
df = property_df.copy()


In [55]:
property_df['Property-Status'].value_counts()

Property-Status
Live-Property      2157
Service-Expired     127
Sold-CD             119
-                    13
Sold-Others           3
Rented-CD             2
Name: count, dtype: int64

In [48]:
# --- 2. Feature Selection ---
# Choose the columns most relevant for determining property similarity
features_to_use = [
    'BHK', 'Property-Price', 'Property_Type', 'City1', 'Property-On-Floor',
    'Property-Facing', 'Age-Of-Property', 'Super-Built-up-Construction-Area',
    'Carpet-Construction-Area', 'Bathroom', 'Furniture-Details', 'Current-Status', 'Location','No-Of-Lift-Per-Block',
    'Parking-Details', 'Property-Status'
]
df = df[features_to_use]
print(f"Selected {len(features_to_use)} features for modeling.")


Selected 15 features for modeling.


In [51]:
# --- 3. Custom Cleaning Functions ---
print("\n⚙️ Starting advanced data cleaning process...")

def clean_price(price):
    """Normalizes price strings (e.g., '1.55 Cr.', '65.00 Lacs') to a single unit (Lakhs)."""
    if not isinstance(price, str):
        return np.nan
    price_str = price.lower()
    try:
        # Find all numbers in the string
        numbers = re.findall(r'[\d\.]+', price_str)
        if not numbers:
            return np.nan
        value = float(numbers[0])
        # Convert Crores to Lakhs
        if 'cr' in price_str:
            return value * 100
        # Assume Lakhs if 'lac' or no unit is present
        return value
    except (ValueError, IndexError):
        return np.nan

def clean_area(area):
    """Normalizes area strings (e.g., '120 Sq.Yard', '634 Sq.Feet') to a single unit (Sq.Feet)."""
    if not isinstance(area, str):
        return np.nan
    area_str = area.lower()
    try:
        numbers = re.findall(r'[\d\.]+', area_str)
        if not numbers:
            return np.nan
        value = float(numbers[0])
        # Convert Sq.Yards to Sq.Feet (1 Sq.Yard = 9 Sq.Feet)
        if 'yard' in area_str:
            return value * 9
        # Assume Sq.Feet if 'feet' or no unit is present
        return value
    except (ValueError, IndexError):
        return np.nan

def clean_floor(floor):
    """Extracts the primary floor number from complex strings."""
    if not isinstance(floor, str):
        return np.nan
    floor_str = floor.lower().replace('g', '0') # Treat Ground floor as 0
    try:
        # Find the first number in the string
        numbers = re.findall(r'\d+', floor_str)
        if numbers:
            return int(numbers[0])
        return np.nan
    except (ValueError, IndexError):
        return np.nan

def clean_age(age):
    """Parses various age formats into an average age in years."""
    if not isinstance(age, str):
        return np.nan
    age_str = age.lower()
    if 'new' in age_str or 'under' in age_str:
        return 0
    try:
        # Find all numbers, average them if it's a range (e.g., '1-5')
        numbers = [int(s) for s in re.findall(r'\d+', age_str)]
        if numbers:
            return sum(numbers) / len(numbers)
        return np.nan
    except (ValueError, IndexError):
        return np.nan

# --- 4. Applying Cleaning and Imputation ---

# Replace placeholder '-' with NaN for consistent handling
df.replace('-', np.nan, inplace=True)

# Apply cleaning functions to their respective columns
df['Property-Price'] = df['Property-Price'].apply(clean_price)
df['Super-Built-up-Construction-Area'] = df['Super-Built-up-Construction-Area'].apply(clean_area)
df['Carpet-Construction-Area'] = df['Carpet-Construction-Area'].apply(clean_area)
df['Property-On-Floor'] = df['Property-On-Floor'].apply(clean_floor)
df['Age-Of-Property'] = df['Age-Of-Property'].apply(clean_age)

# Clean simple numeric columns by extracting numbers
df['BHK'] = pd.to_numeric(df['BHK'].astype(str).str.extract(r'(\d+)', expand=False), errors='coerce')
df['Bathroom'] = pd.to_numeric(df['Bathroom'], errors='coerce')
df['No-Of-Lift-Per-Block'] = pd.to_numeric(df['No-Of-Lift-Per-Block'], errors='coerce')

# Remove extreme outliers from 'Bathroom' (e.g., values > 20)
df.loc[df['Bathroom'] > 20, 'Bathroom'] = np.nan

# Impute (fill) missing numerical values with the median
for col in ['Property-Price', 'Super-Built-up-Construction-Area', 'Carpet-Construction-Area',
            'Property-On-Floor', 'Age-Of-Property', 'BHK', 'Bathroom', 'No-Of-Lift-Per-Block']:
    df[col].fillna(df[col].median(), inplace=True)

# Impute missing categorical values with the mode (most frequent value)
for col in ['City1', 'Property-Facing', 'Furniture-Details', 'Current-Status', 'Location', 'Parking-Details']:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("✅ Data cleaning and imputation complete.")



⚙️ Starting advanced data cleaning process...
✅ Data cleaning and imputation complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [52]:
# --- 5. Preprocessing Pipeline ---
print("\n⚙️ Applying preprocessing pipeline (scaling and encoding)...")

numerical_features = [
    'BHK', 'Property-Price', 'Property-On-Floor', 'Age-Of-Property',
    'Super-Built-up-Construction-Area', 'Carpet-Construction-Area', 'Bathroom', 'No-Of-Lift-Per-Block'
]
categorical_features = [
    'City1', 'Property-Facing', 'Furniture-Details', 'Current-Status', 'Location', 'Parking-Details'
]

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

# Fit and transform the data
X_processed = preprocessor.fit_transform(df)

print(f"✅ Preprocessing complete. Shape of the final data: {X_processed.shape}")

# --- 6. Save Outputs ---
np.save('processed_property_data.npy', X_processed)
df.to_csv('cleaned_property_data.csv', index=False)
print("\n💾 Cleaned dataframe saved to 'cleaned_property_data.csv'")
print("💾 Processed numpy array saved to 'processed_property_data.npy'")
print("\n🚀 Ready for the autoencoder model training step.")



⚙️ Applying preprocessing pipeline (scaling and encoding)...
✅ Preprocessing complete. Shape of the final data: (2423, 208)

💾 Cleaned dataframe saved to 'cleaned_property_data.csv'
💾 Processed numpy array saved to 'processed_property_data.npy'

🚀 Ready for the autoencoder model training step.
