<a href="https://colab.research.google.com/github/3srava0/assignment-3/blob/main/02_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Day 2: Feature Engineering
# Real Estate Investment Advisor System

print("="*60)
print("FEATURE ENGINEERING - Real Estate Data")
print("="*60)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("\n‚úÖ Libraries imported successfully!")

FEATURE ENGINEERING - Real Estate Data
Mounted at /content/drive

‚úÖ Libraries imported successfully!


In [3]:
# Load cleaned dataset from correct path
print("\n" + "="*60)
print("LOADING CLEANED DATASET")
print("="*60)

df = pd.read_csv('/content/drive/MyDrive/assignment-3/data/india_housing_prices_cleaned.csv')

print(f"\n‚úÖ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData types:")
print(df.dtypes)
print(f"\nFirst few rows:")
df.head()


LOADING CLEANED DATASET

‚úÖ Dataset loaded successfully!
Shape: (250000, 23)

Columns: ['ID', 'State', 'City', 'Locality', 'Property_Type', 'BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Price_per_SqFt', 'Year_Built', 'Furnished_Status', 'Floor_No', 'Total_Floors', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Parking_Space', 'Security', 'Amenities', 'Facing', 'Owner_Type', 'Availability_Status']

Data types:
ID                                  int64
State                              object
City                               object
Locality                           object
Property_Type                      object
BHK                                 int64
Size_in_SqFt                        int64
Price_in_Lakhs                    float64
Price_per_SqFt                    float64
Year_Built                          int64
Furnished_Status                   object
Floor_No                            int64
Total_Floors                        int6

Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [5]:
# Step 6: Save Featured Dataset to Google Drive
print("\n" + "="*60)
print("STEP 6: SAVE FEATURED DATASET")
print("="*60)

print("\nüíæ Saving datasets to Google Drive...")

# Save full featured dataset to Google Drive
featured_path = '/content/drive/MyDrive/assignment-3/data/india_housing_prices_featured.csv'
df.to_csv(featured_path, index=False)
print(f"‚úÖ Full featured dataset saved!")
print(f"   Path: {featured_path}")
print(f"   Shape: {df.shape}")
print(f"   Total columns: {len(df.columns)}")

# Create sample for GitHub (first 100 rows)
sample_path = '/content/drive/MyDrive/assignment-3/data/featured_data_sample.csv'
df.head(100).to_csv(sample_path, index=False)
print(f"\n‚úÖ Sample dataset (100 rows) saved for GitHub")
print(f"   Path: {sample_path}")

print("\n" + "="*60)
print("üéâ FEATURE ENGINEERING COMPLETE!")
print("="*60)
print(f"\nüìä SUMMARY:")
print(f"   Original columns: 23")
print(f"   Total engineered features: {len(df.columns) - 23}")
print(f"   Total columns now: {len(df.columns)}")
print(f"   Total rows: {len(df):,}")
print(f"\n‚úÖ Dataset ready for model training!")
print(f"\nüìÅ Files saved in Google Drive:")
print(f"   - Full dataset: india_housing_prices_featured.csv")
print(f"   - Sample (GitHub): featured_data_sample.csv")


STEP 6: SAVE FEATURED DATASET

üíæ Saving datasets to Google Drive...
‚úÖ Full featured dataset saved!
   Path: /content/drive/MyDrive/assignment-3/data/india_housing_prices_featured.csv
   Shape: (250000, 23)
   Total columns: 23

‚úÖ Sample dataset (100 rows) saved for GitHub
   Path: /content/drive/MyDrive/assignment-3/data/featured_data_sample.csv

üéâ FEATURE ENGINEERING COMPLETE!

üìä SUMMARY:
   Original columns: 23
   Total engineered features: 0
   Total columns now: 23
   Total rows: 250,000

‚úÖ Dataset ready for model training!

üìÅ Files saved in Google Drive:
   - Full dataset: india_housing_prices_featured.csv
   - Sample (GitHub): featured_data_sample.csv


In [6]:
# Step 2: Feature Engineering - Create New Features
print("\n" + "="*60)
print("STEP 2: CREATING NEW FEATURES")
print("="*60)

# Make a copy to preserve original
df = df.copy()

# 1. PROPERTY AGE FEATURES
print("\n1Ô∏è‚É£ Creating Age-Based Features...")
current_year = 2026
df['Property_Age'] = current_year - df['Year_Built']

# Age categories
def categorize_age(age):
    if age < 5:
        return 'New'
    elif age < 15:
        return 'Modern'
    elif age < 30:
        return 'Established'
    else:
        return 'Old'

df['Age_Category'] = df['Property_Age'].apply(categorize_age)
print(f"‚úÖ Age features created")

# 2. SPACE & SIZE FEATURES
print("\n2Ô∏è‚É£ Creating Space/Size Features...")
df['SqFt_per_BHK'] = df['Size_in_SqFt'] / df['BHK']

# Spaciousness indicator
median_sqft_per_bhk = df['SqFt_per_BHK'].median()
df['Is_Spacious'] = (df['SqFt_per_BHK'] > median_sqft_per_bhk).astype(int)
print(f"‚úÖ Space features created (median SqFt/BHK: {median_sqft_per_bhk:.2f})")

# 3. LOCATION VALUE FEATURES
print("\n3Ô∏è‚É£ Creating Location-Based Features...")
# State-level price ratio
state_avg_price = df.groupby('State')['Price_in_Lakhs'].transform('mean')
df['State_Price_Ratio'] = df['Price_in_Lakhs'] / state_avg_price

# City-level price ratio
city_avg_price = df.groupby('City')['Price_in_Lakhs'].transform('mean')
df['City_Price_Ratio'] = df['Price_in_Lakhs'] / city_avg_price
print(f"‚úÖ Location value features created")

print(f"\nNew features added: {len(df.columns) - len(df.columns)}")
print(f"Total columns now: {len(df.columns)}")
print(f"\nNew feature names:")
new_features = [col for col in df.columns if col not in df.columns]
for feat in new_features:
    print(f"  - {feat}")


STEP 2: CREATING NEW FEATURES

1Ô∏è‚É£ Creating Age-Based Features...
‚úÖ Age features created

2Ô∏è‚É£ Creating Space/Size Features...
‚úÖ Space features created (median SqFt/BHK: 918.00)

3Ô∏è‚É£ Creating Location-Based Features...
‚úÖ Location value features created

New features added: 0
Total columns now: 29

New feature names:


In [7]:
# Check all available columns to see amenity data
print("\nAll columns in dataset:")
print(df.columns.tolist())

# Check for any amenity-related columns
amenity_keywords = ['school', 'hospital', 'park', 'mall', 'metro', 'amenity', 'facility']
amenity_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in amenity_keywords)]

if amenity_cols:
    print(f"\nFound amenity-related columns: {amenity_cols}")
else:
    print("\n‚ö†Ô∏è No explicit amenity columns found in dataset")


All columns in dataset:
['ID', 'State', 'City', 'Locality', 'Property_Type', 'BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Price_per_SqFt', 'Year_Built', 'Furnished_Status', 'Floor_No', 'Total_Floors', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Parking_Space', 'Security', 'Amenities', 'Facing', 'Owner_Type', 'Availability_Status', 'Property_Age', 'Age_Category', 'SqFt_per_BHK', 'Is_Spacious', 'State_Price_Ratio', 'City_Price_Ratio']

Found amenity-related columns: ['Nearby_Schools', 'Nearby_Hospitals', 'Parking_Space']


In [8]:
# Step 3: Create Amenity & Investment Features (from documentation)
print("\n" + "="*60)
print("STEP 3: AMENITY & INVESTMENT FEATURES")
print("="*60)

# 4. AMENITY DENSITY SCORES (from documentation)
print("\n4Ô∏è‚É£ Creating Amenity Density Scores...")

# School Density Score (as mentioned in documentation)
# School Density Score = Total schools in locality / Total schools in city
locality_school_totals = df.groupby('Locality')['Nearby_Schools'].transform('sum')
city_school_totals = df.groupby('City')['Nearby_Schools'].transform('sum')
df['School_Density_Score'] = (locality_school_totals / city_school_totals)*100
# Hospital Accessibility Score
# Hospital Density Score = Total hospitals in locality / Total hospitals in city
locality_hospital_totals = df.groupby('Locality')['Nearby_Hospitals'].transform('sum')
city_hospital_totals = df.groupby('City')['Nearby_Hospitals'].transform('sum')
df['Hospital_Density_Score'] = locality_hospital_totals / city_hospital_totals
print(f"‚úÖ School Density Score created (locality/city ratio)")
print(f"‚úÖ Hospital Density Score created (locality/city ratio)")
# Amenity Score = Count of amenities (comma-separated list)df['Amenity_Score'] =
# Example: "Playground, Gym, Garden" -> Score = 3
def count_amenities(amenity_str):
    if pd.isna(amenity_str) or amenity_str == '' or amenity_str == 'None':
        return 0
    # Split by comma and count non-empty items
    return len([x.strip() for x in str(amenity_str).split(',') if x.strip()])

df['Amenity_Score'] = df['Amenities'].apply(count_amenities)

print(f"‚úÖ Amenity Score created (counts amenities from Amenities column)")
# 5. FLOOR & BUILDING FEATURES
print("\n5Ô∏è‚É£ Creating Floor-Based Features...")

# Floor categories
def categorize_floor(floor_no):
    if floor_no == 0:
        return 'Ground'
    elif floor_no <= 5:
        return 'Lower'
    elif floor_no <= 15:
        return 'Mid'
    else:
        return 'High'

df['Floor_Category'] = df['Floor_No'].apply(categorize_floor)

# Floor indicators
df['Is_Ground_Floor'] = (df['Floor_No'] == 0).astype(int)
df['Is_High_Floor'] = (df['Floor_No'] > 15).astype(int)
print(f"‚úÖ Floor features created")

# 6. INVESTMENT POTENTIAL INDICATORS
print("\n6Ô∏è‚É£ Creating Investment Indicators...")

# ROI Indicator (Price per sqft vs city average)
city_avg_price_per_sqft = df.groupby('City')['Price_per_SqFt'].transform('mean')
df['ROI_Indicator'] = df['Price_per_SqFt'] / city_avg_price_per_sqft

# Affordability Index
df['Affordability_Index'] = df['Price_in_Lakhs'] / (df['BHK'] * df['Size_in_SqFt'] / 1000)
print(f"‚úÖ Investment indicators created")

print(f"\nüìä SUMMARY:")
print(f"Total new features in this step: {len(df.columns) - 30}")
print(f"Total columns now: {len(df.columns)}")

# Show sample of new features
print(f"\nüìã Sample of Amenity Features:")
print(df[['Nearby_Schools', 'School_Density_Score', 'Amenity_Score', 'Floor_Category']].head(3))


STEP 3: AMENITY & INVESTMENT FEATURES

4Ô∏è‚É£ Creating Amenity Density Scores...
‚úÖ School Density Score created (locality/city ratio)
‚úÖ Hospital Density Score created (locality/city ratio)
‚úÖ Amenity Score created (counts amenities from Amenities column)

5Ô∏è‚É£ Creating Floor-Based Features...
‚úÖ Floor features created

6Ô∏è‚É£ Creating Investment Indicators...
‚úÖ Investment indicators created

üìä SUMMARY:
Total new features in this step: 7
Total columns now: 37

üìã Sample of Amenity Features:
   Nearby_Schools  School_Density_Score  Amenity_Score Floor_Category
0              10              8.024987              5          Lower
1               8             12.706713              5           High
2               9              8.151383              4           High


In [9]:
# FIX: Check and convert Parking_Space to numeric
print("Checking Parking_Space column...")
print(f"Data type: {df['Parking_Space'].dtype}")
print(f"Sample values: {df['Parking_Space'].head()}")
print(f"Unique values: {df['Parking_Space'].unique()[:10]}")

# Convert Parking_Space to numeric (1 for Yes/True, 0 for No/False)
if df['Parking_Space'].dtype == 'object':
    df['Parking_Space_Numeric'] = df['Parking_Space'].map({
        'Yes': 1, 'yes': 1, 'YES': 1, True: 1, 'True': 1,
        'No': 0, 'no': 0, 'NO': 0, False: 0, 'False': 0
    }).fillna(0).astype(int)
    print("‚úÖ Parking_Space converted to numeric")
else:
    df['Parking_Space_Numeric'] = df['Parking_Space']
    print("‚úÖ Parking_Space already numeric")

# Now create the Amenity Score with numeric values
df['Amenity_Score'] = (
    df['Nearby_Schools'] * 0.4 +
    df['Nearby_Hospitals'] * 0.3 +
    df['Parking_Space_Numeric'] * 0.3
)
print("‚úÖ Combined Amenity Score created successfully!")


Checking Parking_Space column...
Data type: object
Sample values: 0     No
1     No
2    Yes
3    Yes
4     No
Name: Parking_Space, dtype: object
Unique values: ['No' 'Yes']
‚úÖ Parking_Space converted to numeric
‚úÖ Combined Amenity Score created successfully!


In [10]:
# Step 4: Categorical Encoding & Feature Scaling
print("\n" + "="*60)
print("STEP 4: CATEGORICAL ENCODING & FEATURE SCALING")
print("="*60)

# Identify categorical columns that need encoding
categorical_cols = ['State', 'City', 'Locality', 'Property_Type', 'Furnished_Status',
                    'Age_Category', 'Public_Transport_Accessibility', 'Parking_Space',
                    'Security', 'Facing', 'Owner_Type', 'Availability_Status', 'Floor_Category']

print("\n7Ô∏è‚É£ Label Encoding Categorical Features...")
df_encoded = df.copy()

# Apply Label Encoding
label_encoders = {}
for col in categorical_cols:
    if col in df_encoded.columns:
        le = LabelEncoder()
        df_encoded[col + '_Encoded'] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le
        print(f"   ‚úÖ {col} encoded ({len(le.classes_)} unique values)")

print(f"\n‚úÖ All categorical features encoded!")
print(f"Total columns after encoding: {len(df_encoded.columns)}")

# Display sample
print("\nüìä Sample with encoded features:")
encoded_cols = [col for col in df_encoded.columns if col.endswith('_Encoded')]
print(df_encoded[['State', 'State_Encoded', 'Property_Type', 'Property_Type_Encoded']].head(3))


STEP 4: CATEGORICAL ENCODING & FEATURE SCALING

7Ô∏è‚É£ Label Encoding Categorical Features...
   ‚úÖ State encoded (20 unique values)
   ‚úÖ City encoded (42 unique values)
   ‚úÖ Locality encoded (500 unique values)
   ‚úÖ Property_Type encoded (3 unique values)
   ‚úÖ Furnished_Status encoded (3 unique values)
   ‚úÖ Age_Category encoded (4 unique values)
   ‚úÖ Public_Transport_Accessibility encoded (3 unique values)
   ‚úÖ Parking_Space encoded (2 unique values)
   ‚úÖ Security encoded (2 unique values)
   ‚úÖ Facing encoded (4 unique values)
   ‚úÖ Owner_Type encoded (3 unique values)
   ‚úÖ Availability_Status encoded (2 unique values)
   ‚úÖ Floor_Category encoded (4 unique values)

‚úÖ All categorical features encoded!
Total columns after encoding: 51

üìä Sample with encoded features:
         State  State_Encoded      Property_Type  Property_Type_Encoded
0   Tamil Nadu             15          Apartment                      0
1  Maharashtra             11  Independent House

In [11]:
# Step 5: Feature Scaling
print("\n" + "="*60)
print("STEP 5: FEATURE SCALING")
print("="*60)

# Select numerical columns for scaling (exclude ID and target variable)
print("\n8Ô∏è‚É£ Scaling Numerical Features...")

# Identify numerical columns
numerical_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Exclude ID and encoded categorical columns from scaling
cols_to_exclude = ['ID'] + [col for col in df_encoded.columns if col.endswith('_Encoded')]
numerical_cols_to_scale = [col for col in numerical_cols if col not in cols_to_exclude]

print(f"Numerical columns to scale: {len(numerical_cols_to_scale)}")

# Apply StandardScaler
scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[numerical_cols_to_scale] = scaler.fit_transform(df_encoded[numerical_cols_to_scale])

print(f"‚úÖ {len(numerical_cols_to_scale)} numerical features scaled using StandardScaler")
print(f"\nScaled features sample (mean‚âà0, std‚âà1):")
print(df_scaled[numerical_cols_to_scale[:3]].describe().loc[['mean', 'std']])


STEP 5: FEATURE SCALING

8Ô∏è‚É£ Scaling Numerical Features...
Numerical columns to scale: 23
‚úÖ 23 numerical features scaled using StandardScaler

Scaled features sample (mean‚âà0, std‚âà1):
               BHK  Size_in_SqFt  Price_in_Lakhs
mean  7.111112e-17 -7.801759e-18    2.613660e-16
std   1.000002e+00  1.000002e+00    1.000002e+00


In [12]:
df.head()


Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,City_Price_Ratio,School_Density_Score,Hospital_Density_Score,Amenity_Score,Floor_Category,Is_Ground_Floor,Is_High_Floor,ROI_Indicator,Affordability_Index,Parking_Space_Numeric
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,1.906941,8.024987,0.07776,4.9,Lower,0,0,0.759709,103.324895,0
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,0.762274,12.706713,0.124563,3.5,High,0,1,0.601624,27.569092,0
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,0.725985,8.151383,0.081935,6.3,High,0,1,0.386723,25.232015,1
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,1.182487,8.692716,0.086641,4.4,High,0,1,0.85017,54.777453,1
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,0.716563,8.022773,0.081566,4.3,Lower,0,0,0.309392,9.480614,0


In [13]:

# Show final list of engineered features
original_cols = ['ID', 'State', 'City', 'Locality', 'Property_Type', 'BHK', 'Size_in_SqFt',
                 'Price_in_Lakhs', 'Price_per_SqFt', 'Year_Built', 'Furnished_Status',
                 'Floor_No', 'Nearby_Schools', 'Nearby_Hospitals', 'Parking_Space']

engineered_features = [col for col in df.columns if col not in original_cols]

print(f"\nüéØ FINAL ENGINEERED FEATURES ({len(engineered_features)}):")
for i, feat in enumerate(engineered_features, 1):
    print(f"  {i}. {feat}")

print(f"\nüìã Sample of dataset with new features:")
df[['BHK', 'Property_Age', 'Age_Category', 'SqFt_per_BHK', 'School_Density_Score', 'Amenity_Score']].head(3)


üéØ FINAL ENGINEERED FEATURES (23):
  1. Total_Floors
  2. Age_of_Property
  3. Public_Transport_Accessibility
  4. Security
  5. Amenities
  6. Facing
  7. Owner_Type
  8. Availability_Status
  9. Property_Age
  10. Age_Category
  11. SqFt_per_BHK
  12. Is_Spacious
  13. State_Price_Ratio
  14. City_Price_Ratio
  15. School_Density_Score
  16. Hospital_Density_Score
  17. Amenity_Score
  18. Floor_Category
  19. Is_Ground_Floor
  20. Is_High_Floor
  21. ROI_Indicator
  22. Affordability_Index
  23. Parking_Space_Numeric

üìã Sample of dataset with new features:


Unnamed: 0,BHK,Property_Age,Age_Category,SqFt_per_BHK,School_Density_Score,Amenity_Score
0,1,36,Old,4740.0,8.024987,4.9
1,3,18,Established,788.0,12.706713,3.5
2,2,29,Established,1821.0,8.151383,6.3
