<a href="https://colab.research.google.com/github/3srava0/assignment-3/blob/main/02_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Day 2 of the Real Estate Investment Advisor System: feature engineering.
# Setup cell: prints the notebook banner, mounts Google Drive and imports
# the libraries used by the cells below.

print("\n".join(["=" * 60, "FEATURE ENGINEERING - Real Estate Data", "=" * 60]))

# The dataset paths used later live under /content/drive, so Drive is mounted
# first. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Data handling, plotting and preprocessing libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings

# NOTE(review): this hides every warning category for the whole session,
# including deprecation notices from pandas/scikit-learn.
warnings.filterwarnings('ignore')

print()
print("✅ Libraries imported successfully!")

In [None]:
# Read the cleaned housing CSV from the mounted Drive folder and record the
# original column layout, which later cells use to tell engineered columns
# apart from the ones that came with the file.
print()
print("=" * 60)
print("LOADING CLEANED DATASET")
print("=" * 60)

df = pd.read_csv(
    '/content/drive/MyDrive/assignment-3/data/india_housing_prices_cleaned.csv'
)
original_columns = df.columns.tolist()
original_column_count = len(original_columns)

# Quick overview: shape, column names and dtypes.
print("\n✅ Dataset loaded successfully!")
print("Shape: {}".format(df.shape))
print("\nColumns: {}".format(list(df.columns)))
print("\nData types:")
print(df.dtypes)
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders the preview table.
df.head()


In [None]:
# Step 2: Feature Engineering - Create New Features
print()
print("=" * 60)
print("STEP 2: CREATING NEW FEATURES")
print("=" * 60)

# Work on a copy and snapshot the current columns so the end of this step can
# report exactly which columns were added.
df = df.copy()
cols_before_step2 = set(df.columns)

print("\n1️⃣ Creating Age-Based Features...")
# NOTE(review): the age is relative to the year the notebook is run, so the
# values change when the notebook is re-run in a later year.
current_year = pd.Timestamp.today().year
# Build years in the future would give negative ages; those are clipped to 0.
df['Property_Age'] = (current_year - df['Year_Built']).clip(lower=0)

def categorize_age(age):
    """Map a property age in years to a coarse age bucket.

    Parameters
    ----------
    age : int or float
        Property age in years. May be missing (``None`` / NaN).

    Returns
    -------
    str
        ``'New'`` for ages below 5, ``'Modern'`` below 15, ``'Established'``
        below 30, ``'Old'`` otherwise, and ``'Unknown'`` when the age is
        missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing age would fall through to the last branch and be labelled 'Old'.
    if pd.isna(age):
        return 'Unknown'
    if age < 5:
        return 'New'
    if age < 15:
        return 'Modern'
    if age < 30:
        return 'Established'
    return 'Old'

# Bucket every property age with categorize_age.
df['Age_Category'] = df['Property_Age'].map(categorize_age)
print("✅ Age features created")

print("\n2️⃣ Creating Space/Size Features...")
# Area per bedroom; rows whose BHK is not positive get NaN instead of a
# division result.
df['SqFt_per_BHK'] = np.where(
    df['BHK'] > 0,
    df['Size_in_SqFt'] / df['BHK'],
    np.nan,
)

# A property is flagged spacious when its per-bedroom area is above the
# dataset median (NaN compares False, so those rows get 0).
median_sqft_per_bhk = df['SqFt_per_BHK'].median()
df['Is_Spacious'] = df['SqFt_per_BHK'].gt(median_sqft_per_bhk).astype(int)
print("✅ Space features created (median SqFt/BHK: {:.2f})".format(median_sqft_per_bhk))

print("\n3️⃣ Creating Location-Based Features...")
# Size relative to the average size in the same state / city. A zero average
# is replaced with NaN so the ratio becomes NaN rather than inf.
state_avg_size = df.groupby('State')['Size_in_SqFt'].transform('mean')
state_avg_size = state_avg_size.replace(0, np.nan)
df['State_Size_Ratio'] = df['Size_in_SqFt'].div(state_avg_size)

city_avg_size = df.groupby('City')['Size_in_SqFt'].transform('mean')
city_avg_size = city_avg_size.replace(0, np.nan)
df['City_Size_Ratio'] = df['Size_in_SqFt'].div(city_avg_size)
print("✅ Location context features created")

# Report the columns that did not exist when this step started.
new_features = [col for col in df.columns if col not in cols_before_step2]
print("\nNew features added: {}".format(len(new_features)))
print("Total columns now: {}".format(len(df.columns)))
print("\nNew feature names:")
for feat in new_features:
    print("  - {}".format(feat))


In [None]:
# List every column, then look for ones whose name hints at amenity data.
print("\nAll columns in dataset:")
print(df.columns.tolist())

# Case-insensitive substring match against these keywords.
amenity_keywords = ['school', 'hospital', 'park', 'mall', 'metro', 'amenity', 'facility']
amenity_cols = list(
    filter(
        lambda name: any(keyword in name.lower() for keyword in amenity_keywords),
        df.columns,
    )
)

if not amenity_cols:
    print("\n⚠️ No explicit amenity columns found in dataset")
else:
    print(f"\nFound amenity-related columns: {amenity_cols}")

In [None]:
# Step 3: Create Amenity & Investment Features
print("\n" + "="*60)
print("STEP 3: AMENITY & INVESTMENT FEATURES")
print("="*60)

# Snapshot of the columns present before this step, used for the summary at
# the end of the step.
cols_before_step3 = set(df.columns)

print("\n4️⃣ Creating Amenity Density Scores...")
# A locality is identified by (City, Locality). Grouping by the locality name
# alone pools same-named localities from different cities into the numerator,
# which lets the "locality share of its city" exceed 100%. When locality names
# are unique per city this grouping gives the same result as before.
locality_keys = ['City', 'Locality']

# Both scores are the locality's share of its city's total, as a percentage.
# City totals of 0 become NaN so the share is NaN instead of inf.
locality_school_totals = df.groupby(locality_keys)['Nearby_Schools'].transform('sum')
city_school_totals = df.groupby('City')['Nearby_Schools'].transform('sum').replace(0, np.nan)
df['School_Density_Score'] = (locality_school_totals / city_school_totals) * 100

locality_hospital_totals = df.groupby(locality_keys)['Nearby_Hospitals'].transform('sum')
city_hospital_totals = df.groupby('City')['Nearby_Hospitals'].transform('sum').replace(0, np.nan)
# Scaled by 100 like the school score so the two features share one unit.
df['Hospital_Density_Score'] = (locality_hospital_totals / city_hospital_totals) * 100
print(f"✅ School Density Score created (locality/city ratio)")
print(f"✅ Hospital Density Score created (locality/city ratio)")

def count_amenities(amenity_str):
    """Count the comma-separated entries in an amenities value.

    Missing values, blank strings and the literal text ``'None'`` (after
    stripping surrounding whitespace) count as 0. Otherwise the value is
    converted to text, split on commas, and every piece that is non-empty
    after stripping is counted.
    """
    if pd.isna(amenity_str):
        return 0

    text = str(amenity_str)
    if text.strip() in ('', 'None'):
        return 0

    # Empty pieces (e.g. from a trailing comma) are not counted.
    return sum(1 for piece in text.split(',') if piece.strip())

# Number of listed amenities per property, via count_amenities.
df['Amenity_Count_Score'] = df['Amenities'].map(count_amenities)
print("✅ Amenity Count Score created")

print()
print("5️⃣ Creating Floor-Based Features...")
def categorize_floor(floor_no):
    """Map a floor number to a coarse floor bucket.

    Parameters
    ----------
    floor_no : int or float
        Floor number of the property. May be missing (``None`` / NaN).

    Returns
    -------
    str
        ``'Ground'`` for floor 0, ``'Lower'`` for floors up to 5 (negative
        floor numbers also land here), ``'Mid'`` for floors up to 15,
        ``'High'`` above 15, and ``'Unknown'`` when the floor is missing.
    """
    # NaN fails every comparison, so without this guard a missing floor would
    # fall through to the last branch and be labelled 'High'.
    if pd.isna(floor_no):
        return 'Unknown'
    if floor_no == 0:
        return 'Ground'
    if floor_no <= 5:
        return 'Lower'
    if floor_no <= 15:
        return 'Mid'
    return 'High'

# Floor bucket plus two 0/1 flags for the extremes.
df['Floor_Category'] = df['Floor_No'].map(categorize_floor)
df['Is_Ground_Floor'] = df['Floor_No'].eq(0).astype(int)
df['Is_High_Floor'] = df['Floor_No'].gt(15).astype(int)
print("✅ Floor features created")

print("\n6️⃣ Creating Investment Indicators...")
# Price per sq ft relative to the city average; a zero average becomes NaN so
# the ratio is NaN rather than inf.
# NOTE(review): both indicators below are computed from price columns, and a
# later cell uses 'Price_in_Lakhs' as target_col - confirm they are not fed to
# a price model as predictors.
city_avg_price_per_sqft = df.groupby('City')['Price_per_SqFt'].transform('mean')
city_avg_price_per_sqft = city_avg_price_per_sqft.replace(0, np.nan)
df['ROI_Indicator'] = df['Price_per_SqFt'].div(city_avg_price_per_sqft)

# Price divided by (BHK * size / 1000); a zero denominator becomes NaN.
denominator = df['BHK'].mul(df['Size_in_SqFt']).div(1000).replace(0, np.nan)
df['Affordability_Index'] = df['Price_in_Lakhs'].div(denominator)
print("✅ Investment indicators created")

# Columns that did not exist when Step 3 started.
new_features_step3 = [col for col in df.columns if col not in cols_before_step3]
print("\n📊 SUMMARY:")
print("Total new features in this step: {}".format(len(new_features_step3)))
print("Total columns now: {}".format(len(df.columns)))

print("\n📋 Sample of Amenity Features:")
amenity_preview = df[['Nearby_Schools', 'School_Density_Score', 'Amenity_Count_Score', 'Floor_Category']]
print(amenity_preview.head(3))


In [None]:
# Convert Parking_Space to a numeric 0/1 flag, then build the weighted amenity
# score from it.
print("Checking Parking_Space column...")
print(f"Data type: {df['Parking_Space'].dtype}")
print(f"Sample values: {df['Parking_Space'].head()}")
print(f"Unique values: {df['Parking_Space'].unique()[:10]}")

# Decide on "is it numeric?" rather than "is it object?": text columns can also
# carry a pandas string or category dtype, and those must not fall into the
# pass-through branch (multiplying raw strings by 0.3 below would fail).
# Boolean columns count as numeric here and are passed through unchanged.
if pd.api.types.is_numeric_dtype(df['Parking_Space']):
    df['Parking_Space_Numeric'] = df['Parking_Space']
    print("✅ Parking_Space already numeric")
else:
    # Normalise case and surrounding whitespace so 'Yes', ' yes ', 'YES',
    # True/'True', etc. all map consistently.
    parking_flag_map = {'yes': 1, 'true': 1, 'no': 0, 'false': 0}
    parking_normalized = df['Parking_Space'].astype(str).str.strip().str.lower()
    parking_mapped = parking_normalized.map(parking_flag_map)

    # Unrecognised or missing values are treated as "no parking" (0); report
    # how many there are instead of converting them silently.
    unmapped_count = int(parking_mapped.isna().sum())
    if unmapped_count:
        print(f"⚠️ {unmapped_count} Parking_Space value(s) not recognised; treated as 0")

    df['Parking_Space_Numeric'] = parking_mapped.fillna(0).astype(int)
    print("✅ Parking_Space converted to numeric")

# Weighted combination: schools 0.4, hospitals 0.3, parking flag 0.3.
df['Amenity_Weighted_Score'] = (
    df['Nearby_Schools'] * 0.4 +
    df['Nearby_Hospitals'] * 0.3 +
    df['Parking_Space_Numeric'] * 0.3
)
print("✅ Amenity Weighted Score created successfully!")


In [None]:
# Step 4: Categorical Encoding & Feature Scaling
print("\n" + "="*60)
print("STEP 4: CATEGORICAL ENCODING & FEATURE SCALING")
print("="*60)

# Categorical columns that should receive an integer-encoded companion column.
categorical_cols = ['State', 'City', 'Locality', 'Property_Type', 'Furnished_Status',
                    'Age_Category', 'Public_Transport_Accessibility', 'Parking_Space',
                    'Security', 'Facing', 'Owner_Type', 'Availability_Status', 'Floor_Category']

print("\n7️⃣ Label Encoding Categorical Features...")
# Encode on a copy so `df` keeps only the human-readable engineered features.
df_encoded = df.copy()

# One fitted LabelEncoder per column, kept so the codes can be mapped back to
# labels later. Columns missing from the dataset are skipped and reported.
label_encoders = {}
skipped_cols = []
for col in categorical_cols:
    if col not in df_encoded.columns:
        skipped_cols.append(col)
        continue
    le = LabelEncoder()
    # astype(str) gives the encoder a uniform type; missing values become
    # their string form and get their own code.
    df_encoded[col + '_Encoded'] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le
    print(f"   ✅ {col} encoded ({len(le.classes_)} unique values)")

if skipped_cols:
    print(f"\n⚠️ Skipped (not in dataset): {skipped_cols}")
    print(f"✅ {len(label_encoders)} of {len(categorical_cols)} categorical features encoded")
else:
    print(f"\n✅ All categorical features encoded!")
print(f"Total columns after encoding: {len(df_encoded.columns)}")

# Display sample: show State / Property_Type next to their codes, but only for
# the columns that were actually encoded, so a skipped column cannot raise a
# KeyError here.
print("\n📊 Sample with encoded features:")
encoded_cols = [col for col in df_encoded.columns if col.endswith('_Encoded')]
preview_encoded_cols = [
    name
    for base in ('State', 'Property_Type')
    if base in df_encoded.columns and base + '_Encoded' in encoded_cols
    for name in (base, base + '_Encoded')
]
if preview_encoded_cols:
    print(df_encoded[preview_encoded_cols].head(3))
else:
    print("   (State / Property_Type were not encoded; nothing to preview)")

In [None]:
# Step 5: Feature Scaling
print()
print("=" * 60)
print("STEP 5: FEATURE SCALING")
print("=" * 60)

print("\n8️⃣ Scaling Numerical Features...")

# Candidate columns: everything stored as int64 / float64 in the encoded frame.
numerical_cols = list(df_encoded.select_dtypes(include=['int64', 'float64']).columns)
target_col = 'Price_in_Lakhs'

# 0/1 indicator columns are left unscaled.
binary_like_cols = [
    'Is_Spacious', 'Is_Ground_Floor', 'Is_High_Floor', 'Parking_Space_Numeric'
]

# Never scale the identifier, the target, the label-encoded codes, or the
# binary indicators that are present.
cols_to_exclude = ['ID', target_col]
cols_to_exclude.extend(col for col in df_encoded.columns if col.endswith('_Encoded'))
cols_to_exclude.extend(col for col in binary_like_cols if col in df_encoded.columns)

numerical_cols_to_scale = [col for col in numerical_cols if col not in cols_to_exclude]
print("Numerical columns to scale: {}".format(len(numerical_cols_to_scale)))

# NOTE(review): the scaler is fitted on every row of df_encoded; if a
# train/test split happens later, the test rows have already influenced the
# scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
df_scaled = df_encoded.copy()
scaled_values = scaler.fit_transform(df_encoded[numerical_cols_to_scale])
df_scaled[numerical_cols_to_scale] = scaled_values

print("✅ {} numerical features scaled using StandardScaler".format(len(numerical_cols_to_scale)))
print("\nScaled features sample (mean≈0, std≈1):")
first_scaled_cols = numerical_cols_to_scale[:3]
print(df_scaled[first_scaled_cols].describe().loc[['mean', 'std']])


In [None]:
# Preview the first five rows of the engineered (unscaled, unencoded) frame.
df.head(n=5)


In [None]:
# Everything in `df` that was not among the columns of the loaded CSV is an
# engineered feature; list them in column order.
engineered_features = df.columns[~df.columns.isin(original_columns)].tolist()

print("\n🎯 FINAL ENGINEERED FEATURES ({}):".format(len(engineered_features)))
for i, feat in enumerate(engineered_features, start=1):
    print("  {}. {}".format(i, feat))

print("\n📋 Sample of dataset with new features:")
# Only preview the columns that actually exist in `df`.
preview_cols = [
    col
    for col in [
        'BHK', 'Property_Age', 'Age_Category', 'SqFt_per_BHK',
        'School_Density_Score', 'Amenity_Count_Score', 'Amenity_Weighted_Score',
    ]
    if col in df.columns
]
# Last expression of the cell, so the notebook renders the table.
df[preview_cols].head(3)


In [None]:
# Step 6: Save Featured Dataset to Google Drive
print()
print("=" * 60)
print("STEP 6: SAVE FEATURED DATASET")
print("=" * 60)

print("\n💾 Saving datasets to Google Drive...")

# 1) Engineered features in readable form (`df`: no encoded columns, no scaling).
featured_path = '/content/drive/MyDrive/assignment-3/data/india_housing_prices_featured.csv'
df.to_csv(featured_path, index=False)
print(
    "✅ Engineered dataset saved!",
    f"   Path: {featured_path}",
    f"   Shape: {df.shape}",
    f"   Total columns: {len(df.columns)}",
    sep="\n",
)

# 2) Encoded + scaled frame (`df_scaled`).
model_ready_path = '/content/drive/MyDrive/assignment-3/data/india_housing_prices_model_ready.csv'
df_scaled.to_csv(model_ready_path, index=False)
print(
    "✅ Model-ready dataset saved!",
    f"   Path: {model_ready_path}",
    f"   Shape: {df_scaled.shape}",
    sep="\n",
)

# 3) First 100 rows of the engineered frame as a small sample file.
sample_path = '/content/drive/MyDrive/assignment-3/data/featured_data_sample.csv'
df.head(100).to_csv(sample_path, index=False)
print(
    "\n✅ Sample dataset (100 rows) saved for GitHub",
    f"   Path: {sample_path}",
    sep="\n",
)

# Closing summary of the whole notebook.
print()
print("=" * 60)
print("🎉 FEATURE ENGINEERING COMPLETE!")
print("=" * 60)
print(
    "\n📊 SUMMARY:",
    f"   Original columns: {original_column_count}",
    f"   Total engineered features: {len(df.columns) - original_column_count}",
    f"   Total columns now: {len(df.columns)}",
    f"   Total rows: {len(df):,}",
    sep="\n",
)
print("\n✅ Dataset ready for model training!")
print(
    "\n📁 Files saved in Google Drive:",
    "   - Engineered dataset: india_housing_prices_featured.csv",
    "   - Model-ready dataset: india_housing_prices_model_ready.csv",
    "   - Sample (GitHub): featured_data_sample.csv",
    sep="\n",
)
